From d6ce507681f46fddbe191dd4439726eb0c0ad59f Mon Sep 17 00:00:00 2001 From: tianyutong Date: Fri, 23 May 2025 09:54:48 +0800 Subject: [PATCH] Initial Commit of Megatron-LM-0.8.0 Change-Id: Ifb4c061207ee2644a21e161ad52fc6ff40564e39 --- .coveragerc | 5 + .github/ISSUE_TEMPLATE/bug.md | 32 + .github/ISSUE_TEMPLATE/enhancement.md | 23 + .github/ISSUE_TEMPLATE/question.md | 12 + .github/ISSUE_TEMPLATE/regression.md | 39 + .github/workflows/stale.yml | 31 + .gitignore | 10 + .gitlab-ci.yml | 357 ++++ CODEOWNERS | 8 + CONTRIBUTING.md | 66 + Dockerfile.ci | 33 + Dockerfile.linting | 17 + LICENSE | 292 +++ MANIFEST.in | 2 + docs/llama_mistral.md | 397 ++++ docs/source/api-guide/context_parallel.rst | 35 + docs/source/api-guide/datasets.rst | 104 + docs/source/api-guide/dist_checkpointing.rst | 79 + .../dist_checkpointing.strategies.rst | 50 + docs/source/api-guide/distributed.rst | 53 + docs/source/api-guide/fusions.rst | 65 + docs/source/api-guide/index.rst | 17 + docs/source/api-guide/models.bert.rst | 22 + docs/source/api-guide/models.gpt.rst | 22 + docs/source/api-guide/models.rst | 21 + docs/source/api-guide/models.t5.rst | 21 + docs/source/api-guide/moe.rst | 4 + .../api-guide/num_microbatches_calculator.rst | 12 + docs/source/api-guide/pipeline_parallel.rst | 47 + docs/source/api-guide/tensor_parallel.rst | 67 + docs/source/api-guide/transformer.rst | 136 ++ docs/source/distrib_optimizer.md | 54 + .../images/context_parallel/CP_overview.png | Bin 0 -> 154304 bytes .../images/context_parallel/CP_results.png | Bin 0 -> 184693 bytes .../images/distrib_optimizer/data_flow.png | Bin 0 -> 90014 bytes .../distrib_optimizer/sharding_scheme.png | Bin 0 -> 99135 bytes docs/source/index.rst | 23 + docs/source/user-guide/index.rst | 4 + .../detxoify_lm/README.md | 112 + .../annotations/filter-selfgeneration.py | 75 + .../annotations/perspective_api_annotate.py | 182 ++ .../detxoify_lm/annotations/preprocess.sh | 14 + .../detxoify_lm/finetune_gpt.py | 157 ++ .../finetune_gpt_distributed-1.3b.sh | 63 + .../detxoify_lm/generate-1.3b.sh | 41 + .../detxoify_lm/generate_samples_gpt.py | 260 +++ .../detxoify_lm/perspective_api.py | 170 ++ .../selfgenerate-1.3b-unconditional.sh | 42 + .../academic_paper_scripts/msdp/README.md | 5 + .../msdp/data_processing.sh | 83 + .../msdp/eval_knwl_generation.sh | 43 + .../msdp/eval_resp_generation.sh | 64 + .../msdp/prep_resp_gen.sh | 18 + .../msdp/prompt_knwl_gen.sh | 46 + .../msdp/prompt_resp_gen.sh | 46 + .../academic_paper_scripts/sc21/CONFIG.sh | 57 + .../academic_paper_scripts/sc21/README.md | 50 + .../academic_paper_scripts/sc21/SBATCH.sh | 13 + examples/academic_paper_scripts/sc21/SRUN.sh | 18 + .../sc21/run_figure_11.sh | 46 + .../sc21/run_figure_12.sh | 54 + .../sc21/run_figure_13.sh | 46 + .../sc21/run_figure_14.sh | 47 + .../sc21/run_figure_15.sh | 47 + .../sc21/run_figure_16.sh | 43 + .../sc21/run_figure_17.sh | 54 + .../sc21/run_figure_18.sh | 54 + .../sc21/run_table_1.sh | 145 ++ examples/bert/README.md | 53 + examples/bert/train_bert_340m_distributed.sh | 77 + examples/gpt3/README.md | 57 + examples/gpt3/gpt_config.yaml | 302 +++ examples/gpt3/train_gpt3_175b_distributed.sh | 81 + examples/inference/README.md | 274 +++ .../gpt/simple_gpt_batch_inference.py | 115 ++ examples/inference/quantization/README.md | 128 ++ .../quantization/ptq_trtllm_llama_7b.sh | 82 + .../quantization/ptq_trtllm_nemotron3_8b.sh | 77 + .../quantization/text_generation_ptq.py | 223 ++ .../quantization/trtllm_text_generation.py | 116 ++ .../run_text_generation_server_345M.sh | 31 + ...eneration_server_345M_8_tensor_parallel.sh | 29 + examples/mamba/.gitignore | 4 + examples/mamba/Dockerfile | 32 + examples/mamba/README.md | 91 + examples/mamba/run_text_gen_server_8b.sh | 50 + examples/mamba/run_text_gen_server_8b_gpt3.sh | 46 + examples/mamba/train.sh | 105 + examples/mixtral/README.md | 120 ++ .../mixtral/train_mixtral_8x7b_distributed.sh | 116 ++ examples/multimodal/Dockerfile | 27 + examples/multimodal/README.md | 148 ++ .../multimodal/assets/pretrain_curves.png | Bin 0 -> 329882 bytes examples/multimodal/clip_converter.py | 155 ++ examples/multimodal/combine_mistral_clip.sh | 21 + examples/multimodal/combine_state_dicts.py | 81 + examples/multimodal/config.py | 107 + .../convert_llava_pretrain_to_wds.py | 31 + examples/multimodal/dataloader_provider.py | 131 ++ examples/multimodal/dataset_helpers.py | 521 +++++ examples/multimodal/evaluate_coco.py | 60 + examples/multimodal/evaluate_mmmu.py | 66 + examples/multimodal/evaluate_textvqa.py | 86 + examples/multimodal/evaluate_vqav2.py | 41 + examples/multimodal/layer_specs.py | 115 ++ examples/multimodal/manual_prompts.json | 29 + examples/multimodal/pretrain_dataset.yaml | 15 + examples/multimodal/pretrain_mistral_clip.sh | 132 ++ examples/multimodal/run_text_generation.py | 378 ++++ examples/multimodal/sft_dataset.yaml | 15 + examples/multimodal/sft_mistral_clip.sh | 134 ++ .../text_generation_mistral_clip.sh | 117 ++ examples/multimodal/train.py | 314 +++ examples/retro/README.md | 74 + examples/retro/preprocess_data.sh | 144 ++ examples/retro/train_retro_2b_distributed.sh | 98 + examples/run_simple_mcore_train_loop.py | 158 ++ examples/t5/README.md | 55 + examples/t5/t5_mcore_train_curve.png | Bin 0 -> 62988 bytes examples/t5/train_t5_220m_distributed.sh | 77 + images/model_table.png | Bin 0 -> 200144 bytes images/strong_scaling.png | Bin 0 -> 406248 bytes images/weak_scaling.png | Bin 0 -> 433007 bytes jet-tests.yml | 115 ++ megatron/core/QuickStart.md | 228 +++ megatron/core/README.md | 14 + megatron/core/README_STRAGGLER.md | 93 + megatron/core/__init__.py | 35 + megatron/core/datasets/Makefile | 9 + megatron/core/datasets/__init__.py | 0 megatron/core/datasets/bert_dataset.py | 199 ++ megatron/core/datasets/blended_dataset.py | 205 ++ .../blended_megatron_dataset_builder.py | 528 +++++ .../blended_megatron_dataset_config.py | 172 ++ megatron/core/datasets/gpt_dataset.py | 780 +++++++ megatron/core/datasets/helpers.cpp | 839 ++++++++ megatron/core/datasets/indexed_dataset.py | 864 ++++++++ megatron/core/datasets/masked_dataset.py | 431 ++++ megatron/core/datasets/megatron_dataset.py | 139 ++ megatron/core/datasets/megatron_tokenizer.py | 141 ++ megatron/core/datasets/multimodal_dataset.py | 62 + megatron/core/datasets/readme.md | 193 ++ megatron/core/datasets/retro/__init__.py | 5 + .../core/datasets/retro/config/__init__.py | 16 + .../datasets/retro/config/bert_embedders.py | 48 + megatron/core/datasets/retro/config/config.py | 135 ++ .../retro/config/gpt_chunk_datasets.py | 15 + .../core/datasets/retro/config/tokenizers.py | 15 + megatron/core/datasets/retro/db/__init__.py | 9 + megatron/core/datasets/retro/db/build.py | 631 ++++++ megatron/core/datasets/retro/db/dataset.py | 108 + megatron/core/datasets/retro/db/utils.py | 369 ++++ megatron/core/datasets/retro/external_libs.py | 19 + .../core/datasets/retro/index/__init__.py | 11 + megatron/core/datasets/retro/index/build.py | 313 +++ megatron/core/datasets/retro/index/factory.py | 40 + megatron/core/datasets/retro/index/index.py | 134 ++ .../datasets/retro/index/indexes/__init__.py | 10 + .../retro/index/indexes/faiss_base.py | 150 ++ .../retro/index/indexes/faiss_par_add.py | 208 ++ megatron/core/datasets/retro/index/utils.py | 126 ++ .../core/datasets/retro/index/validate.py | 191 ++ .../core/datasets/retro/query/__init__.py | 1 + .../datasets/retro/query/gpt_chunk_dataset.py | 110 + .../retro/query/multi_split_gpt_dataset.py | 107 + megatron/core/datasets/retro/query/query.py | 394 ++++ .../datasets/retro/query/retro_dataset.py | 242 +++ megatron/core/datasets/retro/query/utils.py | 35 + megatron/core/datasets/retro/utils.py | 349 ++++ megatron/core/datasets/t5_dataset.py | 231 +++ megatron/core/datasets/utils.py | 88 + megatron/core/datasets/utils_s3.py | 163 ++ megatron/core/dist_checkpointing/__init__.py | 11 + megatron/core/dist_checkpointing/core.py | 77 + .../core/dist_checkpointing/dict_utils.py | 232 +++ megatron/core/dist_checkpointing/mapping.py | 523 +++++ megatron/core/dist_checkpointing/optimizer.py | 129 ++ .../core/dist_checkpointing/serialization.py | 420 ++++ .../dist_checkpointing/strategies/__init__.py | 5 + .../strategies/async_utils.py | 231 +++ .../dist_checkpointing/strategies/base.py | 185 ++ .../dist_checkpointing/strategies/common.py | 147 ++ .../strategies/filesystem_async.py | 390 ++++ .../strategies/fully_parallel.py | 876 ++++++++ .../strategies/resharding.py | 315 +++ .../strategies/state_dict_saver.py | 162 ++ .../strategies/tensorstore.py | 131 ++ .../dist_checkpointing/strategies/torch.py | 815 ++++++++ .../strategies/two_stage.py | 257 +++ .../dist_checkpointing/strategies/zarr.py | 307 +++ megatron/core/dist_checkpointing/utils.py | 158 ++ .../core/dist_checkpointing/validation.py | 528 +++++ megatron/core/distributed/__init__.py | 6 + .../distributed/distributed_data_parallel.py | 329 +++ .../distributed_data_parallel_config.py | 32 + .../core/distributed/finalize_model_grads.py | 151 ++ .../core/distributed/param_and_grad_buffer.py | 549 +++++ megatron/core/enums.py | 10 + megatron/core/fusions/__init__.py | 0 megatron/core/fusions/fused_bias_dropout.py | 73 + megatron/core/fusions/fused_bias_geglu.py | 85 + megatron/core/fusions/fused_bias_gelu.py | 50 + megatron/core/fusions/fused_bias_swiglu.py | 89 + megatron/core/fusions/fused_cross_entropy.py | 153 ++ megatron/core/fusions/fused_layer_norm.py | 169 ++ megatron/core/fusions/fused_softmax.py | 220 ++ megatron/core/inference/__init__.py | 1 + .../core/inference/ammo_support/__init__.py | 0 .../inference/ammo_support/gpt/__init__.py | 1 + .../inference/ammo_support/gpt/model_specs.py | 58 + .../ammo_support/gpt/state_dict_hooks.py | 145 ++ .../core/inference/common_inference_params.py | 28 + .../core/inference/communication_utils.py | 49 + megatron/core/inference/engines/__init__.py | 0 .../core/inference/engines/abstract_engine.py | 16 + .../core/inference/engines/mcore_engine.py | 91 + megatron/core/inference/inference_request.py | 29 + .../model_inference_wrappers/__init__.py | 0 .../abstract_model_inference_wrapper.py | 233 +++ .../model_inference_wrappers/gpt/__init__.py | 0 .../gpt/gpt_inference_wrapper.py | 84 + .../inference_wrapper_config.py | 39 + megatron/core/inference/scheduler.py | 116 ++ .../text_generation_controllers/__init__.py | 0 .../simple_text_generation_controller.py | 352 ++++ megatron/core/inference/utils.py | 16 + megatron/core/inference_params.py | 30 + megatron/core/jit.py | 11 + megatron/core/model_parallel_config.py | 324 +++ megatron/core/models/T5/__init__.py | 1 + megatron/core/models/T5/t5_model.py | 449 ++++ megatron/core/models/T5/t5_spec.py | 253 +++ megatron/core/models/__init__.py | 0 megatron/core/models/bert/__init__.py | 0 megatron/core/models/bert/bert_layer_specs.py | 95 + megatron/core/models/bert/bert_lm_head.py | 59 + megatron/core/models/bert/bert_model.py | 284 +++ megatron/core/models/bert/pooler.py | 51 + megatron/core/models/common/__init__.py | 0 .../core/models/common/embeddings/__init__.py | 0 .../embeddings/language_model_embedding.py | 137 ++ .../common/embeddings/rotary_pos_embedding.py | 255 +++ .../models/common/language_module/__init__.py | 0 .../common/language_module/language_module.py | 204 ++ .../models/common/vision_module/__init__.py | 0 .../common/vision_module/vision_module.py | 17 + megatron/core/models/gpt/__init__.py | 1 + megatron/core/models/gpt/gpt_layer_specs.py | 141 ++ megatron/core/models/gpt/gpt_model.py | 240 +++ megatron/core/models/mamba/__init__.py | 1 + .../core/models/mamba/mamba_layer_specs.py | 69 + megatron/core/models/mamba/mamba_model.py | 210 ++ megatron/core/models/multimodal/__init__.py | 0 .../core/models/multimodal/llava_model.py | 237 +++ megatron/core/models/retro/__init__.py | 13 + megatron/core/models/retro/base_attention.py | 44 + megatron/core/models/retro/config.py | 87 + .../core/models/retro/decoder_attention.py | 309 +++ megatron/core/models/retro/decoder_spec.py | 184 ++ .../core/models/retro/encoder_attention.py | 233 +++ megatron/core/models/retro/encoder_spec.py | 186 ++ megatron/core/models/retro/model.py | 100 + megatron/core/models/retro/utils.py | 24 + megatron/core/models/vision/__init__.py | 0 megatron/core/models/vision/clip_vit_model.py | 139 ++ .../models/vision/multimodal_projector.py | 58 + .../core/models/vision/vit_layer_specs.py | 52 + megatron/core/num_microbatches_calculator.py | 301 +++ megatron/core/optimizer/__init__.py | 371 ++++ megatron/core/optimizer/clip_grads.py | 193 ++ megatron/core/optimizer/distrib_optimizer.py | 1616 +++++++++++++++ megatron/core/optimizer/grad_scaler.py | 142 ++ megatron/core/optimizer/optimizer.py | 1064 ++++++++++ megatron/core/optimizer/optimizer_config.py | 116 ++ megatron/core/package_info.py | 29 + megatron/core/packed_seq_params.py | 13 + megatron/core/parallel_state.py | 1357 ++++++++++++ megatron/core/pipeline_parallel/__init__.py | 1 + .../pipeline_parallel/p2p_communication.py | 596 ++++++ megatron/core/pipeline_parallel/schedules.py | 1524 ++++++++++++++ megatron/core/requirements.txt | 1 + megatron/core/ssm/__init__.py | 0 megatron/core/ssm/mamba_block.py | 228 +++ .../core/ssm/mamba_hybrid_layer_allocation.py | 191 ++ megatron/core/ssm/mamba_layer.py | 80 + megatron/core/ssm/mamba_mixer.py | 518 +++++ megatron/core/ssm/triton_cache_manager.py | 44 + megatron/core/tensor_parallel/__init__.py | 75 + .../core/tensor_parallel/cross_entropy.py | 232 +++ megatron/core/tensor_parallel/data.py | 104 + megatron/core/tensor_parallel/layers.py | 1147 +++++++++++ megatron/core/tensor_parallel/mappings.py | 530 +++++ megatron/core/tensor_parallel/random.py | 301 +++ megatron/core/tensor_parallel/utils.py | 118 ++ megatron/core/timers.py | 398 ++++ megatron/core/transformer/__init__.py | 6 + megatron/core/transformer/attention.py | 620 ++++++ .../transformer/custom_layers/__init__.py | 0 .../custom_layers/transformer_engine.py | 908 ++++++++ .../core/transformer/dot_product_attention.py | 205 ++ megatron/core/transformer/enums.py | 27 + megatron/core/transformer/identity_op.py | 28 + megatron/core/transformer/mlp.py | 255 +++ megatron/core/transformer/module.py | 190 ++ megatron/core/transformer/moe/README.md | 215 ++ megatron/core/transformer/moe/__init__.py | 0 megatron/core/transformer/moe/experts.py | 571 ++++++ .../core/transformer/moe/grouped_gemm_util.py | 20 + megatron/core/transformer/moe/moe_layer.py | 121 ++ megatron/core/transformer/moe/moe_utils.py | 505 +++++ megatron/core/transformer/moe/router.py | 313 +++ .../core/transformer/moe/token_dispatcher.py | 592 ++++++ megatron/core/transformer/spec_utils.py | 109 + megatron/core/transformer/torch_layer_norm.py | 43 + .../core/transformer/transformer_block.py | 492 +++++ .../core/transformer/transformer_config.py | 440 ++++ .../core/transformer/transformer_layer.py | 255 +++ megatron/core/transformer/utils.py | 188 ++ megatron/core/utils.py | 1242 +++++++++++ megatron/inference/__init__.py | 1 + megatron/inference/arguments.py | 26 + megatron/inference/checkpointing.py | 135 ++ megatron/inference/gpt/__init__.py | 1 + megatron/inference/gpt/model_provider.py | 89 + megatron/inference/static/index.html | 124 ++ .../inference/text_generation/__init__.py | 7 + megatron/inference/text_generation/api.py | 213 ++ .../inference/text_generation/beam_utils.py | 64 + .../text_generation/communication.py | 185 ++ .../inference/text_generation/forward_step.py | 164 ++ .../inference/text_generation/generation.py | 437 ++++ .../inference/text_generation/sampling.py | 93 + .../inference/text_generation/tokenization.py | 135 ++ megatron/inference/text_generation_server.py | 241 +++ megatron/legacy/data/__init__.py | 0 megatron/legacy/data/autoaugment.py | 320 +++ .../legacy/data/biencoder_dataset_utils.py | 209 ++ megatron/legacy/data/data_samplers.py | 192 ++ megatron/legacy/data/dataset_utils.py | 726 +++++++ megatron/legacy/data/ict_dataset.py | 156 ++ megatron/legacy/data/image_folder.py | 302 +++ megatron/legacy/data/multimodal_dataset.py | 54 + megatron/legacy/data/orqa_wiki_dataset.py | 193 ++ megatron/legacy/data/realm_dataset_utils.py | 199 ++ megatron/legacy/data/realm_index.py | 224 ++ megatron/legacy/data/vit_dataset.py | 249 +++ .../legacy/fp16_deprecated/loss_scaler.py | 26 + megatron/legacy/fused_kernels/__init__.py | 75 + megatron/legacy/fused_kernels/compat.h | 17 + .../legacy/fused_kernels/tests/__init__.py | 0 .../fused_kernels/tests/test_fused_kernels.py | 388 ++++ megatron/legacy/fused_kernels/type_shim.h | 103 + megatron/legacy/indexer.py | 129 ++ megatron/legacy/model/__init__.py | 10 + megatron/legacy/model/bert_model.py | 257 +++ megatron/legacy/model/biencoder_model.py | 328 +++ megatron/legacy/model/classification.py | 101 + megatron/legacy/model/enums.py | 21 + megatron/legacy/model/fused_bias_gelu.py | 44 + megatron/legacy/model/fused_layer_norm.py | 99 + megatron/legacy/model/fused_softmax.py | 234 +++ megatron/legacy/model/gpt_model.py | 122 ++ megatron/legacy/model/language_model.py | 627 ++++++ megatron/legacy/model/module.py | 206 ++ megatron/legacy/model/multiple_choice.py | 112 + megatron/legacy/model/realm_model.py | 204 ++ megatron/legacy/model/rms_norm.py | 31 + megatron/legacy/model/t5_model.py | 186 ++ megatron/legacy/model/transformer.py | 1818 +++++++++++++++++ megatron/legacy/model/utils.py | 79 + .../legacy/model/vision/classification.py | 86 + megatron/legacy/model/vision/dino.py | 291 +++ .../model/vision/esvit_swin_backbone.py | 849 ++++++++ megatron/legacy/model/vision/inpainting.py | 152 ++ megatron/legacy/model/vision/knn_monitor.py | 129 ++ megatron/legacy/model/vision/mit_backbone.py | 415 ++++ megatron/legacy/model/vision/swin_backbone.py | 625 ++++++ megatron/legacy/model/vision/utils.py | 27 + megatron/legacy/model/vision/vit_backbone.py | 248 +++ megatron/legacy/mpu/tests/__init__.py | 0 megatron/legacy/mpu/tests/commons.py | 70 + .../legacy/mpu/tests/test_cross_entropy.py | 95 + megatron/legacy/mpu/tests/test_data.py | 75 + megatron/legacy/mpu/tests/test_initialize.py | 82 + megatron/legacy/mpu/tests/test_layers.py | 517 +++++ megatron/legacy/mpu/tests/test_random.py | 191 ++ megatron/training/__init__.py | 18 + megatron/training/activations.py | 18 + megatron/training/arguments.py | 1793 ++++++++++++++++ megatron/training/async_utils.py | 43 + megatron/training/checkpointing.py | 972 +++++++++ megatron/training/dist_signal_handler.py | 81 + megatron/training/global_vars.py | 223 ++ megatron/training/initialize.py | 423 ++++ megatron/training/log_handler.py | 24 + megatron/training/one_logger_utils.py | 463 +++++ .../training/optimizer_param_scheduler.py | 249 +++ megatron/training/theoretical_memory_usage.py | 187 ++ megatron/training/tokenizer/__init__.py | 4 + .../training/tokenizer/bert_tokenization.py | 431 ++++ .../training/tokenizer/gpt2_tokenization.py | 321 +++ megatron/training/tokenizer/tokenizer.py | 859 ++++++++ megatron/training/training.py | 1549 ++++++++++++++ megatron/training/utils.py | 386 ++++ megatron/training/yaml_arguments.py | 456 +++++ pretrain_bert.py | 192 ++ pretrain_gpt.py | 251 +++ pretrain_ict.py | 166 ++ pretrain_mamba.py | 239 +++ pretrain_retro.py | 244 +++ pretrain_t5.py | 263 +++ pretrain_vision_classify.py | 105 + pretrain_vision_dino.py | 105 + pretrain_vision_inpaint.py | 141 ++ pretrain_vlm.py | 221 ++ pyproject.toml | 24 + setup.py | 109 + tasks/data_utils.py | 105 + tasks/ensemble_classifier.py | 149 ++ tasks/eval_utils.py | 182 ++ tasks/finetune_utils.py | 304 +++ tasks/glue/data.py | 56 + tasks/glue/finetune.py | 81 + tasks/glue/mnli.py | 71 + tasks/glue/qqp.py | 88 + tasks/main.py | 100 + tasks/msdp/README.md | 19 + tasks/msdp/evaluate.py | 45 + tasks/msdp/main.py | 66 + tasks/msdp/metrics.py | 77 + tasks/msdp/preprocessing.py | 582 ++++++ tasks/msdp/prompt.py | 309 +++ tasks/orqa/README.md | 36 + tasks/orqa/evaluate_orqa.py | 39 + tasks/orqa/evaluate_utils.py | 175 ++ tasks/orqa/supervised/data.py | 287 +++ tasks/orqa/supervised/eval_utils.py | 193 ++ tasks/orqa/supervised/finetune.py | 238 +++ tasks/orqa/unsupervised/nq.py | 215 ++ tasks/orqa/unsupervised/qa_utils.py | 177 ++ tasks/orqa/unsupervised/tokenizers.py | 243 +++ tasks/race/data.py | 135 ++ tasks/race/finetune.py | 55 + tasks/vision/classification/classification.py | 81 + tasks/vision/classification/eval_utils.py | 116 ++ tasks/vision/finetune_utils.py | 297 +++ tasks/vision/main.py | 53 + tasks/vision/segmentation/cityscapes.py | 207 ++ tasks/vision/segmentation/data.py | 154 ++ .../vision/segmentation/finetune_segformer.py | 239 +++ tasks/vision/segmentation/finetune_setr.py | 213 ++ tasks/vision/segmentation/metrics.py | 594 ++++++ tasks/vision/segmentation/seg_heads.py | 127 ++ tasks/vision/segmentation/seg_models.py | 79 + tasks/vision/segmentation/transforms.py | 433 ++++ tasks/vision/segmentation/utils.py | 85 + tasks/zeroshot_gpt/datasets.py | 148 ++ tasks/zeroshot_gpt/detokenizer.py | 67 + tasks/zeroshot_gpt/evaluate.py | 210 ++ tests/__init__.py | 0 tests/functional_tests/__init__.py | 0 .../functional_tests/jet_recipes/MR-bert.yaml | 58 + .../jet_recipes/MR-gpt-nemo.yaml | 46 + .../functional_tests/jet_recipes/MR-gpt.yaml | 119 ++ .../jet_recipes/MR-multimodal.yaml | 55 + tests/functional_tests/jet_recipes/MR-t5.yaml | 52 + .../jet_recipes/build-pyt.yaml | 23 + .../jet_recipes/local-generator.py | 84 + .../jet_recipes/monthly-t5.yaml | 56 + .../jet_recipes/nightly-bert.yaml | 52 + .../jet_recipes/nightly-gpt.yaml | 69 + .../jet_recipes/weekly-gpt.yaml | 60 + .../python_test_utils/__init__.py | 0 .../python_test_utils/common.py | 81 + .../get_test_results_from_tensorboard_logs.py | 32 + .../python_test_utils/jet_test_pipeline.py | 142 ++ .../multitest_ci_pipeline.py | 47 + .../python_test_utils/test_ci_pipeline.py | 97 + .../python_test_utils/test_fp8_ci_pipeline.py | 124 ++ .../test_resume_checkpoint_pipeline.py | 70 + .../shell_test_utils/_run_local_training.sh | 85 + .../shell_test_utils/restart_jet_log_jobs.sh | 123 ++ .../shell_test_utils/run_release_record.sh | 106 + ...m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json | 1 + ...ghtly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json | 1 + ...m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json | 1 + ...rt_345m_nightly_dgx_a100_1N8G_tp1_pp2.json | 1 + ...rt_345m_nightly_dgx_a100_1N8G_tp4_pp1.json | 1 + .../bert_mr_mcore_tp2_pp2_dgx_a100_1N8G.json | 1 + ...core_tp2_pp2_local_spec_dgx_a100_1N8G.json | 70 + .../bert_mr_resume_tp1_pp2dgx_a100_1N8G_.json | 1 + .../bert_mr_tp1_pp4_vp2_dgx_a100_1N8G.json | 1 + .../jet/bert_mr_tp2_pp2_dgx_a100_1N8G.json | 1 + ...izer_overlap_grad_reduce_param_gather.json | 1 + ...izer_overlap_grad_reduce_param_gather.json | 1 + ...m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json | 1 + ..._1N8G_mcore_tp1_pp2_resume_torch_dist.json | 1 + ...m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json | 1 + ..._1N8G_mcore_tp1_pp4_resume_torch_dist.json | 1 + ...tp2_pp2_resume_torch_dist_te_2experts.json | 1 + ...esume_torch_dist_te_4experts2parallel.json | 1 + ...x_a100_1N8G_mcore_tp2_pp2_te_2experts.json | 1 + ...8G_mcore_tp2_pp2_te_4experts2parallel.json | 1 + ...m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json | 1 + ..._a100_1N8G_mcore_tp4_pp1_resume_torch.json | 1 + ..._1N8G_mcore_tp4_pp1_resume_torch_dist.json | 1 + ...p1_dist_optimizer_overlap_grad_reduce.json | 1 + ...a100_1N8G_tp1_pp1_overlap_grad_reduce.json | 1 + ...t3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json | 1 + ...ly_dgx_a100_1N8G_tp1_pp2_resume_torch.json | 1 + ...t3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json | 1 + ...a100_1N8G_tp1_pp4_overlap_grad_reduce.json | 1 + ...ly_dgx_a100_1N8G_tp1_pp4_resume_torch.json | 1 + ..._1N8G_tp1_pp4_vp1_overlap_grad_reduce.json | 1 + ...ightly_dgx_a100_1N8G_tp2_pp2_4experts.json | 1 + ...a100_1N8G_tp2_pp2_overlap_grad_reduce.json | 1 + ...00_1N8G_tp2_pp2_resume_torch_4experts.json | 1 + ..._pp2_resume_torch_overlap_grad_reduce.json | 1 + ...t3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json | 1 + ...a100_1N8G_tp4_pp1_overlap_grad_reduce.json | 1 + ...ly_dgx_a100_1N8G_tp4_pp1_resume_torch.json | 1 + ..._tp1_pp1_dist_optimizer_dgx_a100_1N8G.json | 1 + ...mizer_no_mmap_bin_files_dgx_a100_1N8G.json | 1 + ..._uniform_full_recompute_dgx_a100_1N8G.json | 1 + ...tp1_pp2_rope_embeddings_dgx_a100_1N8G.json | 1 + ...s_interleaved_no_fusion_dgx_a100_1N8G.json | 1 + ...pp4_disable_bias_linear_dgx_a100_1N8G.json | 1 + ...1_pp4_sequence_parallel_dgx_a100_1N8G.json | 1 + ...mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G.json | 1 + ..._embeddings_and_outputs_dgx_a100_1N8G.json | 1 + ...alculate_per_token_loss_dgx_a100_1N8G.json | 1 + ...p1_pp4_vp1_decoupled_lr_dgx_a100_1N8G.json | 1 + ...mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G.json | 1 + ...zer_overlap_grad_reduce_dgx_a100_1N8G.json | 1 + ...rad_reduce_param_gather_dgx_a100_1N8G.json | 1 + ...rlap_grad_reduce_untied_dgx_a100_1N8G.json | 1 + ...p1_cp2_nondeterministic_dgx_a100_1N8G.json | 1 + ...p1_te_8experts2parallel_dgx_a100_1N8G.json | 1 + ...parallel_dist_optimizer_dgx_a100_1N8G.json | 1 + ...ts2parallel_groupedGEMM_dgx_a100_1N8G.json | 1 + ...aram_gather_groupedGEMM_dgx_a100_1N8G.json | 1 + ...rts2parallel_top2router_dgx_a100_1N8G.json | 1 + ...p2_cp2_nondeterministic_dgx_a100_1N8G.json | 1 + ...oss_entropy_loss_fusion_dgx_a100_1N8G.json | 1 + ...p_average_in_collective_dgx_a100_1N8G.json | 1 + ...embedding_wgrad_compute_dgx_a100_1N8G.json | 1 + ...pt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G.json | 1 + ...tion_mask_in_dataloader_dgx_a100_1N8G.json | 1 + ...2_pp2_no_mmap_bin_files_dgx_a100_1N8G.json | 1 + ...zer_overlap_grad_reduce_dgx_a100_1N8G.json | 1 + ...rad_reduce_param_gather_dgx_a100_1N8G.json | 1 + ..._qk_layernorm_test_mode_dgx_a100_1N8G.json | 1 + .../gpt3_mr_resume_dgx_a100_1N8G_tp1_pp2.json | 1 + .../jet/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G.json | 1 + .../gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G.json | 1 + .../jet/gpt3_mr_tp2_pp2_dgx_a100_1N8G.json | 1 + ...ava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G.json | 1 + ...alculate_per_token_loss_dgx_a100_1N8G.json | 1 + .../bert/pretrain_bert_distributed_test.sh | 139 ++ .../gpt3/pretrain_gpt3_distributed_test.sh | 206 ++ .../gpt3/pretrain_gpt3_nemo_test.sh | 65 + .../pretrain_llava_distributed_test.sh | 194 ++ .../retro/pretrain_retro_distributed_test.sh | 168 ++ .../t5/pretrain_t5_distributed_test.sh | 155 ++ tests/unit_tests/__init__.py | 2 + tests/unit_tests/data/__init__.py | 0 tests/unit_tests/data/test_bin_reader.py | 162 ++ tests/unit_tests/data/test_builder.py | 367 ++++ tests/unit_tests/data/test_gpt_dataset.py | 116 ++ .../data/test_multimodal_dataset.py | 58 + tests/unit_tests/data/test_preprocess_data.py | 241 +++ .../unit_tests/data/test_preprocess_mmdata.py | 221 ++ .../unit_tests/dist_checkpointing/__init__.py | 57 + .../unit_tests/dist_checkpointing/conftest.py | 37 + .../dist_checkpointing/models/__init__.py | 0 .../dist_checkpointing/models/common.py | 160 ++ .../models/test_bert_model.py | 86 + .../models/test_gpt_model.py | 80 + .../models/test_grouped_mlp.py | 165 ++ .../dist_checkpointing/models/test_mlp_glu.py | 67 + .../models/test_retro_model.py | 76 + .../models/test_sequential_mlp.py | 215 ++ .../models/test_t5_model.py | 85 + .../dist_checkpointing/test_async_save.py | 97 + .../test_cached_metadata.py | 85 + .../test_flattened_resharding.py | 190 ++ .../dist_checkpointing/test_fully_parallel.py | 220 ++ .../dist_checkpointing/test_mapping.py | 126 ++ .../dist_checkpointing/test_optimizer.py | 513 +++++ .../dist_checkpointing/test_serialization.py | 554 +++++ .../distributed/test_param_and_grad_buffer.py | 175 ++ .../unit_tests/fusions/test_torch_softmax.py | 44 + tests/unit_tests/inference/__init__.py | 0 .../unit_tests/inference/engines/__init__.py | 0 .../inference/engines/test_mcore_engine.py | 64 + .../model_inference_wrappers/__init__.py | 0 .../gpt/test_gpt_inference_wrapper.py | 81 + .../test_model_inference_wrapper_config.py | 15 + .../inference/test_common_inference_params.py | 8 + .../inference/test_inference_utils.py | 11 + .../inference/test_modelopt_gpt_model.py | 44 + tests/unit_tests/inference/test_scheduler.py | 63 + .../text_generation_controllers/__init__.py | 0 .../test_simple_text_generation_controller.py | 115 ++ tests/unit_tests/models/__init__.py | 0 .../unit_tests/models/test_base_embedding.py | 58 + tests/unit_tests/models/test_bert_model.py | 77 + .../unit_tests/models/test_clip_vit_model.py | 54 + tests/unit_tests/models/test_gpt_model.py | 75 + tests/unit_tests/models/test_llava_model.py | 122 ++ .../models/test_multimodal_projector.py | 68 + tests/unit_tests/models/test_t5_model.py | 85 + .../unit_tests/pipeline_parallel/__init__.py | 0 .../pipeline_parallel/test_schedules.py | 212 ++ tests/unit_tests/tensor_parallel/__init__.py | 0 .../tensor_parallel/test_cross_entropy.py | 14 + tests/unit_tests/tensor_parallel/test_data.py | 21 + .../tensor_parallel/test_initialization.py | 97 + .../unit_tests/tensor_parallel/test_layers.py | 54 + .../tensor_parallel/test_mappings.py | 135 ++ .../unit_tests/tensor_parallel/test_random.py | 44 + .../test_tensor_parallel_utils.py | 43 + tests/unit_tests/test_basic.py | 3 + tests/unit_tests/test_imports.py | 157 ++ .../unit_tests/test_local_multi_tensor_fns.py | 36 + .../test_num_microbatches_calculator.py | 128 ++ tests/unit_tests/test_optimizer.py | 66 + tests/unit_tests/test_parallel_state.py | 468 +++++ tests/unit_tests/test_training.py | 43 + tests/unit_tests/test_utilities.py | 82 + tests/unit_tests/test_utils.py | 183 ++ tests/unit_tests/transformer/__init__.py | 0 tests/unit_tests/transformer/moe/__init__.py | 0 .../moe/test_a2a_token_dispatcher.py | 84 + .../transformer/moe/test_aux_loss.py | 97 + .../transformer/moe/test_grouped_mlp.py | 358 ++++ .../transformer/moe/test_routers.py | 91 + .../transformer/moe/test_sequential_mlp.py | 61 + .../transformer/moe/test_token_dispatcher.py | 290 +++ .../unit_tests/transformer/test_attention.py | 111 + .../transformer/test_attention_packed_seq.py | 131 ++ .../transformer/test_core_attention.py | 64 + tests/unit_tests/transformer/test_mlp.py | 58 + tests/unit_tests/transformer/test_module.py | 98 + .../transformer/test_retro_attention.py | 207 ++ tests/unit_tests/transformer/test_rope.py | 52 + .../transformer/test_spec_customization.py | 243 +++ .../transformer/test_transformer_block.py | 107 + .../transformer/test_transformer_layer.py | 106 + tools/autoformat.sh | 21 + tools/bert_embedding/__init__.py | 3 + tools/bert_embedding/dataset.py | 55 + tools/bert_embedding/embed.py | 278 +++ tools/bert_embedding/external_libs.py | 14 + tools/bert_embedding/huggingface.py | 126 ++ tools/checkpoint/convert.py | 154 ++ tools/checkpoint/hybrid_conversion.py | 398 ++++ tools/checkpoint/loader_llama_mistral.py | 667 ++++++ tools/checkpoint/loader_mcore.py | 383 ++++ tools/checkpoint/loader_megatron.py | 371 ++++ tools/checkpoint/loader_mixtral_hf.py | 335 +++ tools/checkpoint/saver_mcore.py | 796 ++++++++ tools/checkpoint/saver_megatron.py | 410 ++++ tools/checkpoint/setter.py | 113 + tools/checkpoint/utils.py | 23 + tools/linter.py | 36 + tools/merge_datasets.py | 93 + tools/openwebtext/README.md | 59 + tools/openwebtext/add_id.py | 54 + tools/openwebtext/blacklist_urls.py | 302 +++ tools/openwebtext/cleanup_dataset.py | 102 + tools/openwebtext/cleanup_fix_dataset.py | 178 ++ tools/openwebtext/filter_ngrams.py | 479 +++++ tools/openwebtext/find_duplicates.py | 292 +++ tools/openwebtext/group_duplicate_url.py | 77 + tools/openwebtext/merge_jsons.py | 42 + tools/openwebtext/remove_group_duplicates.py | 56 + tools/preprocess_data.py | 409 ++++ tools/preprocess_data_nmt.py | 111 + tools/preprocess_mmdata.py | 170 ++ tools/report_theoretical_memory.py | 14 + tools/retro/README.md | 256 +++ tools/retro/build_db.md | 421 ++++ tools/retro/cli/__init__.py | 3 + tools/retro/cli/__main__.py | 9 + tools/retro/cli/cli.py | 301 +++ tools/retro/config_utils.py | 632 ++++++ tools/retro/docker/Dockerfile | 19 + tools/retro/preprocess_data.py | 295 +++ tools/retro/sft/README.md | 3 + tools/retro/sft/dataset_conv.py | 446 ++++ tools/retro/sft/open_inst.sh | 1 + tools/retro/sft/sft_retro.py | 275 +++ tools/retro/sft/sft_retro_lm.sh | 150 ++ tools/retro/text_generation/evaluate.py | 200 ++ tools/retro/text_generation/metrics.py | 80 + tools/retro/text_generation/retro_api.py | 221 ++ tools/retro/text_generation/retro_generate.sh | 125 ++ .../retro/text_generation/retro_generation.py | 250 +++ .../text_generation/retro_text_generation.py | 263 +++ tools/run_mamba_text_generation_server.py | 121 ++ tools/run_text_generation_server.py | 133 ++ tools/run_vlm_text_generation.py | 218 ++ tools/text_generation_cli.py | 23 + 704 files changed, 105619 insertions(+) create mode 100644 .coveragerc create mode 100644 .github/ISSUE_TEMPLATE/bug.md create mode 100644 .github/ISSUE_TEMPLATE/enhancement.md create mode 100644 .github/ISSUE_TEMPLATE/question.md create mode 100644 .github/ISSUE_TEMPLATE/regression.md create mode 100644 .github/workflows/stale.yml create mode 100644 .gitignore create mode 100644 .gitlab-ci.yml create mode 100644 CODEOWNERS create mode 100644 CONTRIBUTING.md create mode 100644 Dockerfile.ci create mode 100644 Dockerfile.linting create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 docs/llama_mistral.md create mode 100644 docs/source/api-guide/context_parallel.rst create mode 100644 docs/source/api-guide/datasets.rst create mode 100644 docs/source/api-guide/dist_checkpointing.rst create mode 100644 docs/source/api-guide/dist_checkpointing.strategies.rst create mode 100644 docs/source/api-guide/distributed.rst create mode 100644 docs/source/api-guide/fusions.rst create mode 100644 docs/source/api-guide/index.rst create mode 100644 docs/source/api-guide/models.bert.rst create mode 100644 docs/source/api-guide/models.gpt.rst create mode 100644 docs/source/api-guide/models.rst create mode 100644 docs/source/api-guide/models.t5.rst create mode 100644 docs/source/api-guide/moe.rst create mode 100644 docs/source/api-guide/num_microbatches_calculator.rst create mode 100644 docs/source/api-guide/pipeline_parallel.rst create mode 100644 docs/source/api-guide/tensor_parallel.rst create mode 100644 docs/source/api-guide/transformer.rst create mode 100644 docs/source/distrib_optimizer.md create mode 100644 docs/source/images/context_parallel/CP_overview.png create mode 100644 docs/source/images/context_parallel/CP_results.png create mode 100644 docs/source/images/distrib_optimizer/data_flow.png create mode 100644 docs/source/images/distrib_optimizer/sharding_scheme.png create mode 100644 docs/source/index.rst create mode 100644 docs/source/user-guide/index.rst create mode 100644 examples/academic_paper_scripts/detxoify_lm/README.md create mode 100644 examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py create mode 100644 examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py create mode 100644 examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh create mode 100644 examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py create mode 100644 examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh create mode 100644 examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh create mode 100644 examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py create mode 100644 examples/academic_paper_scripts/detxoify_lm/perspective_api.py create mode 100644 examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh create mode 100644 examples/academic_paper_scripts/msdp/README.md create mode 100644 examples/academic_paper_scripts/msdp/data_processing.sh create mode 100644 examples/academic_paper_scripts/msdp/eval_knwl_generation.sh create mode 100644 examples/academic_paper_scripts/msdp/eval_resp_generation.sh create mode 100644 examples/academic_paper_scripts/msdp/prep_resp_gen.sh create mode 100644 examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh create mode 100644 examples/academic_paper_scripts/msdp/prompt_resp_gen.sh create mode 100644 examples/academic_paper_scripts/sc21/CONFIG.sh create mode 100644 examples/academic_paper_scripts/sc21/README.md create mode 100644 examples/academic_paper_scripts/sc21/SBATCH.sh create mode 100644 examples/academic_paper_scripts/sc21/SRUN.sh create mode 100644 examples/academic_paper_scripts/sc21/run_figure_11.sh create mode 100644 examples/academic_paper_scripts/sc21/run_figure_12.sh create mode 100644 examples/academic_paper_scripts/sc21/run_figure_13.sh create mode 100644 examples/academic_paper_scripts/sc21/run_figure_14.sh create mode 100644 examples/academic_paper_scripts/sc21/run_figure_15.sh create mode 100644 examples/academic_paper_scripts/sc21/run_figure_16.sh create mode 100644 examples/academic_paper_scripts/sc21/run_figure_17.sh create mode 100644 examples/academic_paper_scripts/sc21/run_figure_18.sh create mode 100644 examples/academic_paper_scripts/sc21/run_table_1.sh create mode 100644 examples/bert/README.md create mode 100644 examples/bert/train_bert_340m_distributed.sh create mode 100644 examples/gpt3/README.md create mode 100644 examples/gpt3/gpt_config.yaml create mode 100644 examples/gpt3/train_gpt3_175b_distributed.sh create mode 100644 examples/inference/README.md create mode 100644 examples/inference/gpt/simple_gpt_batch_inference.py create mode 100644 examples/inference/quantization/README.md create mode 100644 examples/inference/quantization/ptq_trtllm_llama_7b.sh create mode 100644 examples/inference/quantization/ptq_trtllm_nemotron3_8b.sh create mode 100644 examples/inference/quantization/text_generation_ptq.py create mode 100644 examples/inference/quantization/trtllm_text_generation.py create mode 100644 examples/inference/run_text_generation_server_345M.sh create mode 100644 examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh create mode 100644 examples/mamba/.gitignore create mode 100644 examples/mamba/Dockerfile create mode 100644 examples/mamba/README.md create mode 100644 examples/mamba/run_text_gen_server_8b.sh create mode 100644 examples/mamba/run_text_gen_server_8b_gpt3.sh create mode 100644 examples/mamba/train.sh create mode 100644 examples/mixtral/README.md create mode 100644 examples/mixtral/train_mixtral_8x7b_distributed.sh create mode 100644 examples/multimodal/Dockerfile create mode 100644 examples/multimodal/README.md create mode 100644 examples/multimodal/assets/pretrain_curves.png create mode 100644 examples/multimodal/clip_converter.py create mode 100644 examples/multimodal/combine_mistral_clip.sh create mode 100644 examples/multimodal/combine_state_dicts.py create mode 100644 examples/multimodal/config.py create mode 100644 examples/multimodal/convert_llava_pretrain_to_wds.py create mode 100644 examples/multimodal/dataloader_provider.py create mode 100644 examples/multimodal/dataset_helpers.py create mode 100644 examples/multimodal/evaluate_coco.py create mode 100644 examples/multimodal/evaluate_mmmu.py create mode 100644 examples/multimodal/evaluate_textvqa.py create mode 100644 examples/multimodal/evaluate_vqav2.py create mode 100644 examples/multimodal/layer_specs.py create mode 100644 examples/multimodal/manual_prompts.json create mode 100644 examples/multimodal/pretrain_dataset.yaml create mode 100644 examples/multimodal/pretrain_mistral_clip.sh create mode 100644 examples/multimodal/run_text_generation.py create mode 100644 examples/multimodal/sft_dataset.yaml create mode 100644 examples/multimodal/sft_mistral_clip.sh create mode 100644 examples/multimodal/text_generation_mistral_clip.sh create mode 100644 examples/multimodal/train.py create mode 100644 examples/retro/README.md create mode 100644 examples/retro/preprocess_data.sh create mode 100644 examples/retro/train_retro_2b_distributed.sh create mode 100644 examples/run_simple_mcore_train_loop.py create mode 100644 examples/t5/README.md create mode 100644 examples/t5/t5_mcore_train_curve.png create mode 100644 examples/t5/train_t5_220m_distributed.sh create mode 100644 images/model_table.png create mode 100644 images/strong_scaling.png create mode 100644 images/weak_scaling.png create mode 100644 jet-tests.yml create mode 100644 megatron/core/QuickStart.md create mode 100644 megatron/core/README.md create mode 100644 megatron/core/README_STRAGGLER.md create mode 100644 megatron/core/__init__.py create mode 100644 megatron/core/datasets/Makefile create mode 100644 megatron/core/datasets/__init__.py create mode 100644 megatron/core/datasets/bert_dataset.py create mode 100644 megatron/core/datasets/blended_dataset.py create mode 100644 megatron/core/datasets/blended_megatron_dataset_builder.py create mode 100644 megatron/core/datasets/blended_megatron_dataset_config.py create mode 100644 megatron/core/datasets/gpt_dataset.py create mode 100644 megatron/core/datasets/helpers.cpp create mode 100644 megatron/core/datasets/indexed_dataset.py create mode 100644 megatron/core/datasets/masked_dataset.py create mode 100644 megatron/core/datasets/megatron_dataset.py create mode 100644 megatron/core/datasets/megatron_tokenizer.py create mode 100644 megatron/core/datasets/multimodal_dataset.py create mode 100644 megatron/core/datasets/readme.md create mode 100644 megatron/core/datasets/retro/__init__.py create mode 100644 megatron/core/datasets/retro/config/__init__.py create mode 100644 megatron/core/datasets/retro/config/bert_embedders.py create mode 100644 megatron/core/datasets/retro/config/config.py create mode 100644 megatron/core/datasets/retro/config/gpt_chunk_datasets.py create mode 100644 megatron/core/datasets/retro/config/tokenizers.py create mode 100644 megatron/core/datasets/retro/db/__init__.py create mode 100644 megatron/core/datasets/retro/db/build.py create mode 100644 megatron/core/datasets/retro/db/dataset.py create mode 100644 megatron/core/datasets/retro/db/utils.py create mode 100644 megatron/core/datasets/retro/external_libs.py create mode 100644 megatron/core/datasets/retro/index/__init__.py create mode 100644 megatron/core/datasets/retro/index/build.py create mode 100644 megatron/core/datasets/retro/index/factory.py create mode 100644 megatron/core/datasets/retro/index/index.py create mode 100644 megatron/core/datasets/retro/index/indexes/__init__.py create mode 100644 megatron/core/datasets/retro/index/indexes/faiss_base.py create mode 100644 megatron/core/datasets/retro/index/indexes/faiss_par_add.py create mode 100644 megatron/core/datasets/retro/index/utils.py create mode 100644 megatron/core/datasets/retro/index/validate.py create mode 100644 megatron/core/datasets/retro/query/__init__.py create mode 100644 megatron/core/datasets/retro/query/gpt_chunk_dataset.py create mode 100644 megatron/core/datasets/retro/query/multi_split_gpt_dataset.py create mode 100644 megatron/core/datasets/retro/query/query.py create mode 100644 megatron/core/datasets/retro/query/retro_dataset.py create mode 100644 megatron/core/datasets/retro/query/utils.py create mode 100644 megatron/core/datasets/retro/utils.py create mode 100644 megatron/core/datasets/t5_dataset.py create mode 100644 megatron/core/datasets/utils.py create mode 100644 megatron/core/datasets/utils_s3.py create mode 100644 megatron/core/dist_checkpointing/__init__.py create mode 100644 megatron/core/dist_checkpointing/core.py create mode 100644 megatron/core/dist_checkpointing/dict_utils.py create mode 100644 megatron/core/dist_checkpointing/mapping.py create mode 100644 megatron/core/dist_checkpointing/optimizer.py create mode 100644 megatron/core/dist_checkpointing/serialization.py create mode 100644 megatron/core/dist_checkpointing/strategies/__init__.py create mode 100644 megatron/core/dist_checkpointing/strategies/async_utils.py create mode 100644 megatron/core/dist_checkpointing/strategies/base.py create mode 100644 megatron/core/dist_checkpointing/strategies/common.py create mode 100644 megatron/core/dist_checkpointing/strategies/filesystem_async.py create mode 100644 megatron/core/dist_checkpointing/strategies/fully_parallel.py create mode 100644 megatron/core/dist_checkpointing/strategies/resharding.py create mode 100644 megatron/core/dist_checkpointing/strategies/state_dict_saver.py create mode 100644 megatron/core/dist_checkpointing/strategies/tensorstore.py create mode 100644 megatron/core/dist_checkpointing/strategies/torch.py create mode 100644 megatron/core/dist_checkpointing/strategies/two_stage.py create mode 100644 megatron/core/dist_checkpointing/strategies/zarr.py create mode 100644 megatron/core/dist_checkpointing/utils.py create mode 100644 megatron/core/dist_checkpointing/validation.py create mode 100644 megatron/core/distributed/__init__.py create mode 100644 megatron/core/distributed/distributed_data_parallel.py create mode 100644 megatron/core/distributed/distributed_data_parallel_config.py create mode 100644 megatron/core/distributed/finalize_model_grads.py create mode 100644 megatron/core/distributed/param_and_grad_buffer.py create mode 100644 megatron/core/enums.py create mode 100644 megatron/core/fusions/__init__.py create mode 100644 megatron/core/fusions/fused_bias_dropout.py create mode 100644 megatron/core/fusions/fused_bias_geglu.py create mode 100644 megatron/core/fusions/fused_bias_gelu.py create mode 100644 megatron/core/fusions/fused_bias_swiglu.py create mode 100644 megatron/core/fusions/fused_cross_entropy.py create mode 100644 megatron/core/fusions/fused_layer_norm.py create mode 100644 megatron/core/fusions/fused_softmax.py create mode 100644 megatron/core/inference/__init__.py create mode 100644 megatron/core/inference/ammo_support/__init__.py create mode 100644 megatron/core/inference/ammo_support/gpt/__init__.py create mode 100644 megatron/core/inference/ammo_support/gpt/model_specs.py create mode 100644 megatron/core/inference/ammo_support/gpt/state_dict_hooks.py create mode 100644 megatron/core/inference/common_inference_params.py create mode 100644 megatron/core/inference/communication_utils.py create mode 100644 megatron/core/inference/engines/__init__.py create mode 100644 megatron/core/inference/engines/abstract_engine.py create mode 100644 megatron/core/inference/engines/mcore_engine.py create mode 100644 megatron/core/inference/inference_request.py create mode 100644 megatron/core/inference/model_inference_wrappers/__init__.py create mode 100644 megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py create mode 100644 megatron/core/inference/model_inference_wrappers/gpt/__init__.py create mode 100644 megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py create mode 100644 megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py create mode 100644 megatron/core/inference/scheduler.py create mode 100644 megatron/core/inference/text_generation_controllers/__init__.py create mode 100644 megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py create mode 100644 megatron/core/inference/utils.py create mode 100644 megatron/core/inference_params.py create mode 100644 megatron/core/jit.py create mode 100644 megatron/core/model_parallel_config.py create mode 100644 megatron/core/models/T5/__init__.py create mode 100644 megatron/core/models/T5/t5_model.py create mode 100644 megatron/core/models/T5/t5_spec.py create mode 100644 megatron/core/models/__init__.py create mode 100644 megatron/core/models/bert/__init__.py create mode 100644 megatron/core/models/bert/bert_layer_specs.py create mode 100644 megatron/core/models/bert/bert_lm_head.py create mode 100644 megatron/core/models/bert/bert_model.py create mode 100644 megatron/core/models/bert/pooler.py create mode 100644 megatron/core/models/common/__init__.py create mode 100644 megatron/core/models/common/embeddings/__init__.py create mode 100644 megatron/core/models/common/embeddings/language_model_embedding.py create mode 100644 megatron/core/models/common/embeddings/rotary_pos_embedding.py create mode 100644 megatron/core/models/common/language_module/__init__.py create mode 100644 megatron/core/models/common/language_module/language_module.py create mode 100644 megatron/core/models/common/vision_module/__init__.py create mode 100644 megatron/core/models/common/vision_module/vision_module.py create mode 100644 megatron/core/models/gpt/__init__.py create mode 100644 megatron/core/models/gpt/gpt_layer_specs.py create mode 100644 megatron/core/models/gpt/gpt_model.py create mode 100644 megatron/core/models/mamba/__init__.py create mode 100644 megatron/core/models/mamba/mamba_layer_specs.py create mode 100644 megatron/core/models/mamba/mamba_model.py create mode 100644 megatron/core/models/multimodal/__init__.py create mode 100644 megatron/core/models/multimodal/llava_model.py create mode 100644 megatron/core/models/retro/__init__.py create mode 100644 megatron/core/models/retro/base_attention.py create mode 100644 megatron/core/models/retro/config.py create mode 100644 megatron/core/models/retro/decoder_attention.py create mode 100644 megatron/core/models/retro/decoder_spec.py create mode 100644 megatron/core/models/retro/encoder_attention.py create mode 100644 megatron/core/models/retro/encoder_spec.py create mode 100644 megatron/core/models/retro/model.py create mode 100644 megatron/core/models/retro/utils.py create mode 100644 megatron/core/models/vision/__init__.py create mode 100644 megatron/core/models/vision/clip_vit_model.py create mode 100644 megatron/core/models/vision/multimodal_projector.py create mode 100644 megatron/core/models/vision/vit_layer_specs.py create mode 100644 megatron/core/num_microbatches_calculator.py create mode 100644 megatron/core/optimizer/__init__.py create mode 100644 megatron/core/optimizer/clip_grads.py create mode 100644 megatron/core/optimizer/distrib_optimizer.py create mode 100644 megatron/core/optimizer/grad_scaler.py create mode 100644 megatron/core/optimizer/optimizer.py create mode 100644 megatron/core/optimizer/optimizer_config.py create mode 100644 megatron/core/package_info.py create mode 100644 megatron/core/packed_seq_params.py create mode 100644 megatron/core/parallel_state.py create mode 100644 megatron/core/pipeline_parallel/__init__.py create mode 100644 megatron/core/pipeline_parallel/p2p_communication.py create mode 100644 megatron/core/pipeline_parallel/schedules.py create mode 100644 megatron/core/requirements.txt create mode 100644 megatron/core/ssm/__init__.py create mode 100644 megatron/core/ssm/mamba_block.py create mode 100644 megatron/core/ssm/mamba_hybrid_layer_allocation.py create mode 100644 megatron/core/ssm/mamba_layer.py create mode 100644 megatron/core/ssm/mamba_mixer.py create mode 100644 megatron/core/ssm/triton_cache_manager.py create mode 100644 megatron/core/tensor_parallel/__init__.py create mode 100644 megatron/core/tensor_parallel/cross_entropy.py create mode 100644 megatron/core/tensor_parallel/data.py create mode 100644 megatron/core/tensor_parallel/layers.py create mode 100644 megatron/core/tensor_parallel/mappings.py create mode 100644 megatron/core/tensor_parallel/random.py create mode 100644 megatron/core/tensor_parallel/utils.py create mode 100644 megatron/core/timers.py create mode 100644 megatron/core/transformer/__init__.py create mode 100644 megatron/core/transformer/attention.py create mode 100644 megatron/core/transformer/custom_layers/__init__.py create mode 100644 megatron/core/transformer/custom_layers/transformer_engine.py create mode 100644 megatron/core/transformer/dot_product_attention.py create mode 100644 megatron/core/transformer/enums.py create mode 100644 megatron/core/transformer/identity_op.py create mode 100644 megatron/core/transformer/mlp.py create mode 100644 megatron/core/transformer/module.py create mode 100644 megatron/core/transformer/moe/README.md create mode 100644 megatron/core/transformer/moe/__init__.py create mode 100644 megatron/core/transformer/moe/experts.py create mode 100644 megatron/core/transformer/moe/grouped_gemm_util.py create mode 100644 megatron/core/transformer/moe/moe_layer.py create mode 100644 megatron/core/transformer/moe/moe_utils.py create mode 100644 megatron/core/transformer/moe/router.py create mode 100644 megatron/core/transformer/moe/token_dispatcher.py create mode 100644 megatron/core/transformer/spec_utils.py create mode 100644 megatron/core/transformer/torch_layer_norm.py create mode 100644 megatron/core/transformer/transformer_block.py create mode 100644 megatron/core/transformer/transformer_config.py create mode 100644 megatron/core/transformer/transformer_layer.py create mode 100644 megatron/core/transformer/utils.py create mode 100644 megatron/core/utils.py create mode 100644 megatron/inference/__init__.py create mode 100644 megatron/inference/arguments.py create mode 100644 megatron/inference/checkpointing.py create mode 100644 megatron/inference/gpt/__init__.py create mode 100644 megatron/inference/gpt/model_provider.py create mode 100644 megatron/inference/static/index.html create mode 100644 megatron/inference/text_generation/__init__.py create mode 100644 megatron/inference/text_generation/api.py create mode 100644 megatron/inference/text_generation/beam_utils.py create mode 100644 megatron/inference/text_generation/communication.py create mode 100644 megatron/inference/text_generation/forward_step.py create mode 100644 megatron/inference/text_generation/generation.py create mode 100644 megatron/inference/text_generation/sampling.py create mode 100644 megatron/inference/text_generation/tokenization.py create mode 100644 megatron/inference/text_generation_server.py create mode 100644 megatron/legacy/data/__init__.py create mode 100644 megatron/legacy/data/autoaugment.py create mode 100644 megatron/legacy/data/biencoder_dataset_utils.py create mode 100644 megatron/legacy/data/data_samplers.py create mode 100644 megatron/legacy/data/dataset_utils.py create mode 100644 megatron/legacy/data/ict_dataset.py create mode 100644 megatron/legacy/data/image_folder.py create mode 100644 megatron/legacy/data/multimodal_dataset.py create mode 100644 megatron/legacy/data/orqa_wiki_dataset.py create mode 100644 megatron/legacy/data/realm_dataset_utils.py create mode 100644 megatron/legacy/data/realm_index.py create mode 100644 megatron/legacy/data/vit_dataset.py create mode 100644 megatron/legacy/fp16_deprecated/loss_scaler.py create mode 100644 megatron/legacy/fused_kernels/__init__.py create mode 100644 megatron/legacy/fused_kernels/compat.h create mode 100644 megatron/legacy/fused_kernels/tests/__init__.py create mode 100644 megatron/legacy/fused_kernels/tests/test_fused_kernels.py create mode 100644 megatron/legacy/fused_kernels/type_shim.h create mode 100644 megatron/legacy/indexer.py create mode 100644 megatron/legacy/model/__init__.py create mode 100644 megatron/legacy/model/bert_model.py create mode 100644 megatron/legacy/model/biencoder_model.py create mode 100644 megatron/legacy/model/classification.py create mode 100644 megatron/legacy/model/enums.py create mode 100644 megatron/legacy/model/fused_bias_gelu.py create mode 100644 megatron/legacy/model/fused_layer_norm.py create mode 100644 megatron/legacy/model/fused_softmax.py create mode 100644 megatron/legacy/model/gpt_model.py create mode 100644 megatron/legacy/model/language_model.py create mode 100644 megatron/legacy/model/module.py create mode 100644 megatron/legacy/model/multiple_choice.py create mode 100644 megatron/legacy/model/realm_model.py create mode 100644 megatron/legacy/model/rms_norm.py create mode 100644 megatron/legacy/model/t5_model.py create mode 100644 megatron/legacy/model/transformer.py create mode 100644 megatron/legacy/model/utils.py create mode 100644 megatron/legacy/model/vision/classification.py create mode 100644 megatron/legacy/model/vision/dino.py create mode 100644 megatron/legacy/model/vision/esvit_swin_backbone.py create mode 100644 megatron/legacy/model/vision/inpainting.py create mode 100644 megatron/legacy/model/vision/knn_monitor.py create mode 100644 megatron/legacy/model/vision/mit_backbone.py create mode 100644 megatron/legacy/model/vision/swin_backbone.py create mode 100644 megatron/legacy/model/vision/utils.py create mode 100644 megatron/legacy/model/vision/vit_backbone.py create mode 100644 megatron/legacy/mpu/tests/__init__.py create mode 100644 megatron/legacy/mpu/tests/commons.py create mode 100644 megatron/legacy/mpu/tests/test_cross_entropy.py create mode 100644 megatron/legacy/mpu/tests/test_data.py create mode 100644 megatron/legacy/mpu/tests/test_initialize.py create mode 100644 megatron/legacy/mpu/tests/test_layers.py create mode 100644 megatron/legacy/mpu/tests/test_random.py create mode 100644 megatron/training/__init__.py create mode 100644 megatron/training/activations.py create mode 100644 megatron/training/arguments.py create mode 100644 megatron/training/async_utils.py create mode 100644 megatron/training/checkpointing.py create mode 100644 megatron/training/dist_signal_handler.py create mode 100644 megatron/training/global_vars.py create mode 100644 megatron/training/initialize.py create mode 100644 megatron/training/log_handler.py create mode 100644 megatron/training/one_logger_utils.py create mode 100644 megatron/training/optimizer_param_scheduler.py create mode 100644 megatron/training/theoretical_memory_usage.py create mode 100644 megatron/training/tokenizer/__init__.py create mode 100644 megatron/training/tokenizer/bert_tokenization.py create mode 100644 megatron/training/tokenizer/gpt2_tokenization.py create mode 100644 megatron/training/tokenizer/tokenizer.py create mode 100644 megatron/training/training.py create mode 100644 megatron/training/utils.py create mode 100644 megatron/training/yaml_arguments.py create mode 100644 pretrain_bert.py create mode 100644 pretrain_gpt.py create mode 100644 pretrain_ict.py create mode 100644 pretrain_mamba.py create mode 100644 pretrain_retro.py create mode 100644 pretrain_t5.py create mode 100644 pretrain_vision_classify.py create mode 100644 pretrain_vision_dino.py create mode 100644 pretrain_vision_inpaint.py create mode 100644 pretrain_vlm.py create mode 100644 pyproject.toml create mode 100644 setup.py create mode 100644 tasks/data_utils.py create mode 100644 tasks/ensemble_classifier.py create mode 100644 tasks/eval_utils.py create mode 100644 tasks/finetune_utils.py create mode 100644 tasks/glue/data.py create mode 100644 tasks/glue/finetune.py create mode 100644 tasks/glue/mnli.py create mode 100644 tasks/glue/qqp.py create mode 100644 tasks/main.py create mode 100644 tasks/msdp/README.md create mode 100644 tasks/msdp/evaluate.py create mode 100644 tasks/msdp/main.py create mode 100644 tasks/msdp/metrics.py create mode 100644 tasks/msdp/preprocessing.py create mode 100644 tasks/msdp/prompt.py create mode 100644 tasks/orqa/README.md create mode 100644 tasks/orqa/evaluate_orqa.py create mode 100644 tasks/orqa/evaluate_utils.py create mode 100644 tasks/orqa/supervised/data.py create mode 100644 tasks/orqa/supervised/eval_utils.py create mode 100644 tasks/orqa/supervised/finetune.py create mode 100644 tasks/orqa/unsupervised/nq.py create mode 100644 tasks/orqa/unsupervised/qa_utils.py create mode 100644 tasks/orqa/unsupervised/tokenizers.py create mode 100644 tasks/race/data.py create mode 100644 tasks/race/finetune.py create mode 100644 tasks/vision/classification/classification.py create mode 100644 tasks/vision/classification/eval_utils.py create mode 100644 tasks/vision/finetune_utils.py create mode 100644 tasks/vision/main.py create mode 100644 tasks/vision/segmentation/cityscapes.py create mode 100644 tasks/vision/segmentation/data.py create mode 100644 tasks/vision/segmentation/finetune_segformer.py create mode 100644 tasks/vision/segmentation/finetune_setr.py create mode 100644 tasks/vision/segmentation/metrics.py create mode 100644 tasks/vision/segmentation/seg_heads.py create mode 100644 tasks/vision/segmentation/seg_models.py create mode 100644 tasks/vision/segmentation/transforms.py create mode 100644 tasks/vision/segmentation/utils.py create mode 100644 tasks/zeroshot_gpt/datasets.py create mode 100644 tasks/zeroshot_gpt/detokenizer.py create mode 100644 tasks/zeroshot_gpt/evaluate.py create mode 100644 tests/__init__.py create mode 100644 tests/functional_tests/__init__.py create mode 100644 tests/functional_tests/jet_recipes/MR-bert.yaml create mode 100644 tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml create mode 100644 tests/functional_tests/jet_recipes/MR-gpt.yaml create mode 100644 tests/functional_tests/jet_recipes/MR-multimodal.yaml create mode 100644 tests/functional_tests/jet_recipes/MR-t5.yaml create mode 100644 tests/functional_tests/jet_recipes/build-pyt.yaml create mode 100644 tests/functional_tests/jet_recipes/local-generator.py create mode 100644 tests/functional_tests/jet_recipes/monthly-t5.yaml create mode 100644 tests/functional_tests/jet_recipes/nightly-bert.yaml create mode 100644 tests/functional_tests/jet_recipes/nightly-gpt.yaml create mode 100644 tests/functional_tests/jet_recipes/weekly-gpt.yaml create mode 100644 tests/functional_tests/python_test_utils/__init__.py create mode 100644 tests/functional_tests/python_test_utils/common.py create mode 100644 tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py create mode 100644 tests/functional_tests/python_test_utils/jet_test_pipeline.py create mode 100644 tests/functional_tests/python_test_utils/multitest_ci_pipeline.py create mode 100644 tests/functional_tests/python_test_utils/test_ci_pipeline.py create mode 100644 tests/functional_tests/python_test_utils/test_fp8_ci_pipeline.py create mode 100644 tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py create mode 100644 tests/functional_tests/shell_test_utils/_run_local_training.sh create mode 100644 tests/functional_tests/shell_test_utils/restart_jet_log_jobs.sh create mode 100644 tests/functional_tests/shell_test_utils/run_release_record.sh create mode 100644 tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json create mode 100644 tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json create mode 100644 tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json create mode 100644 tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2.json create mode 100644 tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1.json create mode 100644 tests/functional_tests/test_results/jet/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/bert_mr_resume_tp1_pp2dgx_a100_1N8G_.json create mode 100644 tests/functional_tests/test_results/jet/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/bert_mr_tp2_pp2_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_4experts2parallel.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_4experts2parallel.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_resume_dgx_a100_1N8G_tp1_pp2.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/gpt3_mr_tp2_pp2_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_results/jet/t5_220m_mr_mcore_te_tp1_pp1_vp1_calculate_per_token_loss_dgx_a100_1N8G.json create mode 100644 tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh create mode 100644 tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh create mode 100644 tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh create mode 100644 tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh create mode 100644 tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh create mode 100644 tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh create mode 100644 tests/unit_tests/__init__.py create mode 100644 tests/unit_tests/data/__init__.py create mode 100644 tests/unit_tests/data/test_bin_reader.py create mode 100644 tests/unit_tests/data/test_builder.py create mode 100644 tests/unit_tests/data/test_gpt_dataset.py create mode 100644 tests/unit_tests/data/test_multimodal_dataset.py create mode 100644 tests/unit_tests/data/test_preprocess_data.py create mode 100644 tests/unit_tests/data/test_preprocess_mmdata.py create mode 100644 tests/unit_tests/dist_checkpointing/__init__.py create mode 100644 tests/unit_tests/dist_checkpointing/conftest.py create mode 100644 tests/unit_tests/dist_checkpointing/models/__init__.py create mode 100644 tests/unit_tests/dist_checkpointing/models/common.py create mode 100644 tests/unit_tests/dist_checkpointing/models/test_bert_model.py create mode 100644 tests/unit_tests/dist_checkpointing/models/test_gpt_model.py create mode 100644 tests/unit_tests/dist_checkpointing/models/test_grouped_mlp.py create mode 100644 tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py create mode 100644 tests/unit_tests/dist_checkpointing/models/test_retro_model.py create mode 100644 tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py create mode 100644 tests/unit_tests/dist_checkpointing/models/test_t5_model.py create mode 100644 tests/unit_tests/dist_checkpointing/test_async_save.py create mode 100644 tests/unit_tests/dist_checkpointing/test_cached_metadata.py create mode 100644 tests/unit_tests/dist_checkpointing/test_flattened_resharding.py create mode 100644 tests/unit_tests/dist_checkpointing/test_fully_parallel.py create mode 100644 tests/unit_tests/dist_checkpointing/test_mapping.py create mode 100644 tests/unit_tests/dist_checkpointing/test_optimizer.py create mode 100644 tests/unit_tests/dist_checkpointing/test_serialization.py create mode 100644 tests/unit_tests/distributed/test_param_and_grad_buffer.py create mode 100644 tests/unit_tests/fusions/test_torch_softmax.py create mode 100644 tests/unit_tests/inference/__init__.py create mode 100644 tests/unit_tests/inference/engines/__init__.py create mode 100644 tests/unit_tests/inference/engines/test_mcore_engine.py create mode 100644 tests/unit_tests/inference/model_inference_wrappers/__init__.py create mode 100644 tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py create mode 100644 tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py create mode 100644 tests/unit_tests/inference/test_common_inference_params.py create mode 100644 tests/unit_tests/inference/test_inference_utils.py create mode 100644 tests/unit_tests/inference/test_modelopt_gpt_model.py create mode 100644 tests/unit_tests/inference/test_scheduler.py create mode 100644 tests/unit_tests/inference/text_generation_controllers/__init__.py create mode 100644 tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py create mode 100644 tests/unit_tests/models/__init__.py create mode 100644 tests/unit_tests/models/test_base_embedding.py create mode 100644 tests/unit_tests/models/test_bert_model.py create mode 100644 tests/unit_tests/models/test_clip_vit_model.py create mode 100644 tests/unit_tests/models/test_gpt_model.py create mode 100644 tests/unit_tests/models/test_llava_model.py create mode 100644 tests/unit_tests/models/test_multimodal_projector.py create mode 100644 tests/unit_tests/models/test_t5_model.py create mode 100644 tests/unit_tests/pipeline_parallel/__init__.py create mode 100644 tests/unit_tests/pipeline_parallel/test_schedules.py create mode 100644 tests/unit_tests/tensor_parallel/__init__.py create mode 100644 tests/unit_tests/tensor_parallel/test_cross_entropy.py create mode 100644 tests/unit_tests/tensor_parallel/test_data.py create mode 100644 tests/unit_tests/tensor_parallel/test_initialization.py create mode 100644 tests/unit_tests/tensor_parallel/test_layers.py create mode 100644 tests/unit_tests/tensor_parallel/test_mappings.py create mode 100644 tests/unit_tests/tensor_parallel/test_random.py create mode 100644 tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py create mode 100644 tests/unit_tests/test_basic.py create mode 100644 tests/unit_tests/test_imports.py create mode 100644 tests/unit_tests/test_local_multi_tensor_fns.py create mode 100644 tests/unit_tests/test_num_microbatches_calculator.py create mode 100644 tests/unit_tests/test_optimizer.py create mode 100644 tests/unit_tests/test_parallel_state.py create mode 100644 tests/unit_tests/test_training.py create mode 100644 tests/unit_tests/test_utilities.py create mode 100644 tests/unit_tests/test_utils.py create mode 100644 tests/unit_tests/transformer/__init__.py create mode 100644 tests/unit_tests/transformer/moe/__init__.py create mode 100644 tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py create mode 100644 tests/unit_tests/transformer/moe/test_aux_loss.py create mode 100644 tests/unit_tests/transformer/moe/test_grouped_mlp.py create mode 100644 tests/unit_tests/transformer/moe/test_routers.py create mode 100644 tests/unit_tests/transformer/moe/test_sequential_mlp.py create mode 100644 tests/unit_tests/transformer/moe/test_token_dispatcher.py create mode 100644 tests/unit_tests/transformer/test_attention.py create mode 100644 tests/unit_tests/transformer/test_attention_packed_seq.py create mode 100644 tests/unit_tests/transformer/test_core_attention.py create mode 100644 tests/unit_tests/transformer/test_mlp.py create mode 100644 tests/unit_tests/transformer/test_module.py create mode 100644 tests/unit_tests/transformer/test_retro_attention.py create mode 100644 tests/unit_tests/transformer/test_rope.py create mode 100644 tests/unit_tests/transformer/test_spec_customization.py create mode 100644 tests/unit_tests/transformer/test_transformer_block.py create mode 100644 tests/unit_tests/transformer/test_transformer_layer.py create mode 100644 tools/autoformat.sh create mode 100644 tools/bert_embedding/__init__.py create mode 100644 tools/bert_embedding/dataset.py create mode 100644 tools/bert_embedding/embed.py create mode 100644 tools/bert_embedding/external_libs.py create mode 100644 tools/bert_embedding/huggingface.py create mode 100644 tools/checkpoint/convert.py create mode 100644 tools/checkpoint/hybrid_conversion.py create mode 100644 tools/checkpoint/loader_llama_mistral.py create mode 100644 tools/checkpoint/loader_mcore.py create mode 100644 tools/checkpoint/loader_megatron.py create mode 100644 tools/checkpoint/loader_mixtral_hf.py create mode 100644 tools/checkpoint/saver_mcore.py create mode 100644 tools/checkpoint/saver_megatron.py create mode 100644 tools/checkpoint/setter.py create mode 100644 tools/checkpoint/utils.py create mode 100644 tools/linter.py create mode 100644 tools/merge_datasets.py create mode 100644 tools/openwebtext/README.md create mode 100644 tools/openwebtext/add_id.py create mode 100644 tools/openwebtext/blacklist_urls.py create mode 100644 tools/openwebtext/cleanup_dataset.py create mode 100644 tools/openwebtext/cleanup_fix_dataset.py create mode 100644 tools/openwebtext/filter_ngrams.py create mode 100644 tools/openwebtext/find_duplicates.py create mode 100644 tools/openwebtext/group_duplicate_url.py create mode 100644 tools/openwebtext/merge_jsons.py create mode 100644 tools/openwebtext/remove_group_duplicates.py create mode 100644 tools/preprocess_data.py create mode 100644 tools/preprocess_data_nmt.py create mode 100644 tools/preprocess_mmdata.py create mode 100644 tools/report_theoretical_memory.py create mode 100644 tools/retro/README.md create mode 100644 tools/retro/build_db.md create mode 100644 tools/retro/cli/__init__.py create mode 100644 tools/retro/cli/__main__.py create mode 100644 tools/retro/cli/cli.py create mode 100644 tools/retro/config_utils.py create mode 100644 tools/retro/docker/Dockerfile create mode 100644 tools/retro/preprocess_data.py create mode 100644 tools/retro/sft/README.md create mode 100644 tools/retro/sft/dataset_conv.py create mode 100644 tools/retro/sft/open_inst.sh create mode 100644 tools/retro/sft/sft_retro.py create mode 100644 tools/retro/sft/sft_retro_lm.sh create mode 100644 tools/retro/text_generation/evaluate.py create mode 100644 tools/retro/text_generation/metrics.py create mode 100644 tools/retro/text_generation/retro_api.py create mode 100644 tools/retro/text_generation/retro_generate.sh create mode 100644 tools/retro/text_generation/retro_generation.py create mode 100644 tools/retro/text_generation/retro_text_generation.py create mode 100644 tools/run_mamba_text_generation_server.py create mode 100644 tools/run_text_generation_server.py create mode 100644 tools/run_vlm_text_generation.py create mode 100644 tools/text_generation_cli.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..29de6ff --- /dev/null +++ b/.coveragerc @@ -0,0 +1,5 @@ +[html] +directory = coverage + +[run] +data_file = .coverage_$LOCAL_RANK diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md new file mode 100644 index 0000000..b639acd --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug.md @@ -0,0 +1,32 @@ +--- +name: BUG +about: Report a bug that needs attention +title: "[BUG]" +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Stack trace/logs** +If applicable, add the stack trace or logs from the time of the error. + +**Environment (please complete the following information):** + - Megatron-LM commit ID + - PyTorch version + - CUDA version + - NCCL version + +**Proposed fix** +If you have a proposal for how to fix the issue state it here or link to a PR. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md new file mode 100644 index 0000000..076f719 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/enhancement.md @@ -0,0 +1,23 @@ +--- +name: ENHANCEMENT +about: Suggest an idea to improve this project +title: "[ENHANCEMENT]" +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Proposed implementation** +If you have a proposed implementation for the feature state it here or link to a PR. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 0000000..b3d89a0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,12 @@ +--- +name: QUESTION +about: Ask a question about Megatron-LM that is not a bug, regression or enhancement + request +title: "[QUESTION]" +labels: '' +assignees: '' + +--- + +**Your question** +Ask a clear and concise question about Megatron-LM. diff --git a/.github/ISSUE_TEMPLATE/regression.md b/.github/ISSUE_TEMPLATE/regression.md new file mode 100644 index 0000000..10078d2 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/regression.md @@ -0,0 +1,39 @@ +--- +name: REGRESSION +about: Report a regression in speed or accuracy due to a Megatron-LM update +title: "[REGRESSION]" +labels: '' +assignees: '' + +--- + +**Describe the regression** +A clear and concise description of what the regression is. + +**To Reproduce** +Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. + +**Previous performance** +What speed or accuracy did you previously see. + +**New performance** +What speed or accuracy do you see after the update. + +**Stack trace/logs** +If applicable, add the stack trace or logs related to the regression. + +**Environment (please complete the following information):** + - Previous Megatron-LM commit ID + - New Megatron-LM commit ID + - Previous PyTorch version + - New PyTorch version + - Previous CUDA version + - New CUDA version + - Previous NCCL version + - New NCCL version + +**Proposed fix** +If you have a proposal for how to fix the issue state it here or link to a PR. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 0000000..58ba38e --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,31 @@ +# This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. +# +# You can adjust the behavior by modifying this file. +# For more information, see: +# https://github.com/actions/stale +name: Mark stale issues and pull requests + +on: + schedule: + - cron: '15 18 * * *' + +jobs: + stale: + + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + + steps: + - uses: actions/stale@v5 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + days-before-stale: 60 + stale-issue-message: 'Marking as stale. No activity in 60 days.' + stale-pr-message: 'Marking as stale. No activity in 60 days.' + stale-issue-label: 'stale' + stale-pr-label: 'stale' + remove-stale-when-updated: true + operations-per-run: 1000 + days-before-close: -1 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..900ab51 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +__pycache__ +*.so +build +.coverage_* +*.egg-info +*~ +slurm* +logs +.vscode +local/ \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..2c0d92e --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,357 @@ +workflow: + rules: + - if: ($CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests and nightly/) || ($CI_PIPELINE_SOURCE == "schedule") + variables: + JET_CUSTOM_FILTER: "type == 'build' or 'mr' in spec.scope or 'nightly' in spec.scope" + - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/ + variables: + JET_CUSTOM_FILTER: "type == 'build' or 'mr' in spec.scope" + # always run MR pipelines + - if: $CI_PIPELINE_SOURCE == "merge_request_event" + # always run web pipelines + - if: $CI_PIPELINE_SOURCE == "web" + # do not run branch pipelines if open MR exists + - if: $CI_COMMIT_BRANCH && $CI_OPEN_MERGE_REQUESTS + when: never + # run branch pipeline if no open MR and on main + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + + +stages: + - build + - unit_tests + - functional_tests + - publish + +variables: + JET_CUSTOM_FILTER: + description: | + Selects what functional tests to run. For mr tests: "type == 'build' or 'mr' in spec.scope". For nightly tests: "type == 'build' or 'nightly' in spec.scope" + value: "" + TIME_LIMIT: "10:00" # Default time limit for all jobs + PUBLISH: + value: "no" + options: + - "yes" + - "no" + description: Build and publish a wheel to PyPi + SLURM_CLUSTER: + value: "dgxa100_dracooci" + options: + - "dgxa100_dracooci" + - "dgxh100_eos" + description: '"dgxa100_dracooci" for OCI-IAD, "dgxh100_eos" for EOS' + CI_MCORE_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci + CI_NEMO_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/nemo_ci + LINTING_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_linting + +metadata: + image: python:3.10 + stage: .pre + tags: + - os/linux + script: + - env + - | + if [[ $SLURM_CLUSTER == dgxh100_eos ]]; then + JET_CI_BRANCH=mcore/eos; + elif [[ $SLURM_CLUSTER == dgxa100_dracooci ]]; then + JET_CI_BRANCH=mcore/draco-oci; + else + echo "Unsupported value of SLURM_CLUSTER=$SLURM_CLUSTER"; + exit 1; + fi + - echo "JET_CI_BRANCH=$JET_CI_BRANCH" | tee -a build.env + artifacts: + reports: + dotenv: build.env + interruptible: true + +ppp_capacity_statistics: + tags: [mcore-ssh-agent] + stage: .pre + script: + - | + set -x + + ALL_USER=$(sshare -aP | grep coreai_dlalgo_mcore | tail -n +2 | awk -F '|' '{print $2}' | tr '\n' ',') + + # Get the current year, month, and day + YEAR=$(date +%Y) + MONTH=$(date +%m) + DAY=$([[ $(date +%-d) -le 15 ]] && echo "01" || echo "15") + TIMESTAMP="${YEAR}-${MONTH}-${DAY}T00:00:01" + + CLUSTER_ID=$(curl "${RESOURCE_ENDPOINT}/api/v1/clusters" \ + -H "accept: application/json, text/plain, */*" \ + -H "accept-language: en-US,en;q=0.9" \ + -H "authorization: Bearer $CSRG_API_KEY" | jq '.[] | select(.name == "draco-oci-iad") | .id' | tr -d '"') + + INITIATIVE_ITEM_ID=$(curl "${RESOURCE_ENDPOINT}/api/v1/initiative-items" \ + -H "accept: application/json, text/plain, */*" \ + -H "accept-language: en-US,en;q=0.9" \ + -H "authorization: Bearer $CSRG_API_KEY" | jq '.[] | select(.name == "coreai_dlalgo_mcore") | .id' | tr -d '"') + + QUOTA=$(curl "${RESOURCE_ENDPOINT}/api/v1/capacity-requests" \ + -H "accept: application/json, text/plain, */*" \ + -H "accept-language: en-US,en;q=0.9" \ + -H "authorization: Bearer $CSRG_API_KEY" | jq --arg CLUSTER_ID $CLUSTER_ID --arg INITIATIVE_ITEM_ID $INITIATIVE_ITEM_ID '[.[] | select(.clusterId == $CLUSTER_ID and .initiativeItemId == $INITIATIVE_ITEM_ID)] | to_entries | [last] | .[0].value.quantity') + + USED_CAPA=$(sacct \ + -u ${ALL_USER} \ + --partition batch_block1,batch_block3,batch_block4 \ + --truncate \ + -A coreai_dlalgo_mcore \ + -S ${TIMESTAMP} \ + -X \ + --format JobID,JobName%20,Partition,AllocNodes,ElapsedRaw \ + -p \ + -n \ + | awk -F "|" '{{sum+=$4*$5}} END {{print sum*8/3600}}') + TOTAL_CAPA=$(( $QUOTA*24*30 )) + + USAGE=$(echo "$USED_CAPA $TOTAL_CAPA" | awk '{print (1 - $1/$2)*100}')% + + echo "Usage left: $USAGE" + echo "Disclaimer: Please be careful with this number. Usage does not imply + what we are guaranteed to get a slot, SLURM scheduling is more complicated + than that. The number is rather a proxy to the FairShare that determines + our job-scheduling-priority. + + Most important take-away of this number is to get a sense how much much + we are eating up our budget such that we can discuss this with capacity planning. + " + +build_image: + tags: + - mcore-docker-node + image: docker:26.1.4-dind + needs: [] # May start ASAP + stage: build + timeout: 30m + parallel: + matrix: + - IMAGE: CI_MCORE_IMAGE + FILE: Dockerfile.ci + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3 + - IMAGE: CI_NEMO_IMAGE + FILE: Dockerfile.ci + BASE_IMAGE: nvcr.io/nvidian/nemo:nightly + - IMAGE: LINTING_IMAGE + FILE: Dockerfile.linting + BASE_IMAGE: python:3.10 + before_script: + - echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin + - echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin + script: + - | + set -x + eval "IMAGE=\$$IMAGE" + + OLD_IMAGES=$(docker image ls --format "{{.ID}} {{.Repository}}:{{.Tag}}" \ + | grep -v 'nvcr.io/nvidia/pytorch:24.01-py3' \ + | grep -v 'gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci:buildcache' \ + | grep -v 'gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_nemo:buildcache' \ + | grep -v 'gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_linting:buildcache' \ + | grep -v 'nvcr.io/nvidian/nemo:nightly' \ + | grep -v 'python:3.10' | awk '{ print $1 }' + ) + docker rmi $OLD_IMAGES || true + + if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then + ADDITIONAL_PARAMS="--pull" + fi + + docker build \ + -f $FILE \ + -t ${IMAGE}:${CI_PIPELINE_ID} \ + --cache-to type=inline \ + --cache-from type=registry,ref=${IMAGE}:buildcache \ + --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \ + ${ADDITIONAL_PARAMS} . + + docker push ${IMAGE}:${CI_PIPELINE_ID} + + if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then + docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:buildcache + docker push ${IMAGE}:buildcache + fi + + if [[ $CI_COMMIT_BRANCH == core_r* ]]; then + docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:v${CI_COMMIT_BRANCH#core_r}-${CI_PIPELINE_ID} + docker push ${IMAGE}:v${CI_COMMIT_BRANCH#core_r}-${CI_PIPELINE_ID} + fi + interruptible: true + +.unit_test_common: + image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} + stage: unit_tests + needs: [build_image] + tags: + - 8xL40S + variables: + MOE_GROUPED_GEMM: 0 # Set to 1 to enable grouped gemm for MoE + interruptible: true + retry: + max: 2 + when: job_execution_timeout + +unit_tests: + extends: [.unit_test_common] + script: + - torchrun --nproc_per_node=8 -m pytest -x -v -s --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail tests/unit_tests + coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/' + artifacts: + paths: + - coverage + expire_in: 30 days + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + +unit_tests-data: + extends: [.unit_test_common] + script: + - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/data + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' + when: never + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + when: never + - when: always + +unit_tests-dist-checkpointing: + extends: [.unit_test_common] + script: + - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/dist_checkpointing + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' + when: never + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + when: never + - when: always + +unit_tests-fusions: + extends: [.unit_test_common] + script: + - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/fusions + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' + when: never + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + when: never + - when: always + +unit_tests-inference: + extends: [.unit_test_common] + script: + - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/inference + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' + when: never + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + when: never + - when: always + +unit_tests-models: + extends: [.unit_test_common] + script: + - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/models + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' + when: never + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + when: never + - when: always + +unit_tests-pipeline-parallel: + extends: [.unit_test_common] + script: + - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/pipeline_parallel + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' + when: never + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + when: never + - when: always + +unit_tests-tensor-parallel: + extends: [.unit_test_common] + script: + - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/tensor_parallel + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' + when: never + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + when: never + - when: always + +unit_tests-transformer: + extends: [.unit_test_common] + script: + - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/transformer + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' + when: never + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + when: never + - when: always + +unit_tests-top-py: + extends: [.unit_test_common] + script: + - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/*.py + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' + when: never + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + when: never + - when: always + +docs_build_test: + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1 + stage: unit_tests + tags: + - os/linux + script: + - cd .. + - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/nemo-megatron-core-tme/documentation.git + - mv megatron-lm/ documentation/ + - cd documentation/ + - ./repo docs + allow_failure: true + except: + - main + interruptible: true + +formatting: + image: ${LINTING_IMAGE}:${CI_PIPELINE_ID} + tags: + - os/linux + stage: unit_tests + before_script: + - git fetch origin main + script: + - CHECK_ONLY=true bash tools/autoformat.sh + + rules: + - when: always + interruptible: true + +include: + - jet-tests.yml + +publish-wheel: + image: quay.io/pypa/manylinux_2_28_x86_64 + stage: publish + rules: + - if: $CI_COMMIT_BRANCH =~ /^core_r/ && $PUBLISH == "yes" + when: manual + - when: never + before_script: + - pip install twine + script: + - /opt/python/cp310-cp310/bin/python -m build + - /opt/python/cp311-cp311/bin/python -m build + - auditwheel repair dist/*.whl + - twine upload --repository pypi wheelhouse/* \ No newline at end of file diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000..150ae00 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,8 @@ +[MCORE][3] +megatron/core/ @shanmugamr @jcasper @eharper @terryk @okoenig + +[TESTS] +tests/ @shanmugamr @terryk @okoenig + +[MODELOPT] +examples/inference/quantization @chenhany @kmorabia diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..6152276 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,66 @@ +# Contributing to Megatron-LM + +This document outlines the processes and policies for issues and pull requests by non-NVIDIA contributors to the Megatron-LM github repository. + +Everyone is welcome to contribute to the project but development of Megatron-LM continues internally at NVIDIA. When contributing it important to ensure that changes are in line with the project direction. Small changes to fix bugs are welcomed and appreciated. If proposing large architectural changes or changes for stylistic reasons open an issue first so we can discuss it. + +PRs will first be pulled into NVIDIA's internal Megatron-LM repo and then pushed back out to the open github repo with proper credit given to the committers. + +## Issue policy + +Please do file any bugs you find, keeping the following in mind: + +- If filing a bug, i.e. you have found something that doesn't work as expected, use the BUG template. +- If you've found a regression in speed or accuracy use the REGRESSION template. +- If you are requesting a new feature or modification of an existing feature use the ENHANCEMENT template. +- If opening an issue to ask a question no template is needed but please make your question as clear and concise as possible. +- One issue per bug. Putting multiple things in the same issue makes both discussion and completion unnecessarily complicated. +- Your bug is mostly likely to get attention from the development team quickly if we can easily reproduce it. +- Use proper spelling, grammar, and punctuation. +- Write in an authoritative and technical tone. + +## Code submission policy + +Here are some dos & don'ts to try and stick to: + +### Do: + +- Format new code in a style that is consistent with the file being changed. Megatron-LM doesn't (yet) have a style guide or enforced formatting. +- Split your changes into separate, atomic commits i.e. A commit per feature or fix. +- Make sure your commits are rebased on the master branch. +- Write the commit message subject line in the imperative mood ("Change the default argument for X", not "Changed the default argument for X"). +- Write your commit messages in proper English, with care and punctuation. +- Check the spelling of your code, comments and commit messages. + +### Don't: + +- Submit code that's incompatible with the project licence. +- Touch anything outside the stated scope of the PR. This includes formatting changes to code not relevant to the PR. +- Iterate excessively on your design across multiple commits. +- Include commented-out code. +- Attempt large architectural changes without first opening an issue to discuss. + +## Issue and Pull Request Q&A (Updated Jul 2023) + +### I've submitted an issue and PR. When can I expect to get some feedback? + +Megatron-LM is developed and maintained by a small team of researchers. We will endeavour to read and acknowledge all new issues and PRs within a week. A few rules of thumb: +- Reproducible bugs/regressions and bug/regression fixes are likely to get the attention of maintainers the quickest. +- Issues requesting an enhancement may only recieve acknowlegement that they've been read and may be closed with a "wontfix" label if they're not inline with the project direction. If they are acknowledged and remain open you can assume the maintainers agree they're a desirable feature. +- Support requests, i.e. requests for help running the code, have the lowest priority and will be responded to as maintainer time permits. + +### If my issue or PR isn't getting attention, how long should I wait before pinging one of the project maintainers? + +One week if there is no acknowledgement of the intial request. + +### Who are the project maintainers I should ping? + +The corresponding maintainers at this time are @jaredcasper and @jon-barker. + +### Is there a policy for issues and PRs that haven't been touched in X days? Should they be closed? + +Yes, starting in July 2023 we have a bot that will mark untouched PRs as "stale" after 60 days. + +We have a long backlog of issues and PRs dating back 3.5 years. We are trying to triage these now by working backwards. Older issues we believe may still be relevant may recieve a request to re-test them with the latest code. If there's no response they may be closed. Again, if you they should be re-opened then just respond with a comment to that effect. + +Thank-you! \ No newline at end of file diff --git a/Dockerfile.ci b/Dockerfile.ci new file mode 100644 index 0000000..c3ae746 --- /dev/null +++ b/Dockerfile.ci @@ -0,0 +1,33 @@ +# syntax=docker/dockerfile:experimental + +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME +ENV DEBIAN_FRONTEND=noninteractive + +RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ + /etc/apt/apt.conf.d/docker-clean + +RUN apt-get update && \ + apt-get install -y --no-install-recommends gettext && \ + apt-get clean + +RUN wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ +chmod a+x /usr/local/bin/yq + +RUN pip3 install --no-cache-dir \ + einops \ + flask-restful \ + nltk \ + pytest \ + pytest-cov \ + pytest_mock \ + sentencepiece \ + wrapt \ + git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 \ + zarr \ + tensorstore==0.1.45 + +COPY . /workspace/megatron-lm + +RUN cp -r /workspace/megatron-lm /opt && \ + pip install /opt/megatron-lm diff --git a/Dockerfile.linting b/Dockerfile.linting new file mode 100644 index 0000000..2d5c2e4 --- /dev/null +++ b/Dockerfile.linting @@ -0,0 +1,17 @@ +# syntax=docker/dockerfile:experimental + +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME +ENV DEBIAN_FRONTEND=noninteractive + +RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ + /etc/apt/apt.conf.d/docker-clean + + +RUN pip3 install --no-cache-dir \ + black==24.4.2 \ + isort + +COPY . /opt/megatron-lm + +WORKDIR /opt/megatron-lm \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4782df5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,292 @@ +The following applies to all files unless otherwise noted: + +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-- + +This repository also contains code from Hugging Face Inc., Google Research, +Facebook (from their Fairseq, Dino, and ParlAI projects), Microsoft (from their +Swin-Transformer project), Philip Popien, and the Mamba project (Tri Dao and +Albert Gu). Files from these organizations have notices at the top of each file. +Below are licenses used in those files, as indicated. + + +-------------------------------------------------------------------------------- +-- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, and Mamba code -- + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +------------- LICENSE FOR various code from Facebook -------------- + +MIT License + +Copyright (c) Facebook, Inc. and its affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +------------- LICENSE FOR Mircrosoft Swin transformer code -------------- + +MIT License + +Copyright (c) Microsoft Corporation. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE + + diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..dbb29b0 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include megatron/core/requirements.txt +include megatron/core/README.md \ No newline at end of file diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md new file mode 100644 index 0000000..41d1ccb --- /dev/null +++ b/docs/llama_mistral.md @@ -0,0 +1,397 @@ +# Llama, Mistral and other Llama-like model support in Megatron-LM + +NOTE: Llama-3 and Mistral support in Megatron is currently experimental and we are still evaluting benchmark results to confirm model conversion, training and inference correctness. + +The [Llama-2](https://ai.meta.com/llama/) and [Llama-3](https://llama.meta.com/) family of models are an open-source set of pretrained & finetuned (for chat) models that have achieved strong results across a wide set of benchmarks. At their times of release, both Llama-2 and Llama-3 models achieved among the best results for open-source models, and were competitive with leading closed-source models (see https://arxiv.org/pdf/2307.09288.pdf and https://ai.meta.com/blog/meta-llama-3/). + +Similarly, [Mistral-7b](https://mistral.ai/news/announcing-mistral-7b/) is an open-source model with pretrained and finetuned (for chat) variants that achieve strong benchmark results. + +Architecturally Llama-2, Llama-3 and Mistral-7b are very similar. As such Megatron can support loading checkpoints from all three for inference and finetuning. Converting the checkpoints and loading them is slightly different for each model and is detailed for each below. + +# Llama-2 + +Llama-2 checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of three steps: + +1. Get access to download the checkpoints. +2. Convert the checkpoints from Meta/Huggingface format to Megatron format. +3. Setup arguments for launching the model. + +The following sections detail these steps. The final section lists benchmark result comparisons between: 1) Llama-2 inference code running the Meta-format checkpoints, and 2) Megatron inference code running the converted checkpoints. + +## Contents + * [Download Meta or Huggingface checkpoints](#download-meta-or-huggingface-checkpoints) + * [Convert checkpoint format](#convert-checkpoint-format) + * [Meta format](#meta-format) + * [Huggingface format](#huggingface-format) + * [Launch model](#launch-model) + * [Megatron](#launch-megatron) + * [Meta](#launch-meta) + * [Huggingface](#launch-hf) + * [Benchmark results](#benchmark-results) + +## Download Meta or Huggingface checkpoints + +Users must first apply for access to download the Llama-2 checkpoints either directly from [Meta](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or through [Huggingface](https://huggingface.co/docs/transformers/main/model_doc/llama2) (HF). The checkpoints are available in two formats, Meta's native format (available from both the Meta and HF links), and HF's format (available only from HF). Either format can be converted to Megatron, as detailed next. + +## Convert checkpoint format + +We recommend passing `--dtype bf16` for training or finetuning. Inference can be done in bfloat16 or float16. + +### Meta format + +The Meta format checkpoints are converted to HF format as an intermediate step before converting to Megatron format. The `transformers` package is required, and must have version >=4.31.0 (e.g., `pip install transformers>=4.31.0`). (**Note**: we have specifically tested with versions `4.31.0` and `4.32.0`; your experience may vary with newer versions.) Assuming the downloaded checkpoints are in `$CHECKPOINT_DIR` (with separate sub-directories for 7B, 13B, 70B, etc.), the following example command can be used to convert from Llama-2 format to HF format in bfloat16: + +``` +python tools/checkpoint/convert.py --model-type GPT \ +> --loader llama_mistral \ +> --saver megatron \ +> --checkpoint-type meta \ +> --model-size llama2-7B \ +> --load-dir $LLAMA_META_FORMAT_DIR \ +> --save-dir ${MEGATRON_FORMAT_DIR} \ +> --tokenizer-model ${TOKENIZER_MODEL} \ +> --target-tensor-parallel-size ${TP} \ +> --target-pipeline-parallel-size ${PP} \ +> --bf16 +``` + +Valid values for `--model-size` are `llama2-7B`, `llama2-13B`, and `llama2-70B` (for pretrained-only models), and `llama2-7Bf`, `llama2-13Bf`, and `llama2-70Bf` (for chat-finetuned models). + +### Huggingface format + +The HF checkpoints can be converted to Megatron format by using Megatron's own Llama-2 checkpoint converter for HF format (see script `tools/checkpoint/loader_llama_mistral.py`). One important argument that must be set correctly is the tensor parallel size (`TP`) for each model. The following table shows these values: + +| Model size | Tensor parallel size (`TP`) | +| ---------- | --------------------------- | +| 7B | 1 | +| 13B | 2 | +| 70B | 8 | + +Using these values for `TP`, along with the path to the Llama-2 tokenizer model (automatically downloaded with original checkpoint download; see `${TOKENIZER_MODEL}` below), run the following command from the root of your Megatron source code to convert from HF format to Megatron format: + +``` +$>: python tools/checkpoint/convert.py \ + > --model-type GPT \ + > --loader llama_mistral \ + > --saver megatron \ + > --target-tensor-parallel-size ${TP} \ + > --checkpoint-type hf + > --load-dir ${HF_FORMAT_DIR} \ + > --save-dir ${MEGATRON_FORMAT_DIR} \ + > --tokenizer-model ${TOKENIZER_MODEL} +``` + +After this conversion, we are ready to load the checkpoints into a Megatron GPT model. + +## Launch model + +### Launch Megatron + +If loading for either inference or finetuning, use the following arguments: + +``` +--tensor-model-parallel-size ${TP} \ +--pipeline-model-parallel-size 1 \ +--seq-length 4096 \ +--max-position-embeddings 4096 \ +--tokenizer-type Llama2Tokenizer \ +--tokenizer-model ${TOKENIZER_MODEL} \ +--load ${CHECKPOINT_DIR} \ +--exit-on-missing-checkpoint \ +--use-checkpoint-args \ +--no-load-optim \ +--no-load-rng \ +--untie-embeddings-and-output-weights \ +--use-rotary-position-embeddings \ +--normalization RMSNorm \ +--no-position-embedding \ +--no-masked-softmax-fusion \ +--attention-softmax-in-fp32 +``` + +### Launch Meta + +Meta checkpoints can be launched with: https://github.com/facebookresearch/llama + +### Launch Huggingface + +Huggingface checkpoints can be launched with: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + +## Benchmark results + +The tables below list the benchmark comparisons between native Llama-2 (using Meta's checkpoint and Meta's inference code) and Megatron (using a converted HF checkpoint and Megatron's inference code). + +The values are the percent error between Megatron and Llama-2, calculated using the formula: `| - | / `, where the type of score is detailed before each table. Across all tests (80 total per model size), the mean error is 0.15%. The small difference in benchmark scores between the two models is due to minor arithmetic differences in implementation that alter the numerics slightly. Some of the factors that influence this difference include: + +- Megatron performs batch matrix multiplications in a couple places, such as within self attention and in SwiGLU, that Llama performs separately. +- Megatron uses `torch.baddbmm` within self attention, versus Llama using `torch.matmul`. +- Megatron uses a `sin`/`cos` implementation for rotary position embeddings, versus Llama using a `polar`/`complex` implementation. +- Llama calls `torch.set_default_dtype(torch.float16)` during initialization, which Megatron does not. + +### Big Bench + +Score type: multiple choice grade. + +| bigbench / standard | 7b | 13b | 70b | +| -- | -- | -- | -- | +| date_understanding | 0.29% | 0.13% | 0.12% | +| general_knowledge | 0.00% | 0.00% | 0.00% | +| human_organs_senses | 0.00% | 0.00% | 0.00% | +| intent_recognition | 0.00% | 0.11% | 0.00% | +| riddle_sense | 0.00% | 0.00% | 0.00% | +| similarities_abstraction | 0.00% | 0.58% | 0.00% | +| simple_arithmetic_json_multiple_choice | 0.00% | 0.00% | 0.00% | +| undo_permutation | 0.19% | 0.19% | 0.18% | + +### Multilingual + +Score type: multiple choice grade. + +| multilingual / xcopa | 7b | 13b | 70b | +| -- | -- | -- | -- | +| en-template-mGPT-remove-punctuation | 0.08% | 0.00% | 0.00% | +| et-template-mGPT-remove-punctuation | 0.00% | 0.13% | 0.25% | +| ht-template-mGPT-remove-punctuation | 0.26% | 0.13% | 0.26% | +| id-template-mGPT-remove-punctuation | 0.11% | 0.00% | 0.19% | +| it-template-mGPT-remove-punctuation | 0.00% | 0.10% | 0.09% | +| qu-template-mGPT-remove-punctuation | 0.00% | 0.00% | 0.27% | +| sw-template-mGPT-remove-punctuation | 0.14% | 0.13% | 0.13% | +| th-template-mGPT-remove-punctuation | 0.25% | 0.13% | 0.13% | +| tr-template-mGPT-remove-punctuation | 0.26% | 0.00% | 0.34% | +| vi-template-mGPT-remove-punctuation | 0.00% | 0.11% | 0.00% | +| zh-template-mGPT-remove-punctuation | 0.00% | 0.10% | 0.09% | + +### LM Evaluation Harness + +Score type: multiple choice grade. + +| lm-eval | 7b | 13b | 70b | +| -- | -- | -- | -- | +| boolq | 0.04% | 0.04% | 0.07% | +| hellaswag | 0.02% | 0.03% | 0.03% | +| piqa | 0.00% | 0.00% | 0.07% | +| winogrande | 0.00% | 0.11% | 0.20% | + +### MMLU + +Score type: multiple choice grade. + +Note: the number in brackets is the number of sub-tasks for each supercategory. + +| mmlu | 7b | 13b | 70b | +| -- | -- | -- | -- | +| stem [18] | 0.79% | 0.05% | 0.01% | +| humanities [13] | 0.19% | 0.01% | 0.02% | +| other (business, health, misc.) [14] | 0.08% | 0.06% | 0.12% | +| social sciences [12] | 0.37% | 0.21% | 0.01% | + +# Llama-3 + +Llama-3 checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of several steps: + +1. Get access to download the checkpoints (weights and tokenizer). +2. Clone the llama3 loading code from Meta. +3. Install the llama package from source. +4. Convert the checkpoints from Meta/Huggingface format to Megatron format. +5. Setup arguments for launching the model. + +The following sections detail these steps. + +## Contents + * [Download Meta or Huggingface checkpoints](#download-meta-or-huggingface-checkpoints) + * [Install tiktoken](#install-tiktoken) + * [Install llama package from Meta](#install-llama-package) + * [Convert checkpoint format](#convert-checkpoint-format) + * [Meta format](#meta-format) + * [Huggingface format](#huggingface-format) + * [Launch model](#launch-model) + * [Megatron](#launch-megatron) + * [Meta](#launch-meta) + * [Huggingface](#launch-hf) + * [Benchmark results](#benchmark-results) + +## Download Meta or Huggingface checkpoints + +Users must first apply for access to download the Llama-3 checkpoints either directly from [Meta](https://llama.meta.com/llama-downloads) or through [Huggingface](https://huggingface.co/meta-llama) (HF). The checkpoints are available in two formats, Meta's native format (available from both the Meta and HF links), and HF's format (available only from HF). Either format can be converted to Megatron, as detailed next. + +## Install tiktoken + +The Llama-3 tokenizer relies on the availability of the `tiktoken` module which can be installed through `pip`. + +## Install llama package from Meta + +1. In a location outside of the megatron-lm source directory, e.g `~`: `git clone https://github.com/meta-llama/llama3.git` +2. `cd $LLAMA3_SOURCE_DIR` +4. `pip install -e .` + +## Convert checkpoint format + +We recommend passing `--dtype bf16` for training or finetuning. Inference can be done in bfloat16 or float16. + +### Meta format + +The Meta format checkpoints are converted to HF format as an intermediate step before converting to Megatron format. The `transformers` package is required, and must have version >=4.31.0 (e.g., `pip install transformers>=4.31.0`). (**Note**: we have specifically tested with versions `4.31.0` and `4.32.0`; your experience may vary with newer versions.) Assuming the downloaded checkpoints are in `$CHECKPOINT_DIR` (with separate sub-directories for 8B, 70B, etc.), the following example command can be used to convert from Llama-3 format to HF format in bfloat16: + +``` +python tools/checkpoint/convert.py \ +> --model-type GPT \ +> --loader llama_mistral \ +> --saver mcore \ +> --checkpoint-type meta \ +> --model-size llama3-8B \ +> --load-dir $LLAMA_META_FORMAT_DIR \ +> --save-dir ${MEGATRON_FORMAT_DIR} \ +> --tokenizer-model ${TOKENIZER_MODEL} \ +> --target-tensor-parallel-size ${TP} \ +> --target-pipeline-parallel-size ${PP} \ +> --bf16 +``` + +Valid values for `--model_size` are `llama3-8B` and `llama3-70B` (for pretrained-only models), and `llama3-8Bf` and `llama3-70Bf` (for chat-finetuned models). + +### Huggingface format + +The HF checkpoints can be converted to Megatron format by using Megatron's own Llama-3 checkpoint converter for HF format (see script `tools/checkpoint/loader_llama_mistral.py`). One important argument that must be set correctly is the tensor parallel size (`TP`) for each model. The following table shows these values: + +| Model size | Tensor parallel size (`TP`) | +| ---------- | --------------------------- | +| 8B | 1 | +| 70B | 8 | + +Using these values for `TP`, along with the path to the Llama-3 tokenizer model (automatically downloaded with original checkpoint download; see `${TOKENIZER_MODEL}` below), run the following command from the root of your Megatron source code to convert from HF format to Megatron format: + +``` +$>: python tools/checkpoint/convert.py \ + > --model-type GPT \ + > --loader llama_mistral \ + > --saver mcore \ + > --target-tensor-parallel-size ${TP} \ + > --checkpoint-type hf + > --load-dir ${HF_FORMAT_DIR} \ + > --save-dir ${MEGATRON_FORMAT_DIR} \ + > --tokenizer-model ${TOKENIZER_MODEL} + > --model-size llama3-8B \ +``` + +Valid values for `--model-size` are `llama3-8B` and `llama3-70B` (for pretrained-only models), and `llama3-8Bf` and `llama3-70Bf` (for chat-finetuned models). + +After this conversion, we are ready to load the checkpoints into a Megatron GPT model. + +## Launch model + +### Launch Megatron + +If loading for either inference or finetuning, use the following arguments: + +``` +--tensor-model-parallel-size ${TP} \ +--pipeline-model-parallel-size 1 \ +--seq-length 4096 \ +--max-position-embeddings 4096 \ +--tokenizer-type Llama3Tokenizer \ +--tokenizer-model ${TOKENIZER_MODEL} \ +--load ${CHECKPOINT_DIR} \ +--exit-on-missing-checkpoint \ +--use-checkpoint-args \ +--no-load-optim \ +--no-load-rng \ +--untie-embeddings-and-output-weights \ +--normalization RMSNorm \ +--position-embedding-type rope \ +--no-masked-softmax-fusion \ +--attention-softmax-in-fp32 +``` + +### Launch Meta + +Meta checkpoints can be launched with: https://github.com/meta-llama/llama3 + +### Launch Huggingface + +Huggingface checkpoints can be launched by following the instructions here: https://huggingface.co/blog/llama3 + +## Benchmark results + +Llama-3 support in Megatron is currently experimental and we are still carrying out benchmark evaluations. + +# Mistral-7b + +Megatron currently supports loading the v.03 release of Mistral-7b (which does not use sliding window attention and offers a larger 32768 vocabulary) for inference and finetuning. Loading these checkpoints consists of several steps: + +1. Get access to download the checkpoints (weights and tokenizer). +2. Install the `mistral-common` package +3. Convert the checkpoints from HuggingFace format to Megatron format. +4. Setup arguments for launching the model. + +The following sections detail these steps. + +## Contents + * [Download Huggingface checkpoints](#download-huggingface-checkpoints) + * [Install mistral-common packgage](#install-mistral-common) + * [Convert checkpoint format](#convert-checkpoint-format) + * [Launch model](#launch-model) + * [Benchmark results](#benchmark-results) + +## Download Huggingface checkpoints + +Users must first apply for access to download the Mistral-7b checkpoints through [Huggingface](https://huggingface.co/mistralai/Mistral-7B-v0.3) (HF). Megatron does not currently support the v0.1 or v0.2 checkpoints, ensure you download v0.3. Megatron does not currently support using the raw weights directly from [Mistral](https://docs.mistral.ai/getting-started/open_weight_models/). + +## Install the mistral-common package + +`pip install mistral-common` + +## Convert checkpoint format + +The HF checkpoints can be converted to Megatron format by using Megatron's own Mistral checkpoint converter for HF format (see script `tools/checkpoint/loader_llama_mistral.py`). + +Using the path to the Mistral tokenizer model (downloaded alongside the HF checkpoint), run the following command from the root of your Megatron source code to convert from HF format to mcore format: + +``` +$>: python tools/checkpoint/convert.py \ + > --model-type GPT \ + > --loader llama_mistral \ + > --saver mcore \ + > --target-tensor-parallel-size ${TP} \ + > --checkpoint-type hf \ + > --load-dir ${HF_FORMAT_DIR} \ + > --save-dir ${MEGATRON_FORMAT_DIR} \ + > --tokenizer-model ${TOKENIZER_MODEL} \ + > --model-size mistral-7B \ +``` + +Valid values for `--model-size` are mistral-7B for the pretrained model or mistral-7Bf for the chat fine-tuned model. + +After this conversion, we are ready to load the checkpoints into an mcore GPT model. + +## Launch model + +If loading for either inference or finetuning, use the following arguments: + +``` +--tensor-model-parallel-size ${TP} \ +--pipeline-model-parallel-size 1 \ +--seq-length 4096 \ +--max-position-embeddings 4096 \ +--tokenizer-type MistralTokenizer \ +--tokenizer-model ${TOKENIZER_MODEL} \ +--load ${CHECKPOINT_DIR} \ +--exit-on-missing-checkpoint \ +--use-checkpoint-args \ +--no-load-optim \ +--no-load-rng \ +--untie-embeddings-and-output-weights \ +--normalization RMSNorm \ +--position-embedding-type rope \ +--no-masked-softmax-fusion \ +--attention-softmax-in-fp32 +``` + +## Benchmark results + +Mistral-7B support in Megatron is currently experimental and we are still carrying out benchmark evaluations. + +# Other Llama-like model support + +*Note: Experimental* + +Many models such as Yi-34B use the Llama architecture and may be converted from HuggingFace to Megatron using the commands in [Llama3](#llama-3). diff --git a/docs/source/api-guide/context_parallel.rst b/docs/source/api-guide/context_parallel.rst new file mode 100644 index 0000000..c381f66 --- /dev/null +++ b/docs/source/api-guide/context_parallel.rst @@ -0,0 +1,35 @@ +context\_parallel package +========================= + +Context parallelism overview +---------------------------- + +.. figure:: ../images/context_parallel/CP_overview.png + :alt: cp_overview + :align: center + + Figure 1: A transformer layer running with TP2CP2. Communications next to Attention are for CP, others are for TP. (AG/RS: all-gather in forward and reduce-scatter in backward, RS/AG: reduce-scatter in forward and all-gather in backward, /AG: no-op in forward and all-gather in backward). + +Context Parallelism ("CP") is a parallelization scheme on the dimension of sequence length. Unlike prior SP (sequence parallelism) which only splits the sequence of Dropout and LayerNorm activations, CP partitions the network inputs and all activations along sequence dimension. With CP, all modules except attention (e.g., Linear, LayerNorm, etc.) can work as usual without any changes, because they do not have inter-token operations. As for attention, the Q (query) of each token needs to compute with the KV (key and value) of all tokens in the same sequence. Hence, CP requires additional all-gather across GPUs to collect the full sequence of KV. Correspondingly, reduce-scatter should be applied to the activation gradients of KV in backward propagation. To reduce activation memory footprint, each GPU only stores the KV of a sequence chunk in forward and gathers KV again in backward. KV communication happens between a GPU and its counterparts in other TP groups. The all-gather and reduce-scatter are transformed to point-to-point communications in ring topology under the hood. Exchanging KV also can leverage MQA/GQA to reduce communication volumes, as they only have one or few attention heads for KV. + +For example, in Figure 1, assuming sequence length is 8K, each GPU processes 4K tokens. GPU0 and GPU2 compose a CP group, they exchange KV with each other. Same thing also happens between GPU1 and GPU3. CP is similar to `Ring Attention `_ but provides better performance by (1) leveraging the latest OSS and cuDNN flash attention kernels; (2) removing unnecessary computation resulted from low-triangle causal masking and achieving optimal load balance among GPUs. + +Context parallelism benefits +---------------------------- + +.. figure:: ../images/context_parallel/CP_results.png + :alt: cp_results + :align: center + + Figure 2: Speedup of 175B GPT with various TP+CP combinations vs. full recompute (i.e., TP8CP1). + +LLM encounters OOM (out of memory) issue with long context (i.e., long sequence length) because of linearly increasing memory footprint of activations. Recomputing activations in backward can avoid OOM but also introduce significant overheads (~30% with full recompute). Enlarging TP (tensor model parallelism) can fix the OOM issue as well, but it potentially makes compute (e.g., Linear) too short to overlap communication latencies. To be clear, scaling out to more GPUs with bigger TP can hit the overlapping problem no matter if OOM happens. + +CP can better address the issues. With CP, each GPU only computes on a part of the sequence, which reduces both computation and communication by CP times. Therefore, there are no concerns about the overlapping between them. The activation memory footprint per GPU is also CP times smaller, hence no OOM issue any more. As Figure 2 shows, the combinations of TP and CP can achieve optimal performance by eliminating recompute overheads and making the best tradeoff between computation and communications. + +Enabling context parallelism +---------------------------- + +CP support has been added to GPT. All models that share GPT code path also should be able to benefit from CP, such as Llama. CP can work with TP (tensor model parallelism), PP (pipeline model parallelism), and DP (data parallelism), where the total number of GPUs equals TPxCPxPPxDP. CP also can work with different attention variants, including MHA/MQA/GQA, uni-directional and bi-directional masking. + +CP is enabled by simply setting context_parallel_size= in command line. Default context_parallel_size is 1, which means CP is disabled. Running with CP requires Megatron-Core (>=0.5.0) and Transformer Engine (>=1.1). diff --git a/docs/source/api-guide/datasets.rst b/docs/source/api-guide/datasets.rst new file mode 100644 index 0000000..247a3f0 --- /dev/null +++ b/docs/source/api-guide/datasets.rst @@ -0,0 +1,104 @@ +datasets package +================ + +.. mdinclude :: ../../../megatron/core/datasets/readme.md + +Submodules +---------- + +datasets.blended\_megatron\_dataset\_config module +--------------------------------------------------- + +.. automodule:: core.datasets.blended_megatron_dataset_config + :members: + :undoc-members: + :show-inheritance: + +datasets.blended\_megatron\_dataset\_builder module +--------------------------------------------------- + +.. automodule:: core.datasets.blended_megatron_dataset_builder + :members: + :undoc-members: + :show-inheritance: + +datasets.megatron\_tokenizer module +----------------------------------- + +.. automodule:: core.datasets.megatron_tokenizer + :members: + :undoc-members: + :show-inheritance: + +datasets.indexed\_dataset module +-------------------------------- + +.. automodule:: core.datasets.indexed_dataset + :members: + :undoc-members: + :show-inheritance: + +datasets.megatron\_dataset module +--------------------------------- + +.. automodule:: core.datasets.megatron_dataset + :members: + :undoc-members: + :show-inheritance: + +datasets.gpt\_dataset module +---------------------------- + +.. automodule:: core.datasets.gpt_dataset + :members: + :undoc-members: + :show-inheritance: + +datasets.masked\_dataset module +------------------------------- + +.. automodule:: core.datasets.masked_dataset + :members: + :undoc-members: + :show-inheritance: + +datasets.bert\_dataset module +----------------------------- + +.. automodule:: core.datasets.bert_dataset + :members: + :undoc-members: + :show-inheritance: + +datasets.t5\_dataset module +--------------------------- + +.. automodule:: core.datasets.t5_dataset + :members: + :undoc-members: + :show-inheritance: + +datasets.blended\_dataset module +---------------------------------- + +.. automodule:: core.datasets.blended_dataset + :members: + :undoc-members: + :show-inheritance: + +datasets.utils module +--------------------- + +.. automodule:: core.datasets.utils + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: core.datasets + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/api-guide/dist_checkpointing.rst b/docs/source/api-guide/dist_checkpointing.rst new file mode 100644 index 0000000..7e384a0 --- /dev/null +++ b/docs/source/api-guide/dist_checkpointing.rst @@ -0,0 +1,79 @@ +dist\_checkpointing package +=========================== + +A library for saving and loading the distributed checkpoints. +A "distributed checkpoint" can have various underlying formats (current default format is based on Zarr) +but has a distinctive property - the checkpoint saved in one parallel configuration (tensor/pipeline/data parallelism) +can be loaded in a different parallel configuration. + +Using the library requires defining sharded state_dict dictionaries with functions from *mapping* and *optimizer* modules. +Those state dicts can be saved or loaded with a *serialization* module using strategies from *strategies* module. + + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + dist_checkpointing.strategies + +Submodules +---------- + +dist\_checkpointing.serialization module +---------------------------------------- + +.. automodule:: core.dist_checkpointing.serialization + :members: + :undoc-members: + :show-inheritance: + +dist\_checkpointing.mapping module +---------------------------------- + +.. automodule:: core.dist_checkpointing.mapping + :members: + :undoc-members: + :show-inheritance: + +dist\_checkpointing.optimizer module +------------------------------------ + +.. automodule:: core.dist_checkpointing.optimizer + :members: + :undoc-members: + :show-inheritance: + +dist\_checkpointing.core module +------------------------------- + +.. automodule:: core.dist_checkpointing.core + :members: + :undoc-members: + :show-inheritance: + +dist\_checkpointing.dict\_utils module +-------------------------------------- + +.. automodule:: core.dist_checkpointing.dict_utils + :members: + :undoc-members: + :show-inheritance: + + +dist\_checkpointing.utils module +-------------------------------- + +.. automodule:: core.dist_checkpointing.utils + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: core.dist_checkpointing + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api-guide/dist_checkpointing.strategies.rst b/docs/source/api-guide/dist_checkpointing.strategies.rst new file mode 100644 index 0000000..41e674c --- /dev/null +++ b/docs/source/api-guide/dist_checkpointing.strategies.rst @@ -0,0 +1,50 @@ +dist\_checkpointing.strategies package +====================================== + +Package defining different checkpoint formats (backends) and saving/loading algorithms (strategies). + +Strategies can be used for implementing new checkpoint formats or implementing new (more optimal for a given use case) ways of saving/loading of existing formats. +Strategies are passed to `dist_checkpointing.load` and `dist_checkpointing.save` functions and control the actual saving/loading procedure. + +Submodules +---------- + +dist\_checkpointing.strategies.base module +------------------------------------------ + +.. automodule:: core.dist_checkpointing.strategies.base + :members: + :undoc-members: + :show-inheritance: + +dist\_checkpointing.strategies.tensorstore module +------------------------------------------------- + +.. automodule:: core.dist_checkpointing.strategies.tensorstore + :members: + :undoc-members: + :show-inheritance: + +dist\_checkpointing.strategies.two\_stage module +------------------------------------------------ + +.. automodule:: core.dist_checkpointing.strategies.two_stage + :members: + :undoc-members: + :show-inheritance: + +dist\_checkpointing.strategies.zarr module +------------------------------------------ + +.. automodule:: core.dist_checkpointing.strategies.zarr + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: core.dist_checkpointing.strategies + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api-guide/distributed.rst b/docs/source/api-guide/distributed.rst new file mode 100644 index 0000000..7378203 --- /dev/null +++ b/docs/source/api-guide/distributed.rst @@ -0,0 +1,53 @@ +distributed package +=================== + +This package contains various utilities to finalize model weight gradients +on each rank before the optimizer step. This includes a distributed data +parallelism wrapper to all-reduce or reduce-scatter the gradients across +data-parallel replicas, and a `finalize\_model\_grads` method to +synchronize gradients across different parallelism modes (e.g., 'tied' +layers on different pipeline stages, or gradients for experts in a MoE on +different ranks due to expert parallelism). + +Submodules +---------- + +distributed.distributed\_data\_parallel +--------------------------------------- + +Model wrapper for distributed data parallelism. Stores gradients in a +contiguous buffer, and supports the option of overlapping communication +(all-reduce or reduce-scatter) with backprop computation by breaking up +full model's gradients into smaller buckets and running all-reduce / +reduce-scatter on each bucket asynchronously. + +.. automodule:: core.distributed.distributed_data_parallel + :members: + :undoc-members: + :show-inheritance: + +distributed.finalize\_model\_grads +---------------------------------- + +Finalize model gradients for optimizer step across all used parallelism modes. +Synchronizes the all-reduce / reduce-scatter of model gradients across DP replicas, +all-reduces the layernorm gradients for sequence parallelism, embedding gradients +across first and last pipeline stages (if not tied), and expert gradients for expert +parallelism. + +.. automodule:: core.distributed.finalize_model_grads + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +Contains functionality to synchronize gradients across different ranks before +optimizer step. + +.. automodule:: core.distributed + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api-guide/fusions.rst b/docs/source/api-guide/fusions.rst new file mode 100644 index 0000000..694ed12 --- /dev/null +++ b/docs/source/api-guide/fusions.rst @@ -0,0 +1,65 @@ +fusions package +=============== + +This package provides modules that provide commonly fused +operations. Fusing operations improves compute efficiency by +increasing the amount of work done each time a tensor is read from +memory. To perform the fusion, modules in this either rely on PyTorch +functionality for doing just-in-time compilation +(i.e. `torch.jit.script` in older PyTorch versions of `torch.compile` +in recent versions), or call into custom kernels in external libraries +such as Apex or TransformerEngine. + +Submodules +---------- + +fusions.fused\_bias\_dropout module +----------------------------------- + +This module uses PyTorch JIT to fuse the bias add and dropout operations. Since dropout is not used during inference, different functions are used when in train mode and when in inference mode. + +.. automodule:: core.fusions.fused_bias_dropout + :members: + :undoc-members: + :show-inheritance: + +fusions.fused\_bias\_gelu module +-------------------------------- + +This module uses PyTorch JIT to fuse the bias add and GeLU nonlinearity operations. + +.. automodule:: core.fusions.fused_bias_gelu + :members: + :undoc-members: + :show-inheritance: + +fusions.fused\_layer\_norm module +--------------------------------- + +This module provides a wrapper around various fused LayerNorm implementation in Apex. + +.. automodule:: core.fusions.fused_layer_norm + :members: + :undoc-members: + :show-inheritance: + +fusions.fused\_softmax module +----------------------------- + +This module provides wrappers around variations of Softmax in Apex. + +.. automodule:: core.fusions.fused_softmax + :members: + :undoc-members: + :show-inheritance: + +fusions.fused\_cross\_entropy\_loss module +------------------------------------------ + +This module uses PyTorch JIT to fuse the cross entropy loss calculation and batches communication calls. + +.. automodule:: core.fusions.fused_softmax + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/api-guide/index.rst b/docs/source/api-guide/index.rst new file mode 100644 index 0000000..d0206eb --- /dev/null +++ b/docs/source/api-guide/index.rst @@ -0,0 +1,17 @@ +API Guide +========= + +.. toctree:: + :maxdepth: 4 + + models + tensor_parallel + context_parallel + pipeline_parallel + fusions + transformer + moe + dist_checkpointing + distributed + datasets + num_microbatches_calculator diff --git a/docs/source/api-guide/models.bert.rst b/docs/source/api-guide/models.bert.rst new file mode 100644 index 0000000..1b562ce --- /dev/null +++ b/docs/source/api-guide/models.bert.rst @@ -0,0 +1,22 @@ +models.bert package +=================== +Useful package for training bert and bert like encoder only models. It optionally comes with a binary head that can be used for classification tasks . + +Submodules +---------- + +models.bert.bert\_model module +------------------------------ + +.. automodule:: core.models.bert.bert_model + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: core.models.bert + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api-guide/models.gpt.rst b/docs/source/api-guide/models.gpt.rst new file mode 100644 index 0000000..31c4da6 --- /dev/null +++ b/docs/source/api-guide/models.gpt.rst @@ -0,0 +1,22 @@ +models.gpt package +================== +This is the implementation of the popular GPT model. It supports several features like model parallelization (Tensor Parallel, Pipeline Parallel, Data Parallel) , mixture of experts, FP8 , Distributed optimizer etc. We are constantly adding new features. So be on the lookout or raise an issue if you want to have something added. + +Submodules +---------- + +models.gpt.gpt\_model module +---------------------------- + +.. automodule:: core.models.gpt.gpt_model + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: core.models.gpt + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api-guide/models.rst b/docs/source/api-guide/models.rst new file mode 100644 index 0000000..12c40e4 --- /dev/null +++ b/docs/source/api-guide/models.rst @@ -0,0 +1,21 @@ +models package +============== +This package contains most of the popular LLMs . Currently we have support for GPT, Bert, T5 and Retro . This is an ever growing list so keep an eye out. + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + models.gpt + models.t5 + models.bert + +Module contents +--------------- + +.. automodule:: core.models + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api-guide/models.t5.rst b/docs/source/api-guide/models.t5.rst new file mode 100644 index 0000000..1cc3315 --- /dev/null +++ b/docs/source/api-guide/models.t5.rst @@ -0,0 +1,21 @@ +models.t5 package +================= + +Submodules +---------- + +models.t5.t5\_model module +-------------------------- + +.. automodule:: core.models.T5.t5_model + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: core.models.T5 + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api-guide/moe.rst b/docs/source/api-guide/moe.rst new file mode 100644 index 0000000..9afc01e --- /dev/null +++ b/docs/source/api-guide/moe.rst @@ -0,0 +1,4 @@ +Mixture of Experts package +========================== + +.. mdinclude :: ../../../megatron/core/transformer/moe/README.md diff --git a/docs/source/api-guide/num_microbatches_calculator.rst b/docs/source/api-guide/num_microbatches_calculator.rst new file mode 100644 index 0000000..1c478a7 --- /dev/null +++ b/docs/source/api-guide/num_microbatches_calculator.rst @@ -0,0 +1,12 @@ +Microbatches Calculator +============== +This api is used to calculate the number of microbatches required to fit a given model on a given batch size. + + +Module contents +--------------- + +.. automodule:: core.num_microbatches_calculator + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api-guide/pipeline_parallel.rst b/docs/source/api-guide/pipeline_parallel.rst new file mode 100644 index 0000000..5c67079 --- /dev/null +++ b/docs/source/api-guide/pipeline_parallel.rst @@ -0,0 +1,47 @@ +pipeline\_parallel package +========================== + +This package contains implementations for two different pipeline parallelism +schedules (one without interleaving and one with interleaving, see `Efficient +Large-Scale Language Model Training on GPU Clusters Using Megatron-LM `_ +for details), and a default no-pipelining schedule. It also contains methods +for the point-to-point communication that is needed between pipeline stages. + +Submodules +---------- + +pipeline\_parallel.p2p\_communication module +-------------------------------------------- + +Contains implementations for the various point-to-point communication needed +(e.g., `recv_forward` and `recv_backward`) in the different pipeline parallelism +schedules. + +.. automodule:: core.pipeline_parallel.p2p_communication + :members: + :undoc-members: + :show-inheritance: + +pipeline\_parallel.schedules module +----------------------------------- + +Contains implementations for two pipeline parallelism schedules +(`forward_backward_pipelining_with_interleaving`for pipeline parallelism with +interleaving, `forward_backward_pipelining_without_interleaving` for pipeline +parallelism without interleaving) and a default no-pipelining schedule +(`forward_backward_no_pipelining`). `get_forward_backward_func` returns the right +scheduling function to use based on the configuration being trained +(e.g., if pipeline-parallel size is 1, use `forward_backward_no_pipelining`). + +.. automodule:: core.pipeline_parallel.schedules + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: core.pipeline_parallel + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api-guide/tensor_parallel.rst b/docs/source/api-guide/tensor_parallel.rst new file mode 100644 index 0000000..d8ae9de --- /dev/null +++ b/docs/source/api-guide/tensor_parallel.rst @@ -0,0 +1,67 @@ +tensor\_parallel package +======================== + +This package contains an implementation for tensor parallelism in transformer +models (see `Megatron-LM: Training Multi-Billion Parameter Language Models +Using Model Parallelism `_ and `Reducing +Activation Recomputation in Large Transformer Models `_ +for details). + +Submodules +---------- + +tensor\_parallel.cross\_entropy module +-------------------------------------- + +.. automodule:: core.tensor_parallel.cross_entropy + :members: + :undoc-members: + :show-inheritance: + +tensor\_parallel.data module +---------------------------- + +.. automodule:: core.tensor_parallel.data + :members: + :undoc-members: + :show-inheritance: + +tensor\_parallel.layers module +------------------------------ + +.. automodule:: core.tensor_parallel.layers + :members: + :undoc-members: + :show-inheritance: + +tensor\_parallel.mappings module +-------------------------------- + +.. automodule:: core.tensor_parallel.mappings + :members: + :undoc-members: + :show-inheritance: + +tensor\_parallel.random module +------------------------------ + +.. automodule:: core.tensor_parallel.random + :members: + :undoc-members: + :show-inheritance: + +tensor\_parallel.utils module +----------------------------- + +.. automodule:: core.tensor_parallel.utils + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: core.tensor_parallel + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api-guide/transformer.rst b/docs/source/api-guide/transformer.rst new file mode 100644 index 0000000..6e2e894 --- /dev/null +++ b/docs/source/api-guide/transformer.rst @@ -0,0 +1,136 @@ +transformer package +=================== + +The `transformer` package provides a customizable and configurable +implementation of the transformer model architecture. Each component +of a transformer stack, from entire layers down to individual linear +layers, can be customized by swapping in different PyTorch modules +using the "spec" parameters (see `here +`_). The +configuration of the transformer (hidden size, number of layers, +number of attention heads, etc.) is provided via a `TransformerConfig` +object. + +Submodules +---------- + +transformer.attention module +---------------------------- + +This is the entire attention portion, either self or cross attention, +of a transformer layer including the query, key, and value +projections, a "core" attention calculation (e.g. dot product +attention), and final output linear projection. + +.. automodule:: core.transformer.attention + :members: + :undoc-members: + :show-inheritance: + +transformer.dot\_product\_attention module +------------------------------------------ + +This is a PyTorch-only implementation of dot product attention. A more +efficient implementation, like those provided by FlashAttention or +CUDNN's FusedAttention, are typically used when training speed is +important. + +.. automodule:: core.transformer.dot_product_attention + :members: + :undoc-members: + :show-inheritance: + +transformer.enums module +------------------------ + +.. automodule:: core.transformer.enums + :members: + :undoc-members: + :show-inheritance: + +transformer.identity\_op module +------------------------------- + +This provides a pass-through module that can be used in specs to +indicate that the operation should not be performed. For example, when +using LayerNorm with the subsequent linear layer, an IdentityOp can be +passed in as the LayerNorm module to use. + +.. automodule:: core.transformer.identity_op + :members: + :undoc-members: + :show-inheritance: + +transformer.mlp module +---------------------- + +This is the entire MLP portion of the transformer layer with an input +projection, non-linearity, and output projection. + +.. automodule:: core.transformer.mlp + :members: + :undoc-members: + :show-inheritance: + +transformer.module module +------------------------- + +This provides a common base class for all modules used in the +transformer that contains some common functionality. + +.. automodule:: core.transformer.module + :members: + :undoc-members: + :show-inheritance: + +transformer.transformer\_block module +------------------------------------- + +A block, or stack, of several transformer layers. The layers can all +be the same or each can be unique. + +.. automodule:: core.transformer.transformer_block + :members: + :undoc-members: + :show-inheritance: + +transformer.transformer\_config module +-------------------------------------- + +This contains all of the configuration options for the +transformer. Using a dataclass reduces code bloat by keeping all +arguments together in a dataclass instead of passing several arguments +through multiple layers of function calls. + +.. automodule:: core.transformer.transformer_config + :members: + :undoc-members: + :show-inheritance: + +transformer.transformer\_layer module +------------------------------------- + +A single standard transformer layer including attention and MLP blocks. + +.. automodule:: core.transformer.transformer_layer + :members: + :undoc-members: + :show-inheritance: + +transformer.utils module +------------------------ + +Various utilities used in the transformer implementation. + +.. automodule:: core.transformer.utils + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: core.transformer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/distrib_optimizer.md b/docs/source/distrib_optimizer.md new file mode 100644 index 0000000..def23b2 --- /dev/null +++ b/docs/source/distrib_optimizer.md @@ -0,0 +1,54 @@ +# Distributed Optimizer + +The motivation for the distributed optimizer is to save memory by distributing the optimizer state evenly across data parallel ranks, versus the current method of replicating the optimizer state across data parallel ranks. As described in https://arxiv.org/abs/1910.02054, this branch specifically implements the following: + +- [yes] distribute all 'non-overlapping' optimizer state (i.e., model params already in fp32 are NOT distributed) +- [no] distribute model gradients +- [no] distribute model parameters + +Theoretical memory savings vary depending on the combination of the model's param dtype and grad dtype. In the current implementation, the theoretical number of bytes per parameter is (where 'd' is the data parallel size): + +| | Non-distributed optim | Distributed optim | +| ------ | ------ | ------ | +| float16 param, float16 grads | 20 | 4 + 16/d | +| float16 param, fp32 grads | 18 | 6 + 12/d | +| fp32 param, fp32 grads | 16 | 8 + 8/d | + +The implementation of the distributed optimizer is centered on using the contiguous grad buffer for communicating grads & params between the model state and the optimizer state. The grad buffer at any given moment either holds: + +1. all model grads +2. a 1/d size _copy_ of the main grads (before copying to the optimizer state) +3. a 1/d size _copy_ of the main params (after copying from the optimizer state) +4. all model params +5. zeros (or None), between iterations + +The grad buffer is used for performing reduce-scatter and all-gather operations, for passing grads & params between the model state and optimizer state. With this implementation, no dynamic buffers are allocated. + +The figures below illustrate the grad buffer's sharding scheme, and the key steps of the distributed optimizer's param update: + +## Data flow + +![Data flow](images/distrib_optimizer/data_flow.png) + +## Sharding scheme + +![Sharding scheme](images/distrib_optimizer/sharding_scheme.png) + +## Key steps + +_(note: using illustrations above, and assuming fp16 grads)_ + +- Backward pass finishes (grad buffer holds 16 fp16 grad elements) +- Call reduce-scatter on each DP rank +- Each DP rank now has 4 elements within the grad buffer that are fully reduced (remaining 12 elements are garbage) +- Each DP rank copies its relevant 4 fp16 grad elements from the grad buffer into 4 fp32 main grad elements (separate buffer, owned by the optimizer); i.e. + - DP rank 0 copies elements [0:4] + - DP rank 1 copies elements [4:8] + - DP rank 2 copies elements [8:12] + - DP rank 3 copies elements [12:16] +- Optimizer.step() +- Each DP rank copies its 4 fp32 main (/optimizer) param elements into the corresponding 4 fp16 elements in the grad buffer +- Call all-gather on each DP rank +- Grad buffer now contains all 16, fully updated, fp16 model param elements +- Copy updated model params from grad buffer into their respective param tensors +- (At this point, grad buffer is ready to be zero'd for the next iteration) diff --git a/docs/source/images/context_parallel/CP_overview.png b/docs/source/images/context_parallel/CP_overview.png new file mode 100644 index 0000000000000000000000000000000000000000..38c55b371aafbd639b47ab3eea8aa406ca3beb56 GIT binary patch literal 154304 zcmeEvd00~E`?rmaG?PoVy4x~Ni@=RT*a4tD$H zzE}BPN=i!Zz|VV*N=eD8OG$luW|<6d=D7Pl;6Q2#^637bq;eY627!a6Fq^|RQd0SG z%jdkl1&+T9_}K+1CADLd_`f9~=~h8fQlj(&du)z{xDN|FUY}z{QbRv(Ye>x}iG)DeUC#~1~yU0bQ{Hw`DO#pxX>yw|A@&8lc zSVu?94rcNyL;?7=NqH<8+qr0>o87{83uIn01<#D6TpL+N^OjL`up!b&ldM zI5FiPJeR$Y5Tg`7Wjt7hh3fc9tLbiRG*VhBz_R-NH{K4S28~S?i`R;>-Mx_YcKo6g zV5A-uw8Kg;*%A$3x*f`1BOjzvwCu$uZl@-CvN>B2I;U#wu_wTZe_v_L-M$+9dtu5W zAzOou=F+AFLO_0E0_!(>aOfyuY^1Y9p|Bs8isn+dg16ea7So}ncAMmVFSAxjq)Ag) z(28e>w^SXfl5EOGG^x)1D0Ad};iF`|1Z^4r%l+?HbfI~j(Y8whw(CV*n4P65W7jR2 z+|3RwHt15|{xodvJ7nUG+WC$@?A&%?{oGpUXhdn_HQw1ayGt61iq+7g;aT#^jqMat z^TcQCVj%K4y+@nQdbR(9}mx(JdKhHMeQrrsI&1{5K zk86O`dQkI{oig~gLAz!f`AubwDfD>4&ITN##RzxcQnpUcP6q=GQo57YSj0DyB~D)U zuacC&%F*h33^?{cgO_a665I;uVqdz0>@VL0oZJJdRBCn4G>?}*)uo7u?!#N*-@}Es zA7X#)f|2?l_jA+7LR|GVBSvkeA_==vNLsm0jhiLcw)R-4W|ezaNaNws2ka2CtxIq_ zjn|KL1~bqm-$ps=8Rs2ANln&tL=CjLhqHTDlb`5pK3Ao+7gV){Q^4Z(=L^S5OyLv5 z6GuT$N#RX2LAG)(F#%jSl|05tk7M0#JcJ~VFUXPv$S%2Yk*}|`{ZG<|f35E<2}Xol zZ?2O&IvKWfc9YC9QdECPXYG1c9C&u1$DeW}ygphd2E8BAp5M^6hpo4)fSXMkA&W!H zidxuvXb#^$9LvA};ikYUmc=av=}|F*Qi7t}uRFp!1Kz-^{%Gld_pLH1mt+z*-X~r> zrf2fB4Dje35&epw3gRVHuhCoKh3UV(ierM_b^a~B$p^P0T z)LQB#RB)Ylgw9!{N@S5jZ&-jMPKOaq2hOBzeSS~%XDu@+JA3lQ8&b7t+(ZS)3jR&n z^IU`{a$bULUcX*bd0}aC{Ah~K!zi@VH!%$K_?yiNnO$}kCvwZUG#1%y{N(5-A6URK zu7>y}n;%@vczxP${oN(@m4jz3^AxQ`t?m3YWVkNOUx}>a5BDAF_SA`A zT)gYbg%}fSK1r;3)#fEl3J;~bXFuuuTyqr+Pxi})79SSTfB(^ z4HLbZ<~^b4YCIa}zB+NFvcvkR<6aPw=TAtU+BvXo$Z+TW?Im1iwcVkbFGG@(`^Ik2 zW9cb5wh4lw9X+j$fd?p+wQH-Pj(+#e!)dV8<#fZsqY{r~l@tzB+lKo(IOvm(;j%A_ zJd{td$xtEf!YfMdAPsTK!ae`+dq3^r{#emptK$A8Wo)?jXpeSo>4l*nrc7!Xf24$v zabHFHqPG-H#e3G|8LSQhIbEL+$!51YBP*~W!{kBRKAv*p5!qd|U~bD3v7=xEWB7zP zrPeh(0&0X9(@)7@C}ZJ-mD29e89Y5(Ic@@^8PRkuukVSwG~91<#tpjJBrS&RU2pxu zxng%*U>5Xx8x`D3yv}z5YanaQQfC7hVHm|#5Fc?zV$ur;NG>UVR42AKvPZ-VC-F&?lDBiF_49z-cS!dq|}v*9av zP``FV@7djS3I9U!-frAr%9SdR8_fI`fc6+%Hv;hYe z^V4xh)pILP%NAbsq4>cM%Kq-E{~G1jM7z`J`H?mUkG1h5$(ngwsKC-H+jiGZ>d4gA z3ht{?UsR*%PqMqsQdKsj%p||&RImWrlD&mNu{;E z)yTj)BlX5AJ!y{Jlqx)!{I&*sW6JVuZ`16A5|6P7LuwAsu_Spkfv<+S4=2a+ zLu8TU8|?F%$i}L264FoAs`kW?a%25%wvX>2hW__&e=ihzQOXVB9_9YG!iM-4uF;oq z5y52v-z{X}O0DL`UxCgH_(T~vNaQq;$Vqy4@mkM=s%ttj_u~E3UxW-qf{@s2xUjmY zw>r{7--hg0`Mm7OCZv^?%yE2?zhj(f$rNDac6vvL^&nNow?$WPgqL`+mwyI-I67K} z(s2H*z->9RFuF)Hvpb1DjHV-=Aufy%-HUHtWY6wI(#I%#)nU~^m27IbR)??H+z&+LVwq z70gI`gl|9?WP?^|stgoC8P>f}xA35G^~^9iwc_Npbl1=t?AqLtwJ=<235TQ<*LG5D zt&MQczLjBASQ}H0P)^CLE++yZDueSbao<;a{5x)ECe&RM-+wte=dl4#Z8+S5<<1Zb zX8t0D6~RFs^c_;uC}0mmM~Gl9ms{r1G2RuTiKm|tgw!Kaz3lbKUq}Vp|7yOF<=mN< zs0wJ1EIH@-W7O@P8JRh5xsH}?*$LqdeUYaeOFEzA z`{iq->+sr<^ie(R``|jA@IfP=5_daUYHsHVrMNdIy;Eq$qKaCF zwT4Sadar2hfZ~z$6NQ}ECR-V`n%P63mXQ*+bJs{{hiR|~TK4vArp89UXN4WfdNJqU zAeDG9M$n2KE~#4qXUXbhGE+*5%FuB(S+6>xK05PgeXFeK#@)L*^t*L&5XcE7p4S^> zf##z5^}E;;afR^bg?s~@kGKTNdg?#YK8&DAk*eeO!13S-JVjdy~%yzCGp z7hfg{PFsMZ6Cjx(S-kvk`3wqf6(u9N_=Dxk+a$k_PrCR?iL}`(E<;IpT?S9>MMjWn zwq`D|LOEYFaMd;hJnuj|zT0mv36RQ#6nEtskm8;(-&JK8W=LQ4&a;!f1+kTlxUYKa zMNry@cYJjM;gnln{*!vmT78_p@v=a6-8A%TMp@eUdVC;*{Ju z*PnC&$q{GRk9!&bgS1ZNYX9Xee{Wf)so4x(8GG$)iDBGVFI*pt} zTGXf1kWP|67CQrNs>{c=@v=E8xDF0p`e&XT)jg~NK2aN%bPd35Bn$uZxB;5}E2xY- zM>VAQbhzbvxaXh|;SM?uKA0Gosv1%fWFAY;(%Fow2=e6KBnCplM3N){_3S&^t2f}4 zYH92&NLW&j{dX!m6oS3d`{|Lpu)-n33^J~a4sd;Ps-$io@7Id zP6yB*@XLJCFtYES>1ex-cht)jy$$JNy^fJ*Hhf>+FWctGmGgF+m`na>cq_lYUhYIn zaZo2Oo29}`;XyUXfIMcE(_)WpP*#8OdywW!E&g3J|9y7jafD0HZ37xThhRXcxEAHR zl{eD)Y{w%M;*LH~X@~l-g|X;NbCp;WEDaE4h#uylk;|fuzr@ppM|}zJ$H!>v)F%to z|GEgHfFfKhDNZas*u%z+GBxih_~WQNpQg^6L<8XtWeYRo?vHn1AqaK#Vb%4;f%hD_ z$;v^4_Y&==R9;h=%!Y3&kYwvVwDiyMU>LHIpa0;11H`ZBQZqL=exF6}zkSIOql&gxjkuG@Oi{ zDPQu@i-LY#o{e_$e_L!1AzoU9x3|B5w*x=aDTS;gXLX*Sdaw79(UIZTb#hD|4ZO3t zV#t&^tfL%a63i4jf)$W)Y>Ks~->pGCtZL?P39af~0W4Kw5B7C4N~y*{?_^(cEFJQ) zW1^fH%S5Ub+(`LI^xF%-e%1#nKi}Vhd6Fr!U$@-|kR#t8gTGT`)nAI-@G{Xe*GQCT z5Zh=88Y|67FHFk&7C98!zyq_=3UdIgfUhaH_@$DFZfZEe$@Xpa%kA$oBp|LpzIl7- zuam3AcLp-M*{UB30@Ff`kVB=+v3ro(eCZFpde3x%2fJ%%rC!%sy4HxmdAgJoFdS7(WPM1m$u?(dkhF$^p0bM9=*SpxEA<-NZ_?&^ zXy^k7<(}KxkEl6unT0odZl|5~!>nr+Qnst-nXkiFM|G#yS%M&34!-}A&KY8g<4v?A zAE@jIEkk(ud1%m`ErR&5fr6d;%_UJcL&IF5I>|}lt1-0k)`kY5&(8C1V_ke0BsJbSXotf7?qw5 zmwM~Tg5Mxjr6b@IIS&>jsY9d}cI55%9a7F|-<%EDs_P{cl&zcwCdehdB_a@vFds`@ z$l%~YFII_OY~=lXu?{a|ZH$rFh?BC`^n(CIUy^`qNWL3V-}hT+1%U12pL2Xc=1_-s zx2x{JfeN)Ij(^e)4Nz2aIa2CgOcCl6Yb06Cp+K|=okz4jX+28A*xgO2zg)fcaU4e>{gsA_h*FVtB?r&7!n~U z91<&V%VNFLF1=G%hO_G*`S^ntzl;2J+Z`3#PAs91!iiYAWl*K>U?OSrL}VLIHOJ@8 z8ikD7V*jhs82}eRd)`6x!FLsM<gr>;W8d-Q zDD4NuZH9|RCYG+T?Nn%z3*KZICLDsNayjK!)P#0%ta>NhrN6mli6;pR8}3-i6d6PkQjn* z3#GT1c7Y|corJjCa>b`E?-H)j24U*8T%T|kpfEI^WW$Ok4GFlA2>Z}*d-NX)0jpy1h*XU%S`6V$=;&?>AHG%ZB-hg7H}Q%hnc@7pnI zmh!SvBMYDqy+@)}h=W6{c&IxxGt7d}G*SFMX6k7+=QgXyIgAdof1;yZ3UE2Ew)zP(jU87^k6D;m6Z z$C0iwh?_g2GHh7n+tQFMh?J)XLN-o6&PM9|_*kHG6_HlkQ3xpsv*|ct>b7(nO5uDXVl?(1XeG;RSwc3Am+`F!0(+%;on0F>>jX&K^vnit>o7#uXh+hgS{Wc~IZ}{WC$~MfjLG6E!2Rh#ohd|iR_Ql;*$3C~ zU&ddc<;AXMdgGU=6*m1T90()7$)C?bhuQ++fYdumoh|Fr`@hG7q4Rl$)I%NVb}TW_ zePa4l7YFdtL%ruymMc(XTNA~750Pg4b1=EG-*n^C$DOtBSo(a!yiK2NlJAuA`h&QX zFx}&9Q=+pciyL>`qciSeo8i)wS9tzxY82Yc3Zu*5-aEN5xlwFrSv6y8~7x+62zq@z` zs2hc{MqS1s#Qw^B;u2q5LiBK3<;^TQ&_h{V0W#BU0tMhT<;OPz3=QMckcgFQM5Rn z`XUZFOokwzPNmWb(WKISkiTwXBrYY93Z9tzgP!HSGUR>y~YY@xep`@kt6rAobB%GUq}M=fFkK$p)wPWNO9Rf=91DIU#(?oJVJ5`Y`JCJ;5s=l zL&dT^hiV*V9vn$1~Yevdy(p~ zFj@SPCq0P;J*WMaJ3jZzEUX(={ z2{aDDyTq@tgAs#n?3ujwEwYtKx3te69uMxh-QUnW9}nmE-uR5TG)n9f`_%a3l%xv+ zRu`RZ!NeEGQH{5|a6kLhrOz7xe-I*4c2~#S4(oS@arR6>j*hX#w@B#+4QtfvCw2wbPH5H1oxJHo`6;@n_od#82zh3Xu*o|u@N<=m>)c#FZv>(+IwxpF?ypo$ z!bU&;b@??Ig)a)n7hUlm9IC>+Z}cRy;hg%XinqbL$q|6N*fsI`#oAQ6DKc{5ew@=H?UIe21!jftxHCCz0i)gN$n?0?ZcxlPEH{Xwoq98$c@G zA&oeg&d<*J>rG;o%=VXhRd2vo{6^Xrpp%LvwxM0m_BnqpgH-R-WAoMt6R-;3CzqhB z^IQE`EB;3pBidL3wFIz)=%pd-%FU`I{Ka1!KZVMQ5}Qc~$f}_;Gab7N=vj`D_JEME z=Uv30ChBDD3&XZ@yXPL#<_lg2SganZNF@0;bqBV zRKL(Tl_BuDT#erUMTtyM4N1|SB;{Doby(Xcc1ZOKwq5R*$?csKp$ee zn=12_JWD>5*9wy8DK8B#*4(T^cpY5q$ck$l+XBv5Pt*Fp&9{svyA`9wEi}-S=|yVr z`kh$==KTF@vu&J8P-V=nlqD8%dc=wC8GDtS-5YC<}SC*`H1`sX#UbKaX_N;gTiDVYJ;AS z$bjmTFTY?eCT@*~M)pcWtK;@s{OH)H$M41}aKZa*W$X`&TO2~LtZhS>Mr~Stk^R`* zAvgVyUAT%}?-Fhkbq>AxqPU*KS#ZYYYM+x47O@B`fN)0`#0~--E07uaBgo}_=X3RV zCTt55I9$^fl)j_W4(D0ow||%bAM!xTjVZO^j+pVxoOhF1-A#Rp^e#X8HN+W}DfMFA z&s~q0swK9CQrJEY?iV)|mgsPZl;@KIsCIny*)M$fi#<9C3e^Xt9dP<&krP9`_ zb|A%{7%5?tu?u*L(z#@x_#C-Bd9M;bGOBwn3Emf#uD84TJJP#Z(=DJ7-Ki$FkDoTs z30oe8H0HerKl!~-L@q6}-Q-TTT ztU6k-2zH{WX0P*@6q{+F!U9|6HnrBBN-;sQe_ZXIx+U0rOC3x4_(Ehsh@41XVPn_+ zEI4ua-Lh3Dp(TE#+X1I+)$)EwahWyYJeoqT7<$n^6+FQGB3t;T9y>}G`E+%qKR;!a zUlWp@-CnWC_0E1lc+P`PrqqkuP+(smqQzzszr+fzky^)h6VqX9rJweK&#CV*HNN!} z0Q!&idgce+qN@F2)y_8(Dr&5`$@%Kp?G$^-7D!Zo!){(lCYrlJKAS~B_pUtGMk@G0 z?Vg{#t?8u7HFn0g%8fQ13_SgK2LpZTv#R^SXAdGo@uA`IhlW)ldy5sd5TWvm+c;nD zaPK@UYXP?$HIMPZ5!RfRPnwzTi(98~>c*h=>}?tp#px=GuCP1=l%iyp(E7?+C5JWG z!FcdKbKr~cNh5s%+eemq!>3m2jTY=-ib|aAb*d+6p4*_?_3Bw(38}wJmTfDo<>veF z<)M&Rc4?u#^Gj21FiGFIDF3ppCP{C)P znaDg;)YR>O^+?M&fPgw^Hv={Yn&Z;4fFZaBouN$%QfU=YQ)SUmND9#k_Ix$e2>i`X}S( zucZ>)d4qjn*ATif^mfE@B+o!28lY--504;+02}lLJx?i!4NTVWEbj|V0du_OEc%Fz zfyU!KO|n2(f{b2Tt} z@vO>bPHpj9*B?I*LFUDr3|J94H%3y(T>M%p{>lOf0nVs>coYYIDg$Z+^M_w;UHnO< z))$kRpwRxiPEVVwN;u{GPeV&m0arf{P*%E^17jRcSnMd^8w#28^De-(;$uGFXjm)f zw8L`w-j55zOj1(a&p(YbNkx^l{(>FFDu6>w#AB|KPtmc(`6?+XPaH5t@>oUf5-@}H z=GBwMpH!xQvCe{0V6^IecQ+oGZ~5XXr#@dL?V#(Mw6$uN&VM_PB!SBW%zPHZ`ffOH zt}8JJA35w@$abhXI=b%Gitpf1~aBBtPxlVTC9Bb3rjFOjt1BqI<{HVXH_ENMT#~( z4;MvS&F)tq(DJ7hL_Jm=qArrCTgaFZh*FY6IH^L66YEW0hN;x*^=43ljU*Y8F%h zd|$ysaC_^+qmIn9hz3RBBFuCjW={}DG-oB6h=#D8t6}XNK^5bfP<><9J z$XLW=SVRQXZj@Ik6_trz2`XkmBGu;*jDUxk5o!vt9B_v(Y2z=yS4;sa6*^T+A%eO+ zgwGhED>Pz%K4)9O3$JiO-?(Kt!{!KYfUvy%KgidiCq%-M_X3bTN7QeO@gG18`bY3O zKf*5!*NBEVB92BWLFuki-IR#W70n7OW|B3%K=Nfeoe`ASzKx=Kj)--NAe`a|+7xc~ z+=yv>u2$1*MHV)5&_lusK3$|8)U7EPQB_F!P4Oy+7|9A3u?3=81&@s-+(#T?L*8cW zbslQ0D~FpBH(WCvYw?lyyb6|T6DQ#UdU#M0tPOsh}uPi<`PvHi!`D8L83;EQ1e-?;`4QV z%J=`5@r!wqL-32!h51+WT!s)D+D zHT(frWkdKlCxYh>98SCA%FM@k&ut=vF@5fM{yuImV;UQL<)*xFjKTMR%DEgGw#h<_ zx}vZ#^0P-#*34~Yp4xjsr^+8n7cLyy0*5&drS(0Umq50ft)Ez4EVVds`AAA&AwOzaMSB6)F;y=ms|WESd94Ul{nz%u3t!?{U2)p5d5Wq zG~IcTBofXqMtt?cV-oqr5=tZh1pZPy%piKCUmQJ^Awrf}+i%Cn@-EXu z;O&6fBh!EUyNw4Q*AuE~Yvg-j__M4zzzb?*^!&5nh{XRFH@eUJK2_C8b4zYSp3zFG zveUMN9h)P1*t0KeV?#GtKNjTt`d?e-=1!S$Myb5U+Y1{L^+(ty?bsBX z_Q1$(|9`Kqm%5CZjo*ii*XTxST&x^n_0Y6eTZowKnbeRRup8{Zz5(@&>-%md`Xwup zovj#`$cMs3q|6e;pih<#R&z&a}0w@3geOyp-rsjtnnf*Og=6_TNXgk+XYEcSH2l zV20Cfs&QwlzS(qb_|%}8xb6AxR4${5`FMRx_n^~I3qbc@>O^@}e~%x`u*+2FyY?M^ zFbFJSKALEamFjJJe)GS-+d+>o7S~fK*2=otyK)&VwZQZd$}zJg8hg(k375ji4tPLi zr&Aj$78HW+CC;LvW9i8|_M>)Qzo??N_lu6@@7M3O(?C`QarUovxbK&$sC+11QE@k- zI4}0wmoWES`x5)+DsS1=?kx+HjF^G=`*OP|DJ2=wl9SBY8K@cipK^Z zLe`r($4X?d$*dJxEvGee=;wK8@Q?1;Y5gu4e-`I_WocuDOR66%w8U#W z)FW%Bp-h<_3mDw;miSYXB!GlQIkz0|=WJS_dHz8WSwepHZFL%uR%~One>fDs=7;9Cn*7Cgua*@>IS@MqHo($=DNrlN(tsTHD$U`$rAD_2ZL{x!S!=eA?5(gSNY zi=(TKt!BrQ;yfw(HAy;^m}og~(cQc)PZBy51pm^~Ue zt5s`F)fC;+49Oc#MGK;zwbX^ym9}Ng$DYa-v*~ok+`&LmNiEf>oU3Z@6yF&6b=ve# zJ;x#~i((q&GBnOP-_HOZ)<>&IAJ3$>?y`=#D8h&0rp3na(;*nuCz0yH>w)M8TBr1j zkFdFO9l(nXT0hZ$1Wn%di<4CLNnZ~|rpcb}H@1?;AWNmkQa(?Ncentgd8S4Y7 zD=~O@HMN2dRfs+NmLYnXE2tKTT~4!uvr~pydsqGK;J$@GbvuGuQCp5Ej?FW2EAqEe zCbzRSp&mpOLzCa<+=UXU)Pjw0#N{-ciCYCN5==Q8W)7zm7=MAQ{*zjd^7kUTwi zp8#52J{O5Dz6nIAkL+-gpQGMR6t6^Sz*OKS`!T5jQji3@TJmg*ngOR(>GxtjT6SNu zu+(DJbZd;7FBY=jalwu*fE}kB=PI-N5Ag;aS=`@zhVcb!1K6>Q8hzi1yZfT@egG#x zDvGT5+`h4r=3Z$@bp919|5bv*W;1xKV;y#=R|IcC)(14`QPLYK8N$i zoKGKGRm*rX&e?0S?%-s`KS&hPsi6XHRI#}xC3DEN*~Y5&UPdPUn9i0hnO85S_V40v z2R)}`+ykB?!C%bt$_hu^f+s1KBypQuio#DU_<m)~*v z;c+&k$j3^9tjl`x?e}H#8)=K%NdF^WvY72m3Utem;p;n>`?qDPS_4|M17U>2LsqWI zKgL%5-on=P2C%$gc#OQf+%Y+F)l*=?w@bV=1wcqA74>$3aYDlkoNfCt15D=Hp#J}%O z-F00N;JJ=vD}3jdLN2gr%ebckyO&~Dfsz`a1g!5c<4~%ewrE?$?sN#WTX6sM<|>qF z&uw6Vk^0+t^@=N7GwvgcbLHSIY6X!kGn73SSYMD4f7o67Gq1R?I~TCeBZN;&b=1|8h_BtJ$;aUg<;+y9(J@ zb|lqr{az5uIOp8X&YW^_uHup!fF|FZb%@AVj{6xuGNU$PTwy_u&u(4Ob)Gz(!H zr)D31q)2F;>X#udaZ% zT$P_a@|gwsU)%b})R2+u7m;Hk{8<@CRDe6Cr!BIV;#mjH?yPOi?y)WkbZ-XuQnp$c zr1<4PHwnW8`RQ!ViEy|-l3*j~N3>?N!(7olO>=;mi+-|4B&PfWSdvs#^t7s;^|Dbw zgG>!d)FoWJ(;ht^xd-HiP`Ln$hgj# zJaDJyHTedXu83O-Cy5 zX0XWej%m?!``TWx`0ZH=8bdVXidqivYA|WVpGaTXG{G87Pdj?3InInH&+d}mW@eJw z?^(CoX=?(qICiH@2_<7WIlzHqZog00=x~pr;jk&MPJwi&rol$^+VWvt2M)fQEzW=VJf7tvtlxYC!@IYQ*S2p%6(2@XL zDp@RBjC`axP=3``+pe5e5okp$mIEZ#R_*)c8@HJ3L5zy@YTv8i znvkeGU6UA+A;lm|5SclewkV;8Xts?bB3kiS8go3?v5TMfTI@RA`q#&bau2uMllVy0 z)lPQwD=`yf@c1Ei!Wo!TLDpPM?DuR#WB0?Qq0j(d*{7X?5+jcypHI0<#l(#2J-jY{ zDX1sE*~Z!3#S?bvOum8+@4cgfKyF)GZeda<;*OOp#GPA3@shbPDM3Gl@e1)%*PK*s- zuTms_3uZ>P4)lPOgX)~7`0OlFLnT0SC?dxF5jMnv92z{)w((|BE;4TaQmp}L$zGqG zZM95^;2ja@0K@|{vxK=(mD&mjI;wNF$BcP3lw2_9qqe=P^Tac0TIMlS(}VNJvC)|2 znn14taN>@rriv0YBZj{%GGn0o63!_pbTD}qg$05)uD@GA+Xdgg=?YrfqR(;`MN7ob zywsV%O>9SVEY(X@L>~7~FP+~Sdk5gO()}qT1d0-NuI$* z+#o}2;-+sw4+gsRCtV2s64o`%gY^?>cWwA@v~gyv0L}j1&JlWVEb{mF{Aa&f-S;~J z6dGtrvb?o?9|J;Qor9Ev;P@=u(47YZX&(CpkhLwth>!Uaw124CrUJ(=-zlF$$?#!ZleZJ2l(EE6U~^`dw6>W^ zS%0}ngPq%bzZ+xa$l|))(i?i~P9QzN>a*|8vP<;@s6_pxDM%IdtWXi-{wH>4ebNY) ztK3op-p0j1Kn3SI?J2RLWaz4NI7q4bMaxVu$*gzUF0{^vg`0i|+6#h)*NweO?2lYu zIHW@|gj$}=ZqeEz8A?N+H0Ku8YH>2XF%^?#wAO^o!qUXC$V@I6RJh)VD%~g_IGGfgNaezDYyOe%*jnpzr4sx%SiG^`>|Ts z{y6nHQS*LHOMOhpnOhh@>&l2r=S8@q%MWbaS2?379%}1;(fSaZoOum+wJR`fHb9cc6t8>)%;0xyR+~u=$K(^}qs$y4n05hynN1c)N>qI!}4d9IqM+3ed=qL5g zP|dk?*Iv+Oeec}CAuaDb8cM9+)DoNfIyVF}G~m&CcuO5Is7nU}?_FM$$1M?WEr^>1 zEP&barozXm#`cY|Dx$6ns>tT52IteVLDvR*xhENMj@!(X=v1Rj`p0*D(S}g-lpS!G z8Xv;%Iu#Z%o)B0(zju&kPezZOgp-q0_^k2p1j8MsGY6BD(v18=5d8Ci9y``eYUsPK zQq*t5g@6Caho1l}&i*+_p^*tydFaTFp?8w~4(yHAh78Txs43i@h`dW{u_EZ~1!cV` zjAq$hkPaoYFm?rN!BwB@no;kmYf1`nQ{0%(1-2p4K)&k|U;b82G&S90kH4wOF%6}g zbnS}O+d&KsB(B|Ly@}h2Tv$Cdb3j7kQH-Xj;vQ?r0r1rZ_16}sMeDHFy)l|(_Y#iz z=`OevDqtg7yP;G}XIz}`-^Fa2R&!OJ#x?mJ2b!y>g=(&a{%J?l?6c|(U|A%;E+}R2 z=)5Rk9bT&5PW{Nj>fMZW^>XiG1sB){cjecaFkKo9g3a+ZjQ-NLLd|vRNwvG3wq0*K zaxQ#~LE{9cXj{az(NLBr%R%&Fakz_BMP|b;Rrs_pY8cO{ns@p(dPk5t5cZiRfZ>6D z#B&qAV2~g$Pq}4Ejbt2G4ZAMidu?gIg`MV_`%Jh4)B)@7yoaNVNAy#k(L z(sbyv9YBNN@XP4B;DJ}cN}7)|C)CYMjzLH?_MHdT5zBNlU3^0w@`60mg7<=wXX=Ik zSocvv3%6WIHMT4YsT1mQ-?RWP*1R{75Sr`OTy-^~m#ACVJPo`l06WhdN!;xr)@zM= zEKsr7-On$$q{H1oEsbTQtd{hJfTUgUa6Ku1Cl;gIscCP2^f*7~vC%-d@iOFW0h}Ip zU4A_K^3r?+$;g~1I4k?Z%_TMcr7RlQfPqE0onV$fP8Y}Kn7_-gg13v_KJVFy#D23& z_`x0Xn=DjoS9DrO&-{Z2J^6=Nvy;CjR>=22jarbz%HR?%~{-wcie#9``|9z&aD*(nn z?bFCU8}O9f10qUyz3q#J<17Uo;dN$i*ML-b6xEce!>e&5_a%&}*U>l^ygQjWFU;Jy zBe4`b!;K#4in=vaPW5?`pFPqAPZ~P-QpbarjeCC~Ss!4N&QiF)%@EGjU+~@R9}8Hm zOkBYL7m~&obd!|1jRyeL=-CsKVzx5}wwlEZJW7fjDy}{*cT#6Ld!F}uaqg;oP~n0X znN*ZWJDK-&!x6>k+UTx9{g>0!%jqLwX|(-)F|hsdFo!1 zHygi904RjGvPnC1u^q6cLDc&_3ty=IBLFY*y&q%#^U<}DM5Fp|0Y?4P4AAEwOS|AQX12x3N!Rj<$)B9ZoV;p3 z!c>{EfhWgJ>Ehzqq`ja}yEt+*#4d(%{T`>PaECV3LyR=+0i;n_J6L>aEHu-obD?@_ z&j1f}YqYm_l!fzp{2TY5My(*HJOc8n5!yB1Q@j89Pm3XBdwsx@MOh4?lk!kg9l<*+ z?;jnG53U{000>S$)~=)Xg%Rvh^G%5{*fg(0jzVz5+m$^o0R1gIeM99^{Z=!Ri}_YU zVEV^}FSz_w6_wbS^y^h?r9Q|^CB?a9+RpD(&%Wp{4}s0Ny?fUepPjUzA5$DBmyjfz zeffN;1x2fHDEa&$%g93Wv&}2#lTdB>mfgT^hahrB6d^>tx)DxVPY$((S_ZWcl=5~JFQlPtkT2z?7A1p>3ZqME*@q%6?{jA( za$RzE*H7noz6lrA@jLQPkQ?F;)?NdgQL4n8R> z%+=Y{vERPgro+!vp!ciwP=8OzLR={W;3R#a<6#xd1$6disrJXC_&JC2IIqwV4!WR( z9Ce42@~_>-FZVR)rznN)d^V7_O3d6Kwh1eY4zmEjq`wh682NPWVcOg;Jx%Mr>j#HQ z7=C~z!DLRC&pT_hS&$+pI|vxNy&$%Z$0m4r+&QnAWab8_KpxGxH=}WG==n*` z_2@mW3X0lf+DY%Fxy|9I0AM;&2&CTrU1rk- zR{R1!W_&Z|NZpCP1 zkX^!*tIm!#ZLBrPd??vtI8^uw$FgFbK4 zsc@wv)jnDNrOD~89R8ddMhun|c zp@UhM3g=yBw>179Kk72+Jm2%!Llj!K$3as14z{$=D+&cg18HR~eYBoL?OaXxOxQKv z&V2A#@eKxvS2(6VG+AaFGEma^M<;4;)d&0PjXZ+Q*RF5xNe{>MQv>oyzCM&#Z(WTuy zUt9;UuqM17FnFmwCbBMiQa{W2LRzlHs!F25N;SZr#5)>d;FzocOo0J=Mw;ZxXW|*I z;gaz~_0-MwEMSa?MF}PA!h&jx+hoG;IJJGJko@2Y0G|Fd3yRIJR~tQ{v0Nwaz=-+H zKO|`&v~VPO2-+B5JIMAB7xkvD3-`OiHsbNP+AN<4xQJAEEnX+M^$^Dw9^4`R;k-r*+H!-}kC2OjNbx|xZz_Ji>BIMVkl@n9=P zaCP&DNLSP%n*6oxJ6)3gd?(l)fz!MapY(0`)fS&CNKIcjP}-Mdl1@zbM-Q94+K(uH z(r-g(2IT>T9oRB0r)H!?Zy1`%>ez`$?ghFAUi%#2C$3^YYXUC~8_0Mo?Yr1^hn`|^O8 z^Z$R=Zm}p~71NYsEmo(aW4h$X?k6fNrJF%2-P56(!l0-rXGJAy6}mA^cSG(b0GnK-A7T?d$U*B(^o%ic{zMjY9`FK8G;1rIE2WyQ!w>=xml+zq| zJyz-9FM`>NkcYMga2dFDjP#s&1>ksC{jeRX%%=PNS)C~ z60Sr|2!Zqi<5*&Hb1vx!V%!4?#<-M<*&>>Q z@2~pl)|ZfW2qytR0m^wWZ|rmp0ekY@C~~z%#(FFe_ZK?Q4!#;v$i`(f`|-l$g~IfF zX@@(HpPXSwKTYl?462s==gohR{ea|B&0nrJwLu>q%806Vzu(dw;5Tq38UaBqplMfe zVo!)gn#@SnrpPt+SxT?J$Y{cvsQDO$+h1sYXhZNT=q_AKDv5>gBF0z5?#{Sw&KNVr zveiPms~U3|;Rp2`#`w-4SC1T*qV&4Fdvi)j#-cgb*GOV;1*FL4KfPP(MUPGm!4TVJ z!rX8Fq(CZXKjV5=->xk9t`z(+17B~(EOmJSSc+!@#Nrhtb!0gAJPpg2x&`4^7BX*C8P78Hai+iqk|yp3QjY zJi27oSR6H=xiWUnsz1W?YMTBt7i7Zm*qWL$!VK`2k=#>Ct$s!94P%O_Zhv1h-JcJ# zyW8RvMzX}nX5q&Kk@|F&A0agnt(5?nr}IIVSi3}BtI6(NAF z?tY2&Yn528I!%Zd0rVa=w#$$;{#OJmw>j#HFI6& z_)=eT#ys)6k4);Llwj>4+57V!Ywf_n>j(5xqpEVM7=PX#IsI{&Qq$^Q)j#&Zx6h(( zY_4ykyn45BB9j?YG!Zp}<8-OOh z8BN$YR8T`s%bJp=%ZxI1-W=8)xKmgDk?AP$9$#UTbs$YwGa|cm{})wKYml^kswp7D z+G5Lg#mSbKdj1);&E#X`Dyi3h2e7@z7e%T~avhAEP1R!WACxMIs?nrY?iu#F-|S6l zF3Oa-VSnLcAjV>^X^E2Kl*8^Ux|W#nc5uSE4ntMZuk6ziR=lTK+~SM0jL|))4?Yp= z4~RsO8i~NR1~2+3m(coK{k7p2QNwc@$)lf<=1&8QJ1HI6zKrtEWqx=Wb!bC_i#1u{_MQi9_Z*yE$=>M&DYt2 zj1nNr)A*>AwB`l~Gk8JgLZb4h*sGzvs*lfobPYp_W|X8#YxJ|DK>K>dapo3ab1h7V zJi~8Av`MsvL``>vMr79)?I|2>s9K~F1Y85~rxK22_Pl%Nyxf!8cDtfs41c8FKDacW z|6mkZ1@U6?u)rxlDDEbxSQK zxor#Xi5nzmKU2blmq*!rXON8>(jAhl-pUuBwz3djQ_8%okXygry*ugVG|y;TxyK{v8M52KVkA4Y_W(ErZgqTyXuaXCqHrxotgUDtr5vXoCE4F|?%1 zFMfuQ=DFP5m@QbU@xy4s>ePG@K3-95HF!uxD47;(f*sU$r=KPv;leCuRRivu(9=8_ z(V^=E`?ydo4JF4mz^-n1kG?jksliT9p~}50dZu|211UR{sKmnWL9o;uz_dvC*PDd^ z7v~@NuRjo&L2Co^nTZmzn1S9h{W5!ZHa1mhugdm}gpiVrX?%KqzO^+Pop53DxNW7+ zGoaKX)Yv@owq%u->kB{LhQqV5xz3rZDZ(R6=U9S0W7c4=JCEiCc$roAg;sD<5Yo>) zfc&#*T)!eCNE@f>n{ON`AJ6v0L3(kVKEs@yZBUAY>5%}}!TwafJDpZ#c9 zj#r&5!K>oc-U2H)jeWM&c=UGWySw*ZWg_g`?KIEFM_Kj-p&(^J{Mk0}7&?q@! zo=Ac2vXYboT+ zGJ}t5jVxk&+g0*dVqEb>z{YXZ_-8B{$u;b=X179I`h(y|?dx`BK$U$ce-!TJ*aZ1{-4s&AL(zIVA|<7c(lm zKDkrO+{JtQ)EZ50YC1FoUPU!^_I-J>Kp&o_Yjg$ce zVL=4!N%NnVn@G7WYJS~Wgj*Kmal|+9&{{1&a7=+Y2#B51UbQS=3GD*KKnZmD52qce zmSgDDj^^$V^)%QT{E=LbU6l)*v@-$^7(RUBG_FNyI>NG(z6IS_A>}vqwI$qN2LAg!y{F%%X&iR>+<#brzX^m-Sre7LN#_CcD5Qk>H48*|s^7K{^ zvY<2+hs8;r&FEEWDrO3@2k(ugsG^MY;kPrn%`;l?HCDs+7ByXhHjmw4C2Vh9cIAnusl z>mkCeSZVWcIT^sX*-qh!fyEh=iI4!m1P&Bu2=Vm_p87e2&RIj%DF(IP>t zNaN*X$Zo*BM!+_ju+N%=>m9i>czhMcyV;_ezvP5y(grM`!_#1liUx6?5d)D z*WR&^3kaH)F9xy~spyEO-WY`>ciqmA&0BE>bAL*MhjyH4XC+}b3x8P<;<&5xl=nEk z;HY z`auzcZ2uh+VNlK1BB)T)eW*WrBE5h62Hk_R5eN{YJQ_T+blv`zZ_iu?y{o6XcMVch zJGF8uzbd3iw9@E2xA`I5i=&o#E>mKHx*@aTWmstwwOby~@q}R8vgEG4{hVlmHzUj| zo@0t_aAGUD73XMOs0rpQX&TYX);sJaH|r(7#)H~#xv!Pxe%~p|hRTL?aa0hOoG_ew zB0hA8yo-NzgZ^GJXBRQQ!-+eB->v}Blk{K0RKHcwmD2RxZBsu5A+~hp-X{=hJusA* zU1_d($Bjf#xD|k3@rSxFX;4JNS7kgp2+3$`SUlks4LQ^I&)}*mr{n1FHl_KbC!^bp zd8-exE$O)gOJpbSAf;Gtzs%CJEt}6LgYYWH)BeKDP|5Oy*W&g!$PsI}{mq*;GdM-| zw@O7pUV}ERB)m}SeKJx1JJMER*AfVA9;=x{g`ZfnxMtY{4i94 z`7#0rYdKP!p*?grX1lQBZLJP$hSWZWroJ@6k*;o%ykP+~QC1$f)z=rf=>#f}MZ~V( zmL3<(qd3QxlY+Ta+Mcmx-$_cq&92kxBFCP2H*cXux=0@j9(0tJE{G$ia-VErlY z)3yX1@kXL5vnE!`_krf?Cy=vnIBDri51pZFhVp(k>elh3<<3Y;^+_t8cSl~tT`zCT zD|G|!pLYoFTxJ+{AminSr8pO+zQ$Gtq+*!9vyL;@tDG{TZ zngl)HymDu}TaixkL4hM)qjKON?_`LRSFX_3@64<*-%np(engpHHo{1Y5h2vHX4Y2))Cy ztr$Z4q)2zFkLu`>z}{z1Bch?qT6?4n@5I7)BQ@MjkL2kY=sOka+EAipnU*Dg-rd5# zGB)GL1bk#UI~gA(i?L+hm7c5aH;&)JHAv3Fz870r$m<<2-#l8^UFDWtdEsbBZsqHA zVhF`^xAfHkhheYs{u7fFwBx%XRP%OtFUWtwe~|aSA9FRPDz*26s0)Qrbte6yw%wrL z30K~61bjstv*h9HdCqAQk`!ts`QuGB-m}d<8F+hyz995ikK7+(RndVIRRQH}Hsj|X z%SJk(ux0(C@|J^G*o;~m7$yK}V?WP-)&U_?G=X_y(6PSLDj2DL7YWSj&O9}v_{corSI1#dm9 z?nA#=XV<4Q?|mM4H0@Z&2d76#tt++$OG$53zsTS0aY|n(r(?rt*R!1be1WlL3Cr~<`EgNx!)OdDxe)S)gR_$eR)Hw*rkG&9EKajPDXSQvJ@T&t8 z%-!w6+tvcL^wK-N2l`o`r)dRC1Xfq5FYgAAK3r@VpNG8_O;}`fYbFmVOBO9iXOmi2A&Si=P@* zJjI|wv5={t^F_W6mY!83sx{aot`)q+vcJcX{mai_0lCAedC9w>;+oJExJ5Z>!4gBE zfc!gZd86yKJjcX+*lP|v%TlzHSwH;DVhl(Rn-T82R+Ky~Zg>PK!_(|vuhmZ=n@g9` z*$J`?g!eH{!mqLlM!HmeZC#a~CA_bP;^2?5`^5#s)|C7zVY+t`uQoZS?-|G3XN=3a z<OjRX*B4u8&L>IMHhQfWj_&dmC+vyKvuF|> zd8XxSyEGI^myuG;%!h4HG`)8H6V-C&KNt3(0-rauxYV`koK+aXFBQ3#7qxNgL#B7I z*8Zp@s6oRHAdFovH51EuKWpHWJ_OE8P<-Py#Jj5(>h!0;5Z+yXQKbD*(Z{Y}|K7l3 zoM@ktZ~a&2`+(rl(F|q#`LS89tIy&Q^7r7alR27ZaR8n4w{k|S*mms{e(7CjPo)oo zDLhMuPtqPO$sK(fyi1&5)E}v#hMi$6!CI&#iiPqhT}JLTYC3|9LTih+*S8CE-(i8S z0);x?RIOpeq;lxXwiwR;`K7hZp{=h$`)bhnv?3^tqtlAFpb7hJ%g#H6v`f@tA~l8l zX|eMe?q$}tW#tjYj~oUYX*H9R_yw5+<9jAy4J9iGS0dbe{vkSUAGNIk#i@v6AyWt; zX!XxGuUkqmqHf$@w56*0u}})q1^~&|B`wFKlt>|sJigq}?4Qa1|3R+pOq_n6anx^t z>zB93&Rul?iep?ccnL#L%;cUyW$DPcV?QqN3#kBZWFBQpIa5i@>M7Kl_B1=-nY3|I zx77?=qy-&R?lKEvX7Spo-_qh`LP82c>oYccbfo_GP62;Fwq*#4LfdKFl=p$};6ZSo z&8Ida#X?YEk83PFjprWyj^ae4MySSIrfwK{?88<0`dg#O#@DDuDyidK&!oTIs#+2u zwzfQ^6d{>~%+kUd*w|&N`nJiB4x02-#wz|#Ccx0gD|*NY4Ff_kL?YYIPxD$JTOM1p z)NmVQUDocMjlFrZ*)BkY>&z_Gj$nFbY9RFps*b&(K0WJ&$L zZ3lr=2t4Mxp+IwoaD002cv*>^D9M^Tcs+`j4~(=YH{srn@7@zl-iCi~;{ zHoNjXWE}RJdg2U37LVM?8)%I$tB?^UQclsTam_Pms|iOK^J>dZ(nKsrprFM#vSaUf zy_dKYy2{Gp{G{8ASeU7eHUGI3bQYASCeC!J_(C&;zPs-BCE!9)@9q>nq-uogXs-l> z@xH}ihIE3Sc+%2eQM2^12g{gGx6Lx5_6_p=7hLFrNnu(3`1{np)R)>N$U~3YRA~O7 zQBn*=tJ{A%dTK#WA-g>3n&YcXdB_2h^LoQAP9DkfpBY2is9#j#!|jvyIWi>ix3QiY zN%B_|;V)`)(d{yc+RX4{Lt}nIDU|Fi>!;3J>}n6xmM}|zXe&_BKOri5;5#lXSAxKE z(9FZz&uardV)=%Z{(sz7sw*BCK@_%CJb>RmsQJLS;~>DeE6@LQoTA63gvi$zQ|I#^ z5O3l`cIqo5E-uAJ!m8uetvS3mGiWKM@Z=KHv$7>?q67e}_Z>X9>Ko=~so-u!@fm$x z5_tN)lN+b{@fty2vMTJ-Ipwhju}c+^nziec+>;#-g#7{s506$mqt;_2o-$9BuBMzS zl?XF7Iq+KInz4zV^}JXjvr@#;?BOVB5cunY9a~+heLPCzpPRaVE_*5kcdZ@=Y5xsv z7itfZL+2eP)~H}lE$911uOzf-INyOS1^1Pw+F+%K|?4Vj{lS84<4q?W? zYmRqc!5P4MuN(EZW34nEC2)^W4;`hoyJ`JXD{p+$RgfgyOEzSdOPDU3wa1p4u$1hwR%xx-qv7*{gAjf-JNdAdB+Z9h zZj`LREfKrC!*an`ZqFh7(4w^%i86^@%dDOSosK#*%tPXp507n$&4rdUBl}vNc@c@b z#kHb%b(&bSI9hDRuM@uF4&i-R3#%xNZ@JV}$4?5ilQerg@frJC&k)w!;+GR5;pZJS6*}#M3EU19_WF>1XvNfOpbDp^ekBd7roS1^;3TO28&x@xXR=?c}(&?A8%w9FDD%!kn3m>uIcg&vTh5pxmk$1>#%eS6#PN`ujQz}Gl z%H;bZQJh*MIbKyWLc93bZ?mfHN*gq5i`v}}r4@1WfsRqiwABeSD9(EH2x!ya*s%?U zwe;M7W?+oA=ShvSp75Jz*P&WI5^6uDo+dDiDZqJ9l}%35AL?{_L=UQ!!nv-S>4*h!@It{iRCuvo7MaCkM2^V zGE}3{jKbc-d`fT$rbHKmAnc!RcYN(2=k(R)gvCUftGkj$FAr@7abE{xu1^5p-$-Df>FD4rio2P)^Lsw@ua@K4@!^=Y;t~ zZspl8jkH`Q=2R#>S)W|dZ6l2I6GNQIPfYECFW7=IKZGEq3B|SFDmF2MNC!IXt49}Qr%)!k zvzX{mJ1|Nl9qorM9YcKb1wt}PDTT6sEAu=JNUsd>W(&pUK{MXlm!N6bx7~`TYfH7Q zfBcbyPEoSpOuyb5zhUm}_^EkQbA?EVgrR{4GjeCB>&hwE#MJcf9z)2Gd}zbS=&cXmXD(N4V|?J-z@rkBI*CO<@7in$4 zA2-O4e$-aqAtHxB;jmHCUIS2H96HoUl&_bd{%mr79yBnVa=*S~v!J<=Lw!2XTFwh| zY9$Pnt7O*%LPI{b>SFZ&>1LhZ6NB^^H7J(WIb{&eHJzr;zGFL~7=fo8Qu^d=XG1Zu zIa^FJ+Z}5+!|z9i zI3_5vD<`t3V~jS@f?f-V(tI}cG^?5KL2(KVrjIrmJ>TgN*>Ci`d}`zS<1w6o?ZWTZ z2}BVIeX#F(oPgJac{XRq3@cH0?ak4_q~%-Sv239D)crF}`kQqcsDefx9u%)L@I z*{PIpa#pgYr`Kj|W>{`ogp5G`r@xRxU<5LxN!W#VOEg`Foh}ZnTpzN}WVAS)=;Bkl zd1@!^A|@|dUZ5++O$C9Ck@JGKq%b2z%xqgjg|PF&;HAAQ4;zg{#Zj5Swd^b1hU@HL z1?QR<@plqHkCxG!$DgbyL=2%D{!n?_@`Q6BsRM_*9x3>68;PXmUc?5st-yqN5&H50(5l3^ zU-GmnkI(aLb$sS|QQ7UwvtYd1PhyDb5*cWyCywMDUN2PrEcH?bV+Am&x*tw$uuEX{ z;!DK^5%Rc($yY*$;-MrI_C^&>>(_fbg!WnFr7AfDBgf>jQ`kx829tGk==}Qn;}yVo zV9z(M)seh?OKoFuoV1VIlk|xG#SZP;`3mck17n9+*gtdfhyI=_L3?;tKD;?a?yjPA zD7|EYB=v5fw+=bmPJQDbjm3+u`DAj8fwta}9L7U<=5&^#CobjD5~;%d3+i)ImGH^q z6Lhy2Y1+Ic9_leozGC2Vahb4;fS?C^sjhq|m-{yOEtu{2`nWart%qeDc@j3w#ij;;SmMD#x-X3#NWX8Owcee{S zMxxw(WmazW7z1})V?A8rVl;Xe{m2fqac-Bf*Bf`M#dph^&r0W+MY&y2bVyXOgVtsU zog3((lXw9*UUjmtX*ZH%d(OHPt#6w9fQj=f&aEI!n@Sy%w}dW!LFD;9K?(7+$!wE{ zZYn=;HY*$^bVj}?;SPdi_3@o`MntK$WLRYGi;K#~yLy;M?{lx;;UWF|XO)wfp8NP% z#V7Wysv|_6Tu`^v&Am8s4q`D$4WQXE;$ZY5uQMJ%N*5oF*U6#u@CT(@4RVt3qJ zFX`jTQ*|7x55LDu;Qf!k2vQZxco_UU2Al;A5XNw;?nIO=%Z0z|P)y6zZerebb`Pm& z?R}-hlaW8Vd8;f2;XMfGcvD_n;2#Lzw56c(ncl0+epmj7Q|3|=RSxXE#dPmjSg*!7VN%9) zr#O46AKNf(&7ix0>HZIU4c%%4!&=?Fk+s-<@VwoN)h<-R^DdoKB{NVJ{3@9#{1ZnKgrW07_sX)mFU;7~*y9 zE^Jhh?3Xv84mKg;;QE$$S%(XxjV`;1k8qKP7t}>e(>SJdakhXU^_lV;j~0sV1FpRL zFL*?tE7k@xa!1(_wsf1r^QKp-&FU^Rxws0XtsekSpIxxrTAQxseezv&Rr%p|aXfLBkqV2pb_s!-5P zQFid|qo|>XCvB;kd3WI$Eg4VgHZsX>YJA8m zVgqI^eeE;UBaax|^M~!V?w0A z*oA2rm&(16G?lSd@uj5c19e>(M6sw8j%LIhswXIkjcV+2%; zQyaK1wh9RCG~=vDzE=%hUA%h_jJQ6ksJ3gDiK`pZ+4YZWwpZ;i@EPoV!RuqD8Q>6H;n4I@UJ)G%VREs6?6W6%?U0Js4c|uA_eeqDKimSk@dM z60!=@Fk9e0`1RpSH+-TYOh7J&#^n;a<~*^?bn)eZcLMx!n2Z>Ddz~tt-JVs;j&OPd z{Ww+Dnn8sFUGlt1eS0Kh`NJ>Eqby8hq?4=ebUqPqSy|zhIxhhELZ)d83%a=MKWK}m z5cPnkq{c-$62**9E57MEN()x3z6{dMk zhxo^<8@b#zRG-VpHhqbv)93RGd;AZ2@xX=-kAA>a1H&=$lwprpMR$}~XD)iRqkQ7~HBf3-%pmeBekw-09m%kTyj=*XrNi_Jp^i2l=mA z!KA494|igN6jv(7g%@`{Hz!uUjM=MZbx8wbvqiks5A@Dzi%`A_x*Sa~RI=Cah@K^L zt{qx%+XBqCN83 z6!`!Dj8iP+XQ7^E2M~`Cp8D!=sJ&GS+*v#8iz2k0Zny>FwsDJM%og3xfa$P)GwvFB zI9r3Zv!4}Q3a`-ehmOORx82UJU+8bRRdq&&ki@>IDmlD9w$nS5F+?AWkmQaMcDE75UeI zi`WP$>&A+o9GuY6O7@}6eZ+K;eL)wB{FO6o$jn>I=voy&WWrlLGiInr-zYuZA0T3g zgCCw+?NZ$K#Pwa=gmvjGrm!#gT0>UN5`TowTpl<#kCaR=3`?o_&CXAvWPRXVs^aN@ zxsE-=^Z@_PiHnawcq-DzSpcX)YB&<>bAi)96d`cvZL_wBuNM#4^2?U1x0aVc7h_SPUOPhx7Ly zUmYCiD*nrr<3w*uK^lucttZU!m7nSnJb?y(4GH8I6HIUizjhS@{PP<&3F3+WPZLLg z&wPp+5q}iRPA!HmcVD|=KLL||FD)I{j zI!HzpBhl68hZqW2hDW-h&;1j4Jl3ujF!F?8_*rm=#mGTkgErn&e{wHiIGt>{m7u>H zypaz$ng1#<1_zyS3Qa6gL@OF$aSAz9?E%Xv->R|M6zH2w1eA5cp1ml|_pjW6DAfpL zuvNtq3Kki{dt?QIV%S~~sjnXRKmm|C@2`Rylr00m5=*?yW{m%Ta6^x{&0c*@Iecq` z7!&xjL|>*qU)iRXw^^2-yv(=Ty2tL9=HE-!g0apcg55_fv-nvy5Koh00e(*_+8aY4 zTjap!8lMiOLqBZ(#yMg_mZ1YZ&9;NRc`|5ZgH!mB{fEHzJVL%*DLt;f9UU0Gp#{5h zvQoJ!fUc)#!tb&jSuaYeTP+?0Ga)_8|6UvxnZ(3`@9GH0d|WV5L>-20if+u3)>oAXBn<>A}Quf+F%$&VsK*H3fnB-Pj1_V(){O> z+h#c**((^C`C|vk@B`Kp@Ga!u6Q}9P)7Fb;LHkwe{xtX&U;JB8c1AQir%NVCHGp8x>@ZBsC@kUvZF7q(2yV)YJTT@S$#@hu|Uj~&7mx$dqD`|}Ay=~HI0 z7%kH{v6+9jS5!HWGw>U|j>N}9ZqD~pJ? zC&lS(o8_MqcWpIH_U7aTS%8*d3X~pr$Jwb)LjfzZvlKgg3{YCp$PGE$)<7~$8a1U+ z0ui?j{(*FU<>$6rDva#t8lqIEU;YJAq1%PaqNrhSD!$`WweQkLc8HCcqctRU&J zfE;^Muc?W)pX3y?c0ynSrO-0$acM&mBabJqZf~2V_4qlIgTpW$-@$+HJ$iSIR;YaK zgW#n=IN~ob4vYNP6Wdi9L%92TLO^163hR0a6)02GcKiYaz5|dEH6x|l7>+@%7Cm;3lnIA)u-r;>umM{r6jF>FBtW4 zQpisxCUqIw&Q0peRWPY>0|Aq21dDGWF%jWl-?ue9B1a9YQ}QY)fw2=O2xFuaPF#WK zaKR?^3-aY${@+CExLb3$LOua?a6amFGQ; zWEaG(flxm=$GOkyp3q~q6Iu8yG2Vs&Z_x3D{|OTk)J8)STK_RT#x$w2kC*1Z z6__u`r_IztLNlEJ5zVo;SLf8uC4a!`KtPZD`?_xkFme_aJdme;`oB3g)+E!4h!ChF z|4x-D!spLVh$sML%O;Hv#9o`HTaG_UwRcs=UQHL3D*x^;t(wYgusE>y6TAQWpghJR8O(0F0F=UvN0xbn+>~JDtbY%u) zukAzS?P~2&joNmvR1tj)zhNdm@IkK0!bvX=nU(Xn+aLj^j5=Y;HiGpDnY2C&?mXd& zzgQn?p;nllmG8#ud~28<6$x8TavGQ=imPlhN41 zpKC_W7E$Id{f1Brfuc^-p8r8?c{$}(-(6x2KF4*C1X{aVZx(TVUMxkCRX%14~`3Sx~x!*y6GXg*k}a*pj*$ma`M9*m3L z1Z)v88NH<(5ll&89%8*5A}BwAD@-x7`f+aA$o8`aD0jrknSedu?DYQS6Ns%bT0T1R zCTsC7PRN4#dLxuOqJzKo6Cj=aN_sJ;0j^4)Tacu-q!2CE;Sl)bcCAucyd7Vif5WF5 zbx~9gDEB;Yk3AOgEyo@xQZQS{W<|cKfZO`jLF+wWHxE%D4BL}0hxI;TSYsjjEyD(h zq~)e|RS=!}&urM5PEyR3q6a&X*j|WU{aVpMJh5*WlmFTOPY{kE?fCx}d$J7_(e&ti z^~@_-%cZN?OM0+vbfdWM@Bw`L{mH<`^P)&o^>&F$SsS{8ojQ%@pq`<*kP=mLk~ki_ zVsUSJcoUJ#t>{^mM?sGWZRNkL)92LD*3cub;nMZ0r7Ejf+#dd~>h;oQ!-;IGAjLle zM#{N)o23{LS5+pQ#TYR2^{;+U)-UY4Qe73kta5)-_0B<$%?_UE770W_MIeNsdihL2 zaHkvVhak03kuwS z-1X9Z#dG6de=qq&WF?RfzGvlqRI7o{ifoV z`Ljf@7}%JuKMl)ROUi7eQ)eWlyy5kx$+xNql#_Of~O84Outgif}mvDIN0hD;iqQZ=;1?gduwA|3d33HB|Ed(>> zVeOs z^!5kn1c~_c#z<@UBnL?5OOFi{h3@Py8t(Md^3JB6Z?BUcZV(RL$@VBOUV0=rXvoE_ zqs?+?=m|g7aN7URSZHpJxCszo)`mowxZ_{$QteoLH6Hh9Q-dJ)ylBSa4Cyy9s82-c1qt?1FJHKaul$__)W#!AY z3LSH&tXvXS?xXZv3Vq#0N4V>JX>aBZnTUX7eTU{8&LXdV)w-P$4iSK2v^+0*%Nt7` z>wm@>rD`qD>JJ{%8lzKV++?II#0P0)`P`x14;_5_>l{Q!hQR$>ZraV;0YOZcW~E{M zAmy2Z`Vju1Eu(K|g5gpz4%K<_PP~PZf2)O>`z4LRX7_qDH|H_T03;kvcDyhaJW+V( zSO4!DUSfm4k#jNE;d^7zyPAr3Ln>vl8SE?0#1V#ULoP|ID= z8km8%mAOt`ue63`-5T_BhJFEJM0;ClzSe+r?g0(o&R(C!D9D!Y!aA9~f;9ju%iELI zdviyvn-?!fRLXRu-Zt5^j*DTuPGEBVa#h{*c4~$+k&b1zB#cB{%8{t-&Z z-XXO$IObUNJ_gReS+K`7@3n5Dmmbdv55FYx*v1 zYG$Vt)UUyPk@Vn12|rlI`lZIi!$FpQ`}e#&YPT^pl{`}`bd|Jj^elwK0yP+p5%uz( z2&2Z~l`N1iLA3)fD!(yP&IGU@L8j3zV$@mXI(R*|^v@%)wlU7&z}?E-Z<}~!U5$_T zvH_3%oR8C>66avP+o3UsG`y)kqjJMb6OZ7|b-DXsC$&pE_l9W4A z47@ZEf3yhV71J$p0dLL7FYA^?*=$7TZRVCS;;LZFfP(c2ZQ9R7?oxb?)Bd3QIWrGz zOiJ8EW{oWJ*8PgXab|At4kbS^*6D~nD+~@&T;AG;*`!pi=|d#Ctq0V7_x`|uA8;6@ z2({Gpwr}s}nC)63;LlHI9g2{LA19cR+vLRGzRtMrKG<@=bl-gF19fQ0U+zGf5sPVjLxc#x)H0d}QP zO{*MZs}q}+Vd9k$CL?$(=Q{rs`Lz;2F_rA-9V)aY zPA2`b*9bpD27k=Oc8dWpD<+shp*^3-w zDRGSvhaaZteo;ac=m5L9`s3$TH!XS`i)GA@g%y@{9kP2>-n}TDd$R3JE>?2TXd&7W znh*f95J<;9rDX_B1#7hd#n5BW9d~14MYQ>oH-;tcN_XhWvD=G9=Dh#6%!6>H@gRXXW90 zM1(+#*j)p?D@}(TuHD@es8{rCNBuxOu055)eqbYhwZwJB!7mXiRbDYS`{T)-^~Uut z)#@eVi8aRN+cG*{rzlX$Wu#4TDbgQ>syNK6J={H^YXngA={MUOBrADbuhiD1YuqS; z=>3(eCz;?EYJn6+54EXc!i<;vow(*N(wW}HSJQw#@cH$&%uO}0_pCcam(3}6vYH8^qifFA?MYogBUBeFqdCNe1X#i;vjpF2)*9NZEkz7SG)Q_GNkd1nPK;p?i zJFsXS|4GrfLC+elPSxxG#oYk53b?j?5VI}_iBecpQ%OPFo?M@pQnBVepQ-v8;WW%4 zO+?r8#ZXY9@;fHXYVd`TvJ;)1N3~t!6d9vr9|Q(w!GD+5q)OL!uy~+) z-z&{Wpa2@=rFfaXf{+;*G$s2vbiWx&$I4R6Y&llSo!!t})t}U;|o{%k) z%G#p$?~Clqlv4x!0+a(Faj7lbrQhaoB7vAy?5yQrmq1kG?>1oM+;~HB#tZg68_Es_ zb99GrkCRitD#2p_Q|P^6R<#ekJabkgU64F%y77j69%p`hw_77P+0yQjUFl^bKtcP+ zDl0nqDjUn>AXM{dm-pj#{85_!lvuXNvo^5-BI9fM+U+q)9enKWIFPMt;=^l$9voAh z={-gqO8;_M2aBj*7in%=<&Nqw3n8}#DN}721>K&y9*hX)_ z6os4oyDj!H3yl=qSg4|vw~M${6}*|duV##oM*$3~`ir%9(<`?q7x~{p#pCo9t=9`$ z@O)C+m85JRqmff@fZABpJwG0itQuoYdkJ1xmZiOy-BH%Mq^p7ddSQ94W^T~Y49w$U z+=BXdDjFHmHYY4>=jCzo`zoMK00Eo<((n%KoO?rPaTsAG6yQSiRp%S!|#s((|zRf(*J z2)9UK9$?pvER=(TjD9aKuq)Nf&X=Dfhd(RMffw@XFaG`;wu~eFA!%;$JJ3d-`w%?Y zWQ*Ef&uxRpDqQC_H1V3(^hvP*#V8)>w zOEIZDD!PyFKmlR_GzMQr9nndc@OHq{TZ>YW1~1A2?zb6p&+J#ZqH9s+8;inz)!Ad* z!TReTra}*VKvXScEvaB{NV?oUgiB-vvXioodaVCj&&gaq&P#R97p=%rFoL_!WuM zJ*p(-B#rW1pG6Gg=ev{Q&UR(TR{q&UOmMMSvvjJX5DwRy-@jw3qrgSE3|Ch=H0@2M zP5}}Cia4eJ#j&d1PW96S^(<437r2}sBPl*)@P%DVxjXfO3u<(xP71lCz;@V*>!eV; zP#j3MTYD!er9ql>91#4xiVm6|P2Ft1=R4JTM40J_`w>H!J>TBW`Q%+o9lc(N5WUIl-9tSIniaVrTO9IM*C--5=CMe^_j!QAMsM8@@{%18uXf+>Y?&Yz|_DF2?;l!xyZ zyJhI|TNQZrG&>|oL@KOKoKQkL#CkQT=cpvY}(nTfLRQ_)ct10Nm8UxG9x;ZV| zg+-O^ixAcf1ztlL=X8CcF5^kaR4-Xj-99y(K-7+aaYRIt7c`2$@JmIIGD9W3{(Yz^ zX#fA^%sROgI|blIbj$rzxkK1p**;I2G7&sYZuIfCIQr1@67_q&d895Q`8>c!)^Icw zM1nWuc}vg$a*R@}Q&LE7?NzeiN+ZGCVO=CPNes0~dYAo`;$vAlw zZZAQI5S-aM9P~wwxryUA1<^$wZQ$idv-4Vq)W@CR-HL-=Ds(*r!xl0Bo&;k z9)Qf=l**B>6jUHwu$~h8m2)2|MxQZ3VN+|^RIQ>)22U|54G$E+DQia!T`;^Q^C9^8 z?dSwZZzig+}?XVyc#>5nGK>WuJDB`x>JxvjSrT%&3pXqv5=L%gpH;H3tr9g*hR-txo%qdrJF?KAea}h^< zxQ$g(@7YZn=y%Ri)$142d1HY3tasVh_y*$%g2{WBep))O=hvS-I^Z=PX!p(Q>wKfO z*t@&Fg01D2rJ1zxf3b|eoeT-D)f?CNd>FmT!MEgBaPUpn<7>>VwhNW9MTtv_KkaHX z)gk^TV-_NXuC4R1Ww@;{8}{;MH&UtxT-HM%HF?jGM<Fw=z$n!jU z%cWUwHgEWSPJE4qFIh8b+js1!0+0uDZ1Z!oeyM%uaa7FgS(TxKJ2cK3Pl_YgaOc1c zVZ&Ah((T)l`{pjP@fj|zV4HjAeCK$;MjWEBqtk&96+cL$te@MaL>Wle-FmX0famdn zlu1Xb0W_aP?MWkPrOY@s5%`3(QH*?$@)I zC)HF!ztZU~TE52D0|;T$Z)>-mNfcGq6VG{8H{XU~?@$|bL2cDkVm1Gs-Uj=1)V$|a z08swUIav{yRYn6AC}{AlnLxZ@T)*^CluNW9>eIG-U*mS~foiS(r%bbXTw=Ia<=K7H zBwVC-Fw3xIbA{r*C@|u zvG0-h322FIy0#e#0{Q+UzsK8G?tibG5Xr!D*}}6~1!oO#r_4WU##3OoVaEqbNKX4W z>e-pQx6cLmdDXnIpr{^+tiVpoO!@C07KI|ueb*EuRCW&`W}mYsE9;SEBV8)bPIo;; z6gp0%+&~Q%wwezeUoqyY#|c`{F?UXr-t4l}$~rjoXad3U&b0}!>#IE(IIu&7KbHMu zCnkNiEcqOCL{Fn*(7ev6w0y+%J!V{=+VGoBwt-h*N_KdGtGiAWfcVS5XRiSitG(Xv z@=V&r6IHQaujGP6g8K3)s;3L4 zM+qCVM?S!4ypFvoB}~l%HBlDejrPKgcc6sq`%ZOP3;}51Q;fvM;VO1~K))@v#pwKx zQQ3ylWRiES0ykCuK-f?>%8>jEy8YR?y^zYgy^oQdtzo6*#fs^1uru8`4kLY)5}!(N zLBupEHQQv1%uq65aM*6mSLU7j%ju-!fN~4)tXZM8#2!UGSzCH znfW@E5?xh|-GpG`@Cc|aP!u;2zL|R zgt#~*!SwsVHAia`EsG;XS8f_8T(6wB@##!~1CIWG?7eqfljrw7j#{M_WoRYHbhM7j z5C?mRqgWSOtRP!rML~8TtR!kg&;YSj5zB@OS`~;IHi;+@6$F$iD?kikCWIA2*7trA zz|rxpe*FFMd%fykt$CR|&wZc!KIdHLoa<61t7I^!ka-zdBWei>%lF}L54&Mf&0^c9 z&+i4CY$+G%FP+CSqX)BU89hB1&rmM&WT^(~ZO z&G)tPjU@f1Ag5#JzhpmaQBW#Nu6fug(KqR>?Q=Z2>Z?1^eL(erQZcu2NOx<4n%spS zC2Ij(cYRuRfiN+-AgJehM;7aP`#YK+e*toQXnxbUbqWx4qxGMAvb4joCj+Rj>nUa( zz~8x-%yUKYRpBcKYxn~EguovpP7f;MPZt3OJ}6{)b?{|rHYyJ-G%Yi*>A>WXFoD2- z&2SCAS9_2^Zy$!&5}Yp$TOB53HiTaw>Lz{%?9BF-k!eQ8ob8(YY|qUU5tB?Vq_)I4 zb{ubA2I)!w5Ee{IqbFcyQ+6+$Uss*#uHI6#oiK?83Z6IL)?(i)6iPDt4{d*jR>H`C zG=#kt-FT_|Lndi{7dCMOwT}Y!cFac4T%#5K9Y5Nf+dELV z>n=sxW!c-G{7uYoMX7K#x&3KB$F6>IN9Lp>i z6}D@`*@KPERD?zy%QL$CzE*C#^2;)v62ULJ8GIp(gkV?I2>LBCtc{926^UxkeX-Lg z@5nN>^S)m3%++_JWjKjz;pVNQ@PrMFBpxkHl^37n{KSA#VU}mns+w7o9hih?Y(XXL zNOE&~C$;`Fbo<_j={p!-Z<$p`QDNNhW;u`#9YNIIAmsRm>??mRVhJ3S?a2rTq!>32M{l>Bw@mtH@igm*Eae`bflAhOnLS99`$WC1k}_`RlaZyj^PH2FaeM7g7X=jBZWSJ&Z*Q6WeMo6( zg;@qwhUP&xlb*iy?g2U!&Qiw9mi)e8zfiM8FBs#+WXpt!15P%B*Q`vo?jGP!1t;St z6A$6$yOMcxv-?OM40f|y)Iu8PFqgNgZP^|K5C;E*n@SeCsmc*Z^+rlSR&UB`+Leq< z%3-Z_>nLT9Yy7IrD#Py*b&E^_TU)j_Z%hjH{KX?|RiNQSp;Z?XFusV-^z+^o3Cy*3 zqhALkMV(AB5j7~XuGi?d!ETnDRT?t}dB#j*d{cFeub0k~^YMIrlXpmJKW5p7U)x(L zzB1+fh^mqTr~ZC1t~ARkQ5$xN$wTzNasmujt{3xprHC+4`}G1E%#s|m^OfC}gT_`) zNY2^YbZ&69qt7bEb*$Y7L%q6m!3tsu_G=yN-3LlN;ZWDCLZExjp=%6q<- zE}DDh)kxTKyp4*@gQ)gZlYc}T_n4^5DqV&u-QW!%qNsm=!VMlI2hsai6}H4#fT!se zAhXr2$@jU-Z%50}u4HGz<&L_v>_BWpt{g(d$7@cSKA3g^L{K7UCM;D`U2^4GYjr_o zX=6d@V*-Ocf^3yy_a1ddJ(uLa1M%I3&4>;X;l#f6|VChYkbg5Nejg>qqD7okDiw|MHKB=Rj^r7Z2!os zq%(?St0{}&g6h9`X%H{P8&y?5`OQF|uhPryo(_E={~u#}-S6a7LE1eKPFQ-AsH=ap zUPFeP+1gk!_`oEuNxkSNgnVNK;vEF*Bcg)MQ*C>#6st{vp!kUuyQI}7H^e7L4CyX| z;PMP%MY2QtnlylEyn;VDJKPMoPVfxHbf92fIrA_KhkECvR!A3dlSIPvW3H|0H;EB8 zuP7c@ zJK1fnhk-XR1cAcy-cEn#;=<%ff6s*86qSDwhi1|u%=uk(aw1GT;{Hx_;Hc=L5^mbOo;!NQy87pH~KFU6g&UME>7;pAjVG;=Dd}sybw*ZRk2ST<0pj3v> z)1Equ7>tJ`G>{@cr5=t2?Cr1M$;~1*V1t?MXW@A|Wy5msboLA#_$ODK{ z`Px^O-xm9b_a76`VSDqRStj?Jw_7I2c)*{;@03pFbvJm@0E)AfRxMNKi4lUUbA&ro z2;Xsv+cqK4@0AFg{$;fkyVGp46N}Tqa4TVCekD1K{a8lFp(MqCJ(?`ym&bdPg4rFR zEYp}JG2g1|Of1Fc)hI2k*$^fYaJ0?~r{XjQTc;Z>o13)SY!-W}B@lAP;d$m$iIao(aBfaejJs59j@Zpg4C>Ip6)62$ z4l;-3S}?d1^7=VP0N7W!HnrsEERvpx)-wHI`?iAZQ>iR!9xSS~2)Y71K{F6+h`w_) zsR>#B1$`7piKN_%PE^LQ`U3rP4Z$V-w=v4XohHMtfn-$w{IYpb<--)sGgO$KgPRG} z60h#aJZ0ufx!~ul@a4l5X_0Ky7yyqnkZK)X`!G2O{d1e!3?-t5iXSy-qO5`=-yYt9NDU6X+(70|DiHVBetO z(ry=GI9$gR6@fLB8N-`ed1+yi2Mgh)^&`5R_Vt;nDoz~EcTWlj9^(%DL#lCd_xZW{ z7Z>k=vxRL|6L>3T_7LqLjcd7k<8{#GHtD?_xn&)v;3q^az=cTQNXMW6jgEDh^z0ck z3JkrD-6$;4%a8z~*rmLSHd)1rw_}zcepz}$6ZY_XgM%vi6kEP|dlM-2e0%c-#K zoUHP!4{pE3YHDR%Z7^I>?Mrw{j#~V<0Qp7!DTF*Nvo*rE<8J#eYe+{b+`CAOqd3{e zle}`yb2giru39vp{Z-;Wc&pmTJZgt;D*Hl3yUeO}`7tfsqIe~wM;zd)FHI1E*Aj{; zJ~|Oq7iDu#R7l=zkxTJQHQX=V`x*Dxj2+aC)Sm|#ajk5M9>J4)ffsn}@Er!sOJg98 z5qtuqVamv@aY82VP03U~xUUvJ>#4M;X_Z0`1kZ>+)_W*KmzO0QVU8 zMG3Dgl~>U>y{xK$<>f^rsF9-9gA_f8V8^>!{F^Q_&MFUwE2N{{Y3>y&gLR&HI@Pq~ zy6n9C224TF=^Pjngfk@oxuf-01FjCRtccUWOd0sggPxiI&+*v~Fw_2;s!H#A3M81m zFBXE|WC;loO#9B|-kbJQ8t;0-)E{^jyl68VL9pGy{3#U8LIKB3ptakQ5S-=f*;RXa zuzB=&UxNQw{iNd-hVuR)?lm$p(C{Es0A9dZ(m^7T2k93HO9`fMbTds4t za>#?Wr&3?KGMq@4VeUw>e&L59>Vo90x8BX$)G#9{^tM?Bs70z3dHB}uufdoA?%1Wu zjEYH)QE2XNA`F)q2ADh<(Lu` zU-%yhRL4Vhy^|Sl@9b7()xC9>Cpj~Ru6nnDWM`JS6M{$jS@#FJ!(SO7tAewz60skB zmnK)imkMV5;U|@cC<^S-iE3#=j>yQXa8mR#-O|ca!zoXm+|=u0vuKH^=3L9DqS_c< zq8?6L1MWUWHgrD0k}daEwLvq{uEm{;IgTrg--Hk9!XjMRjfnV`_;Yj>w*JxKWw0+Y zdCES=uJ)$vpw&Ihq%`2-;kkNH^;b=W@wiWyIKGBKwmttfgV)SSWsm?s<_9TxGqpDh zj$SOPsjpak7aZIQ)jU8C(gvCQlQvYDL+xHR}((w8)H^EE=rsbn!63`#H;4i}|i&-Uv9#}d!q_xtp_w+ALUl?U}KJq%dX)oxG-kOy7Y9g$N z(4L^5E>~i3I5dVxS-<0~?WA4I4UgbM8z&r9#JCX(B-*q;^fEw2IMFU?Gy}HP+fRt| zA3=~o@KpRPcLNxwS+#mooO(Lsg7tHz<1lf%o>KEb8g~AABc--jTxFL1NyO8d(y;*6 z{-nC?;(`eFfL2X1VdEx}9%Z0oLkn^D=8pK=B^pb4QD;)xPEdKabm>#qTC)aC)Ipo6 z#eLAS5mRIkh*Z17?pmavbhR~UU@|J1c%&wM)<*5r%~$ia*a=N{1w-2%?Sg9 zOqwEq8G!Yx{-9Z*V`Xv=i`{XOFiFnE`%oQKpleoS{l%(p)3CbJx9mf}Rj0mGLmXz- zo+ZK_v$uybGw~HTx*yL_#*sFE91)L#MYz}KI_+$#xIo|aWeVUED_l2wzTe*q4YxQkHSH;Fw9tP1e) z(N;FeWj|=M2^I`rMp-#?I^Gy+fjARHxV`nUjP6$>sN~7fw%})D%=tvlndbw)cLgf+ zjCpJ@##2Cl_vKy^%`1z&IVaI0Vc@{y8~!mLicp=P?*#n)RUJw$7?nk5A zI^QkwxVY?+-m$`{x!KS6U_thz;ks#Yq9{*B^?tk75e8}2rhW_H9~WvH zA2;_U#hKwZPS~pyK{yQ%NBPnn5D@^Eu0sx0x6=~hJ)aA99w*?O(y;^bYEc&@_uT5u zM$JZRC6PHum2`5R>~Q`POsD0QTMp! zZg0wFb2Q_b?)V^JY=R*1lfeqUp%K6Jt^Lgbson8&F!NsYYx@3NN667k^4srF7>R%; z<*&`DZ6Ly5le5fRnm*89&?KLk9qRqoE4Wwsvm5hc4BOq5WO6;=pxJ(T)At^{L^I~| zY8vE#zcDP))=(ckd$J_XP}(@qc{w8u3@C|x!j8edf^Q`5Kza#sb5Q^B2ka+u=EbVM z>YLKBP*9KNoZ2aE0L~XFs<%_bt`WX2&la>~JMz1AgsYaKxb*AEMs7bB=*%vX{E1Kn zDJjv|4?8Um%M9QJI|uL4=MzyfZt-VkWai z12|CwX586y{6X2zwm8XkZ}8yfZ+r$FWpiT{o{+5A`OI<@_XjxJL9SrM5vNY$Z*E26 z{OUY80#hR!MC~uh#mMpesVXpq16kS$?!xcttTflNCj6?@Tzm2t-}kydRNogII%T|) z9#Xl;90+L`O1dYI`^ z^krt?;`?p6TY!rU!_D729Y!SnMmYGOxKW?8!`jXeH8|}VcMr!!5^l0dC%CHa6`}L} zJ3L;9ePfjWp>rUfb?j!{f7R!|5H^wOXe~LA+ss7V8uNhr!K@NctPeE!`oC+KjP6eKl~Fm9YK@?R8qPH`T#U zd_0z0Eh>%!IomIFR!uPw<(MtFwSX*~sns>_udjM>dRF9k=KitS#%I z;qs%(^}aEJjU+TnA<%1{u4Gx}=7d>|8qv6p9Fkj)>R>a9Xa)m9ZN2s13bi+AC0p?m zJ!py&u6EU@)@y%@hL96Fvc?hw3Pdo81K2FD0>?Ju&4-D~t<5bRvM?N$upP$=8&G>` zVF}QZlLamT)oYJAGgj*4a7yyM$1ko>Fx|)5oY$MYKu*gwYBQ(~mgV0m9kY!pQm?NF zfbOYG#xuLlpE?=?Le`#ltYDorfJM}ey5tL{>ibqjfk-T%aCki^B^a7?m|WC7HVB_8 z*+@FV^oL3_(=rZ{n(_kutS-$Ug~vx`O=ha_A>XMus~tw$=X^!Gli0gooR?t_K#Nq# z$*A^KH|5ZhUqUyzvT;e4nz|ojJl>8ZC+{B@WY!vtuHw>68~fV6EoPR~R2eoCceJKbS8I;oPGD=e7~oFTnDxu6%Sy@$0MbiT2=(6A4!`!QkKENV8|9zFTgEf40`d zI#$-w3Tlz6{^h^uVJYoAp&hg#J3O}O8;+x8!u#Iu9?U(KwLg(&8rMh5o(`nonWN{1 zo8^8qt{-vpZweT@M^{srDA7p}U+nj4b^)RgqkxO#%8XcZs+QU5`@r8r-cD*0teJW> z-Yky;@T%KGPnubPeMEe#^KZH(qxu<$0}B5B<%2n>6n2Iz^Zgwe4&py-L{evp@>YF$%uSfdWOvKURB6U>m+sl^^uBz@IA zl~e^uEyoVw^Q$T6oK*u{lcuPp?iQu7O)uz!5 zJo)};5H$H6t%dJ%WZ`mi!`cj4ZU?BFn>k6RK0H&ccA;wie~i}Nkc6!BZfRASA3uwV z(#j)O7x2i>x<$annG%QEhIUP_#&at{eYrW8BAV=G2};OFcAdGPZr_-R26;lSUJ?NC`=q6Xi$g?M|}X->;1zcY0Q7c^Nixa=KgsES>LJ2>1eXHxsUy|Q{E)GSe8^7L=I>oJ>ONq zW7cjV7KaXJarSLyxKNmP=>B`4a2Fh_T%v;v;T+XN`~*Hy#9hM?nmIaoi5{Al$uT=% zBFto01b+3U8jNt%$`eK!ek_BhH~>b~%$;Ce6x51&4cSgYs>K>IH=fL`H@rjSxne;g z13_lsy7nn<&Wbj!FW}vCRJ}Amqb{3hYhvfLtrgj&rLDY=k~BKkVl{{fCIND$ZbQ@s z*3w%xVnSpcc|li^W3Soe>F{F2!3hN7@H{Q*=^MPzQ-%9Y0zIo%h<)ul9t@hn#@ZL5 zZ2{uu=UikK{gCf+wZ)?jSXQZXM6N_>z=&vqQFk43uy-;}2wj|WvyS5YI@kmZwqq*} z;4^bT0g1agr^Zp^o5SeN+N_|IhkW^D$I(>DO~g%D0!6(`FxSKBp%=Omijo#doQCQn zFvUao-e{!g9ljbHpCh0 zw<(D1#l^XjlEC+suqXG-8Zd=5K247WjIzWe2)b?q3^NNli{Ln>#>27SPXF>kmwkFbi zLj_=QvA9UZF{6j!(-{YyySc;oV)hF{F1gQeiwR}!vv=;^%HStiaEv?m!F^(J5_w=! z6BE;2XUIIMGQrpXG;t>PkEPTsL0`Y7Vbs`Hvl9r@gXTqWl*0Q7yUE`Y!zVH$ zWxH9vRcWCw=Ny<$HPU}Z`0RUoxoDSpkGlqmolz95ugWUD?QvWr2axK(^CVku+$ zJDd}Eqesw%TmrKN^WK$vQ=sip66-373_7I(wFbSPfsK!x*Ma5@;W$)`$WJF?wb7&X zGr4-Ctk;saZU<+~Ku?XERl){&PMuDiZqnURMpzohtWw5@&N1hd3#b|ZNnc(vI^3@t zz-y3{R$Z3k%7ZsmCpH~T67>}KDYWj-yDheYOR_tqPeUyV@AmaUyKiu-ttmhT`@AW) zj{U*A<`gdoiW%)K&=5sIRr%UxKQ?QBhj$`xR8P%+DYAXJnZIn{AjvKn2gouJ{l9`e zW+Vj3Z8UQr=mn-u$NLV3TnD4IU&ZaM>`iIt76F!XajXxFS4|&?VwWwjm*6|;^tFNI z-3=}IJ2bs{1+2#{0Xj@txo^S2i9-E(oTDTu9a#GTZ%jztqf~Gwx{Y1oWU2d$ks46p z-fR!`=2jb0$_;}H_e&f|@op9JT6fx>Og!?rOwq!>yb@c*`Sw88=2#ldh&~be;R1wt zyN94nuf1~<=9*+MgvI-*b&EXUUas%&H(z^0lWQ3*OVi-XccOS_(}kW>c*g=w{1^Loup8q&&!l}QrS0HIkTXt;}6*-oQ@(@-_3wD z^c>JpQ7oQl@vs6CS~*_RW&nl(tD0SKVt~ah*P%1`%6tKPdq`n+GN#1LYha}gv(liu z88%d;y?Z|g(>|)c59~M8G6Wxj>K|vtafFvP-UjN)gymMAWDn2*C{6J ziG{7DkuNQV$d+Zx_37a-ryzIoy|>Fo*x4v^uB4`O9@JsHrRP#x(eIrylk z`&;~x>yyWCx$RAh2f+6}Xc9$toNYWV_JQg5%u3w~0RS3oRQ=4K68k@1T+Ppuz(;iS zrRBXF`Opi3+=IQM%_+RCL_$#%n;j-w5V-t+266^j(}XofL|l;e$8>bZCC3;~RDV@; zTHx=j{aZ{jX9wD=k|S#*1Uw3e{#)cq*Y?Kh?E12%X^%qakpc1)ZcM#z1 z>Sj5bP3#xh7mI{EtDy&UswHCMn(0a^NDiLQEQp$|%mUeka?h=JUonbLA$!50ur_|& zZ0R{@298UdnJ8cWVCxA09=eHRYPgP9451|->aln^QVgOa$_}&rxA^ToPj^*;5HT-4 zGzL%AX!SHgKR6ci_d&HsHPBn64>a7yl4U)CAQ@C`fpE3@mhI1j@F$pCOZ%jWRqZDR z#&oAs`V}un?9fV0bcS}04n})IEty@?*9>Rg&8dt^=8cf|w?zk6cMoo|{ZdYF!yMGv z4yb`Ji63_L5Vo_EAPES*AzE?dR`1SV2nH^R+g5-&jGT_8>H!muA4Xr`ZM^I40-Zwm z#LO&Eq7p1n8)JrXEsl(a6%u94*u+RFcUX<9Gkx+m3c%QOQq-0%yqc_JMx@U#02S|q zQmbOW@%%rcp^kwYnl;xVb{@M)FXtLWG*yJtn@8Hzy?-4nJQ?!0@j+!%8G8x>`u%*J z!#oSKcL36tK(#HZ{D6}(;#)t(x^mR5a@1*P$Gu#tQrY98Op=FN==x*IpIy!Zr_;LPN$@y~Gt z2`Kp|ljI;L`sAedvKm6$nhu5*B@h%3tt)BX=czx_Iq-NT|HwLl7MQyP5sSgEEotNT z^p6&s0m6@BkepbV3K`}iZY4{-BHa?7)M7J8ey3Ql6X8=mv#@H*VNrUF@-@}Lb=pq7 z2l8(4x=CSYQN6p)<~MI`NB7Tl4#UY-0}l}bhBK z$dDJqO5P(~I%-fKyv2Rr7bOj$S?oDJL`rj^2YLx!!`xn%dU4kjd-ULft2}h9eRXLD zGlb+HO}6$Vt3j~ROHfLwL3_5PYL6jrh^@NqjNOWyt%`h&(H}(KByNv2PkE)$t?;=N zlL$xNDYY*mWhH*GAPSz%4 zA+81}K|#RqJGZ&ht>qQl-?Z@dH5IUy)_Bn5d{Y@52wpmHylQKVVb+soFQx;%Ih@Z! zqwT6o_t306@LZN(Rol=a2swEOEQqay2XCYajoJbvS6YW zmMj=_yTemjY4l}x^#Z#8mo)A93QGrRpuC6Shk3MWgi_9fxNmwbC3C%CDr23zj#s5s z@vY?nVZ~=+h60!Ii?rvK18^)zIvcydFa}?TAW_FgSs9#8Z`7 zf{J`YjmbK9+1$z?d}kpijRv*@$qb#Z1~PiFUGlI^0+Slw)Bz0;rr+#MnWM1iu+_G3 z!lapEqjx)THyLpiaBL0AV<;oR`=ZICV`Phb@A%0M-6{cQQ&ZIN9FRyrbu0;!Z;C%H z+{+>8l8gg8ZA+9@N^^kjcNGmzQk14Zt=MST3u1PSZf?o5@Qh~R8oc=#P;&`kbT!a~ z5J3#+jW%;uj2RzBq)AU8NYx*eDVUqX=jPgMv6N#byqi(_beX9d@Ie9<}=7tE>q`1<8 zYPNRRM4RUo=c<|CA=C`<3YX(wz6x@x6Rly_Opd1b>%FvKOgZLC;_4Iy_78$4t$g!} z0$e-j@I5DIHt7<$&o_~f{N?hPl{8U#!sNUHaUMO*D{Py>c11O1Ixi9~tOuO-2C2{( z4ks4pU@YPB%TFcT!i1=Iw0t+Wb=lU_(M8{dfr;Ltj(QV|AP`Fzve)>8DLa;xZ%Clw zW+3ntS^NYoF^SM$cEi@jL7j5@6^ku54g;9ckvSd`0y>G>#tzJ5Q`u`GWn(3|u7oKD zJKw%&ptQ~2{r(NG8%BaR15mRhD4;@vv|cW4XI+dKaSOiGs(v;z?*GftwmEC(V*Ogb zTrPoN#nJPybZb<5F3Jj|ChrgyH}k%3mxVD`OiF}c6WjvhZkn; zq8IwqVt#dSB#73ep;Mi0lI>WLAGtV2z_ThiHFi*0j z02ka^aD6!@h9-Af-?f5h5kX0Po;myZ6@QCiyB3&62K3I6XIZP#`}AvS+Kmykl*0(hjo%O?mStGAy6 zR}O}O!78TCiIucxOt0@U?f|tYs7GJ;yU;WPc*bAo=jtaB11d3+kn`QP2?*rm&59B5bWSPl0z07Wb@YvrrR620spbslJkCaIr&G%?VX zid;7d&CFZ698qZk0ot7CqH}^*tza6hEgSvN05MSG9n%Whc5{p`ls7lqK%LW-Epfkb zsHppnCdnf_@I#fE$PQ_VCnzs_;-9cFVFfhj19)rh`i}bPw(5^ebtYcm8+Qt<) zRwR04v_kFV(9{)nvOvelw7N$Lw>_J;r9W!^T|#Z(l%U`Mhp<9&C|~`NHjgoQ>>s*= z0lSYl3wURUR9sa8W^x{kM;BaA5(G28GUmA_fagj{MQ8;WXHzkp^!e$|ldsA*n{)11;c>g}^2|();d4BcYQ80@SNN}^T(6+sY@2eeVI^F+hV6kDjp0otBr(Dn>}+V<3rXfJ@uQ2SXIa=hE>HyKTi*^m1|1AuLQ zYzC?Pjy+t#gZo1RK|WWnac(1GdQZNC_N436_9O+6gxn8e{tHe78>qIgB6BE{vNKFM z5a9MTD0Y9YxhMpcF#j$%;^I!Na!cvz*Eoqsl1wr}4GS}LoBzBCb4dJ(hH`(Xr~<1< zT$}-icY*QS`o2jBw-0tZkE*_??ge^g;9@g*l3-QUVT5Gg`Ak1DNLaXgWt8sLN(xZ_{+SKfo?nfR1wjJ^1wLaClH61&0rM3s{^WEJAUGC11gLc9tFvhZF@De-jKyJ zC8(7aZ3~YXO%xih2GZU>@@ZI)hi8aTqADIP?^I5FQ1odAx2V!sv8^l;=8jj0MOx=G ztH0!dwlpA~rL5&L1Oeva#<~3O8H3;r|27Kw`wMU56RUP|;Ynu3d-Mm9;Eb4$a3-7dN(w@!6JtSv?fzk!GDv-CK`PG z6`u*GI9Z>`h&z=5kxoEq8>#5)LH|PhFa$K_6t`LgodmJ40ki}3?Jvoro5y{(R_;nC zq!9J893C0D{c<{NE(a061C|3usp=_Ohb1=faI*K2VW`3<-Rb!i_;ymEOP1b=n4fnK zKMNfPvo4gya1EYo_-()e695KYpglQHcYW!Yj5Z}YT%CGAKBxgxzrR~gF{y%-L3=Ww zm=jyW029h0&>A>h`_M8o`ZDoBghZSCV=r3u%A)yYX(FH;U7@xlgLmmx3g`xOQ+F^K z9lUp^6{HU8I}aaR@Fiit$$_B*3-yDUn*l1w%tmqgXsZ;X8qI=@NE80#sj*4h-|b=& z?X-W5-t`Av2!DA>)2lt^hL9H^V+(H`YR+*usDfXHsp)G{1jB9+T_PCl=u9vz{)c&# zS!IY>wjh^{dAP$g>pXClL?=@{JqM>plO~jKJpF7c+BIQ=2d&5M`!9TjK-fl^iecWo z)2ix`(CUGI3Ggu~G!r*YTMd@7`(@6=sm87Zt6Lv!)J9n`lh8{hz#~8L z(`X&as42>ApF0y#M9OYw45ogZW7YRzcE7UCS5CG=U74Swpfou3_0T=GlG`GAF_#+F zG?wwSC2oi2>I+~l&7m)0bX*%=)J)2yR)D^I-$EBe48diKVn*Y{Pn#8QHcnV8<4(c( zsWP?<6H!Ybs2)gS(lz4km+&F{+wE696_z%BBFQI*kTQ1JDkp|wh+y|GG+S9D8T-w7 zGLjGm72$S7MS9FILL}fAWv6^&$fx20&!=I}g8BFYbm}Qhhv@P&;!yTxvt!0)7mISK zw!cq#P*Ix#3s;OY(Bu8+KyJ8BgSn}($%LUCXnzv#Js5j5a+ACLQCFM6D?i=-)M{?o zpkrw%#S8Ora_e2KlctDcr$GS?&`MxEs`6X+Cg7g5aC(%1Lbm)$=5hJ67JM!^@DSXj zRByXaIr);BjQNwv(QHh%*~CG(IU#QAgbH3k#?6D)Jy1l-s|m`o_JGgYs3T(IR>+0z z_&6691NxL**`mNhrvMvrpq6?*GvY)5%+_r0 z#K;;%ebkBQ+zn@oP9zO4WoVA+tUed{1)!O}PR%#cV|pD_u~BAQ*z%n2zx~Gdio+k{ zR}5ak)3EugOA+xofsAt-AD`{ZA7EapJ^Q)7dSk`@yw(7lbAD}E22eqCmL>#b(z7+M zkD03W`JDoq?$s87v1lFgA&t+%b_W(A9ow4i)b0fc0AfM`QY_}SnHjBqw-o*NQ+9v8 z)jD_Mqh*r-2UlPbAbIQ_u%U#ri7m|l-Jgv!W_vfM)dwVmN&kXE?(svzzWnFHsXXDYy=u$2!3O;RRq!$1o|^j&8I(n z^+j2x3IAjp=*f#&3=q>k_SV(=*5$Rg9DSDq;@^@teMn`2U9FXl-BAoi?30V4aynnz zkE6m9!OY+6-u)j>3(W#3icwWPko)44kyewSgEx2WJHhFDO>(n~_2Gc+UqqCaA^Q-u ztGsudh`U@MGT9zmWzIuEqkQWj`6N;eO8RDQ9@7AZE zNmCHcK!k*W;@qE?jY1{Y2`(rY)>nL{@7FVmeCkr@xbsVTT>g$yAXPxaZaNXi3j?6- z=XiWQ@c1PG0j--fAq0|&N3jZ5Zx0i|^Ri>)%tuqq%z%LU3_A$0SxE!>i~!HS4&VCr zr(wY;eY>8}M1cNco>)6z+e&~gf1a2K(&3QGVri!!_!wyRjfSGRoaT(x5#bvH~ddOxF9!*MZkbEyXyGo%8+k z@C09EILH~+ZH&ZJB+~|nY`4#G5l>^8AIDPq;c*gIGp5;MKG`0`XKtE%=##yc#Q4Oe zQ8AS8NpaT<+AUt2PmL@j6^Q9d&!>hD;>h}A@1qMt;oaE+Eas6cKM^yA-zmLemX7Yz zzE4BYYl+v1%h3>xS&7C*M6!)%#NAQvP-%(R{a;1{Qc@3QsU$vSsij;%|1)UU78i=& zsbl<0%af9-63cA9%jxkk_r27aqvAT|J5bd}aeqo}1kDo8!1F?v0yLj8jg0-vJ3>n8 z#4MKZDK;kcNK<@#KT%pjN-8e=zs;A>V}k1W$L4tdZNA#r^h_4e6Pac7`j?mNeJlu)~ihQN7WEJ5Ros#tfI3hwvZA54IyQn6}=|m&VTI zNrv+=IdtD0@Lv^3v}i~8}*0Vl07nIny-_D0)A4K_H75~)R#C78{D2zi;pXGwzR z#&B|MVl^uWZ5b^qq+$4Y7@5bM7W~yf4Ya~Af+h?q_)SbXd4!}X&2Gh#bX39w_SD-+ z#!r*@EhnnS#cIQrL$+lD8Qv&CWdqULq=d|=hUo}b3lgr84uLbqGeyMWc{ru>-7YcZi_$NKA<17u(D_uG81W z80pNtT;nWga5g16U1E&u2)lK(MXZOrfI9YvfBuLBoAtq;Z9s$fgTIoJy0yyaPk$%% z(@sdw`~5#Dmn3r5@BQszNFn=!KW9KK|AW7hk~;X`L(Z;K2Sh`M~|iBMEqpYX5&r5LO$JA9y2KoH~^}RSvjlWXWJwB11v#TxebX6o;{f*tT~=1 zY}8kGD(3K5PU=Ls0<1^9WON*d5iXnPyiS@L$9ab_I65L$)GpP%Y+g?udHnKVizNuD zD6qYjZ>YjnOenjR{J_vAY)EDBQMT#SR4wII6G<4~5WM&`j?8_LrvXS}>BcPdPm`kBQV)N73$p;Ohl-P0`kvmzdEK5O&qalur;(4Pz zaQ8UndWXPuoNqDF-9+Njk1?Xyh3nDP&3MdM2_}Gx$pjn)Qc{X#8fZ={OvJn+8Y(aM z_C%MD4x$+YqOi{1PC0Ef&0f^otwH7>pLq$W7#?`tM5?HVe#ltSx64@ZfZ3=V2B$V@ zosFh}rD%-hGQ=VdMZH7gTwTQ&5jmRpWQ* zqtlIG;9uoA^Q2*zF>fKAE=tTjtqwvw&G~v`40#is89fOceYjtP~&%=LF%6}{Au0v8N4j)y=MSF`VXk01Z|rI zYb2CtGtcmkkd{PbdA6tm@M(yIX!;qT6zK24wD*=1`Y{Ocn|=oHxc>m-y|o%-%%hO1GyHRq}Cz4OA=x7X&rx_ao+k%E)kzP=bt z#wwZUsPP_RydD^xZ)T9dK{dAM1UKI0y!J4s60gdxxc+L6)?$?AdbN#GH|IUgL~Prz zNBXi=pOoKvwQXiM=h+=HlK&-j$x>p=+@RP@L=I`4+So4fr?r<}&3R*Ol(G!`vzv%* zdv|E;-6OqdTc6a(dbR1#&Nzs_>&%g>vOAC8D%*K4S1O6VUhS^V9%}4pYEXRiz~m-&d{Ad z`871M2{!Hjx+R_4v&kISLzt_Jo@XxExjr?wU6r_2wXDe<&f#{fb-upXwRQ3+n6CUR zKLxr_^t=1!(XYrI&{}x9{K{7uyoKK8f8Y~W=SI&36Oz$y%?TLe!!2Wk-w}_bz{V!O znsZO`8Q8dKBiXj|!k&Hg?lPT6lU|ihr6FozV$1hAYx|UZV?!8CSeH!}_9w!eaTx8q zcz7XRKKw#Ue#~e232!Rq&{si29k^^u3!uR4)4%kqj*$E!0Wdg9!1pt!V9uK#jZ$u= z;=jE)Z^2vJ2Ip~`_UzeSwdt_rh8npl+X%af1qT{@irrjpy;yJbJV@wLzK8X3@OTu6;v@_3B*vu_GtXh!87M;6(z>L~%cKgUb#-#b6M zJ(SBsnm237?MpuI^VD7rnHNg4KsiLb-Bqv{cFI$>KN`76y*)f&$HD4NoA-)UJHB1x z4xEanRi6vEs(NwaPYs?FSq0+e5Vu9K)@WKeyS(7iITH)A+^e!m zXWkzEOz|ByZB>y^4_*|zno4UUTX2g2Q-W@|>VszGox-}Q{JuJQ+35K1_DWNEzXNgR zw~zSAS9L;HgR#z@IoV_zjiucSzijng`5XvMS=X^wCdC~DN}G*OuIJspIJb;|a{L_I zu(V_}2&pS-#rM;;o4j@bLW;tsiv9a35n$ME-vfyWv$ve@ zn}mNGE)REq-gLyMSw&{6Qn=*i_;ed9ZN&$|*c3?LiD};6zDE2@hs3hQ%DkOyAX|JB zTS7R9J+w~ZW^#}^rbS|3iR+6kMkp^zWS`+Oy&9@=ZJ2)XA^#kF&#U8e#ceW6O`?XzfYA!F`s-C zPlOD0Fy{(wYcZE>ht1oC2Y*?Sbbg~6?_SZkFQshY04E)hqp#?`@hfqB1jR9*<~lY) z_X&(;xgKanU0%0*A4lO^;4u!y&VQ^T*AJIFWE7|N87wNUiz!`%TtLHj51AZR5)@GQ;)sb@ zGI;*^<~2^46ThF|i=J36zW*&UNuym%u3mLH(N|3pf_~c8`RvqWQmdp%3Nlk*8tuEi zUr-b?DmczXc||V@To0Wd+BzM;B>rZ0qsKP&?ox0KeX>2qtlK-6sGGTN?;Xua9X zam>d%<0OK?64z6kx-MdWO@T~;w>?;9s*Z@5%zd8CZ&Ybu)#-n-yGwD{9l`-!>(*hc zEA1#W4@dx*u^*l@v@(lKD;;1t3Xfk{Dvk@VvgND`eOwOYzN$fnuwl;vV@iDm&;`c- zM0uqnc>v|R2zf^9h-z^Sc^HS`zeb9>lY=JqlG$|)C|%jN9`xTNA64?Z&bBn^fDrW{ zufu`uAHHr?St2prMIJ+Xbl3CMJuimfgN&X91-b6+NiD87dHCF&oZk(Uv9v{tco_}v=R*WR90X4g*_osisC zTYUOD-Oxe`2?z-52r8^Fv z3K4jqRDBaso7ty3&XhxU-(0J)b0Nqqbl?8uva7O>q4S`#Ob(1A6UHn*ZKp?ucsh!T zq>xOr3^>?o?oH}sUt;+m#J-8`8|f;Oy%sGn+F_P8zR{tiR0!xAHSnD)vd(!UjFSWGxP*D%A}dB{Y=ylyPu0M{@yeI_M>w*7zR77&I2#np*H z16|B>kC9-|I$$5j(EmzIcz?kQ)z-{)4)=!+YyQ}dHdjnPKz8a_81~&BSz`IU^g|N+ zxpqzVv6b+KdG3ScK0}l{HTb?}sAt1h4)*L4z%y9%Ls_$j~w8sP+OaSdO~Ark!>|;=n7_o!v7<-M(dIfKFHQQVr<3Spn4v46FV~A6FE9aIA=4&z+tZUp=Fof1`c=zfcTh& zREx}1KHaevy*er6ph}Y=$l>mx z#NV4tT;hxo={t+uAvNI*GG%DFWyKl1EFhtjth=U+b$f)ftJ&siZ}g|xcxSY9kxW0+ z{2qQF;gyAX_kp@P_#a1R)Pz^kw>P%Lp{m<+bLI8QYMkufC?m_XDe6Ts$QlVObSnHN zc4NC0pvOQaMZMY|>U?>h_II;_d^D+Wm4jv2dB6b`;o-*JAJ;0Gb0Z*Q-5yrGGwj+N;mS zc}~jVBpuo~kgj`v-~MZoJFVIfAE6*~{B73sZxUlM2$+fR`8k`M_aHLM=!85#_=14R zPz^KCefsD^Bki(8ASY}E$qDzlFt@k#>0Q^HEYX|3Cr|aIm>{dyt0nxnWokBD2 zvGd`7@IW#(y)box^WaDoFrFPuicjAS(&L&j2Nid?HkX^xn)u_dMT#F6sZ2wm`Du9? zLz*d}^h65as{`iq1QHZBon-gN-Oj%zr2=sQ zQUpW|5LQGP6@(}&>@dU-8DSDp53r;mt%VL%$uNfR#1azCOOA~RZBI=cIde3tYL z2{3!!c07z8cuZPtbz)HOR*8VpomHlhdw#Q>48x@tR@L*bg8^X4&v%SwK7U%xpBC)- zv#V9k2@|ZJo9@ZxZLQuElJ5WsnHF6A4;s|2BM|0EkvM~*+(CKI`2AeH5Nvej|ByY% zxuCYbPa!U#8;Kl>WTmV~h?54f1Mh0&>?+@rW?~PMIJQb81N9IJi6sSVykA|E!C{av z=Zhgon%FF!#e+R{hWth2gMui_W?_UX#$xU^=&y$*=Dm%wyO75PC;^WW#fmY zf7*I1A0rP6-qYdi zuY_Tlpj&>66iH9Ex@Io9W1sfMN2^wz7 zk=vLWB|3xSYU0u5`XRtLPP@i?-?4^XdI3yig7F@5Qdav2cU7(~zLTvb@TQoq^#3RCJZzYjj4Sl&coLj5f>F4y7Y*Zs4U0{`{_wD@T0gb3 ztV|l1ZzHNrXVL@Fm04a-&V>!1=Ow!)C8WK|>>rN5^^@lWeiZt^D#ld#t;4#AHAhS5 zxJ&HCPfomK;HxGyG7ix0?ixBA^qT(iXq*2rY4V!HAdV{uy*VLu{C0KG%Oq;2+IlcP z2DEm~a={$oRI5J!9=46;YROPj2a|pV3CM;unxnz9t}2KUnGAsyfFW3QY`^RDYv;0WT9XD%bY7e&EJY zv92mYp0!o2B8;p&DklBJIJ>n_BZ3=`DJXfssVH%HP~dSC=3KUldWWYR<$2E6s`ZKG zDBF-dyCAqdK(o__+r^|;B9ZZ;M6a__-JMfSv8ItDZ+8i6ckN1|HQa9U?uc`(aQ#qq zJvsDf7lb%_!5AS~+yzl0PN)xr&Zd_sd!E9DO4OkCqzGmTQ{}Prz8!};Hpog{>%Y-^ zS>@#QA2=8sf2MFXjwa3nrS*x7RZ1#pb6C_SSnuQEZ^jUPW!=0wWoq@_2VqF$-59s^*UT2RpR*Bee zWW_BUK6&5nU|R~d3ZrDsCer1W4bjgk#N-cTK9`l+-O<%>b5kda#lc?7FtnY`vZ%5Q zge(ms`Iq!*l3+=8gOI*m>pW^@-VBxDHbOzlpE(S&mCwV=|A6`(Kqjk=;Mb0Q*xhy`7fgbnj z+m&pZ9X3?6eW=JC7_&8F=wq)sGz2lr-D-W=44FQ$uFiF*(GBy@h9B{P>6Z6(nq#j0QW0UrSi@tWur!jMQr(Bg3 z3RC}%m$xrk*qD7xPy7#Q%N~q^iEcyNc|CE4bVXt)w@dCn-HeWgOw-5)6;{+)5`Ee`sM%v+c)yr0(YZy#>Sn%rWKL6=|NGqhuW9e4~Mv#5a~t45fn z5{$9E{MdYvyp+|hUQt$8%R2H_`zonU5;ecHu)4^sZ%4XVNsezpJ$&II&DECE!-di1 z=mRBV#Z9WLVp3)+;|7jq#usy(NW32PQ9o+yhc&Ep_p0sGD;eF6hNk>o?>cryJ-k_+ z5#G9Zz@j7wMotk}FpcUYd}2)qvBF4kol6PHWT3a6_pG8@6YLREs?6PbpVEloqPXYB z*wS4U=wed|k;JmB5<$s|>~JYot?7xUyp12R|QlL-n8@5AU_;&f@LV`&7s}<%Bx;P%_u5Ifg*7 zK|em6G@e)gk$M{DG3lB90BB-V8EG*L#iN8$PI`pPy1)OfSB6o*@M=i|NN5$Luf;-# zD#VO?Ulz%o;m;O6y#gDN7=#)rk+97l^>56OOO}}x&o~Rs1}Ae}2*X0Fi^l5O7MDr( z6MUq@)GE3eDDm7<$&WPE%Ss}3m{MOcYe~uS&_Byy#PAMuMmgF^*bgm}IQ{2)*dAjm)y~}v4^B<2?Lvp zg&OdgmdW`9Wxa2%skK8Yv_`E5RZ1@2*-FzjDv)guP7Fe@S!%35FW}|v(rE0Q>B2a} zENiXApoNfGi0>MS;>9MP+9iT;TPI!LuhL_H68yJqO4uslSa#vHsr!aidd-D<8f(z* z^wZX?wx+5s10laorS~-s@^8!)@{L`zeB7!nDu~Q9M%}PuIkHoayJSXu8s%75MoLDh zi!&)r4VGa4p!mu0AtN#R;zyvYrBa7V#|*O{mx-EKm*>Z$4N0Z^Yh9>|Z5_)sgqa$$9Jb3hO9a zN0xGwRB%ZhO$m5S@{;FeZ0{Pw>}5+2DO4_LIWkq4w2F1j`2l{)$kxY|>X8_vUDG?< z5hrB3qfOu)w6%tx~K(zn4reQ=Yix#?RM}roc3v-Ww#$2TdX27^jt~GCH{~=7oCgRavp{Ad~zguwMc78 zSHm6Fu#v;ivjk85$N4XO4DM}GDnF2|GfIC4cgHAy>@0G@T4lyOq)egl-A^crCS_Qj zTo5;MF;*ih__1MD{OyG7WTF^+Gd(pW=l1{_J;MgJYJ-6th4tPUPp2%NrCQe3un?)_0Aw$MYWlG-duWt@XLH0S!T;jNY*5L~ix#q{~_z z*)Z$;0|ea9aZr@l$t@cmyD%Cx8%*9jqS+wLPNwFSH=|-J5JQlUr_%zlc~LQBwEDnxKcqxby0%_B0zXWsf)t})X@NynxPo> z{_LH+FPW?zwp!<;wk;*B;wN-JKE5nUq)DC|9yjcFLb%)UHpI<7eJEu9Ew}(^rE;hp zOSWWNn{CA=jQgaA_9~dkkc~O-jSIcP%g;W7EC$F}5e?7wL|ED6EsxzdyRImDIAA z`+V^2{c9cv41|yr!v64A8M7G#(oOt@qZ7x-w42KrS11cSltf^Ja03SIl)Jd=hHG|` zFs_r5qJ6pSvJ61}x=9q_A-pFJNyyBr|J)Vote%~}__!g@F%XJk-7ZvCBudboMWZ~D z=)bym6!av{s;7Hwmyc_y@0^Jo6*;NrYA`<7$ZvQuv7Gdky@;3|$vZ(B#TmbLPSfYx zGF{f2xS;sFjC$&Q1vt#y63R)_%tiK-BNsth`5uSN`?x(5hKVJeT$Fe##elZq>gI0e z;JF(&*-^^v!;b3x!71oat(`V*3Bx;UM}x8q_cPZxIlL@XV`iQ`H#s)0QQ=D6N*N>( z-X1NKBfcLCfZaV0Xw^E?j25hgG?I70L@vy*xOaCQs^|3%6%V5XH(SYPBm%_4Rp8p$ zSoWFdTFVp@eKg#VJj-9}Sg$FfiY)yUpXdMJ*Q41vil%STt4TY^^JaAMHjm6Ps^`K; zc1an;tdX2*{eT)jeRbwur{t|%w}HIy!XvsXI^I%W-aY6j3(}$_~ui)k=BjObnj-B z4Qtvr_*kEvEM(4cv@Mit6*8o3y8PQl$H?C6k7}OI&`x5am6|Lja#6aZILxWDJv(at zdcm@e1V}%t_3N?ViCp`|TJaCKr!RC%qno97_;6zsINZKxB%3Dh0Y>&g&GF7qWgGwa zb~zz563FZsROrP%HDLw3>c2W%Wa~dpG}G(>*mx|`{gP$Dgn|TNpxJ%mgRYX>0q!~@ z^9^~%elbsdZ|J+W)Q?it$Ew0?RsfVrki8=G*S;kVzf|4H4l zOr!1SDZT=_v);I)p_4nTIj%e8*IS5=3cV((bMMR_f&PPqBxGtZ6{ZH0B7YS~PAW_d zzC>~6tD>Dwq3w{Yv31iTXC^P(ZDh^ncE{La@Q*kM76+hnYCh4+xLgt&YrgQyL-5;*pU(_%k!5kIQwsbzvLB4tk^km>v-%)?M*@a{Oy`F zue0g0>xx^s)DxrC`u!?K6U-Q%qqNC4dW_g8%N;lykaaQ$>67g^isf>oiQE8C$>3)v zJXU*!(v20yhwjS!Ijg0xn#*JJ9((caTPf(#VCcju22w+cBtcc}kBMDkgeB6q)|!@FQyUbim1 z?3P zRQX{^!f@r;)VxV!<{E}#3&qDn_Q7>qo4{tDvIv^w$gxM3k~`KW2Fd%g-c@-p%Xv`P zJ+HTyX?{*sHFo7%>nqW3Jg2pjPPW%JbU{D;hKZniVQyLFwk$MR6p7W}UWk+Bj~sTR zp-&6*?#kQbVfGqad@=dMqkeaov`ap=L1X#~%2SuI!lyut%FXS=^$A>FjIx-fA_z*bBN3c;j>Hk@XddA@ey>HD8t}2zbi`+6xI78tA-I63hfqFN z4tnA4ss2!WXOxyhlk6BaYQt+?jo+lZI&L5%t0yb)VGn-bm$v}a@)CvH$3DhS#-4}F zjIE#29aO^APBQT28wcFlsvJ8fBLkQ-0Tw;eq;akP?bs{RD3+#|;`2bRwauOUMCLt3 z0cv8A5c+C#-P%ZzXsFq-@!M}ZJki-gbnkAhB;nn;^GzBf@9~($)pvZ1T7kM z^`8YKds4249V{H!{^d1SR^MTvfQVYuSbt(0b?GD&oy z7*8BF(u23<-iC-oUV>{g?AHt5Ub8)>fq7zlz5$!GHcLQt?s$y__=UYg&wn#B(e%9zxOYF7 zs0L_8qXgvd0xymx%tj40Prc4MHQGt?uyreA*RPxd(^S3FhiTt*zvQH7fJ>uRwJ}S+ zta61;sa%b=V5w!>MhMtUjV8ZF8To*3`pD_awWP(7aU& zWHKBwJo6O=e^4!f@ase|%qz|womHw8D33>IRj@<7=1Ca^Jf|+4wtZFAR2*O%*)*%~ zYfG)}k5O2V71G(l{e2kKrhy_j14LD$$W}^SB~^b+eKFrS+uyg_rn4eA+UK=&n%5xM z<)nfE0#}jI-33v>8SE0FuG!{=Mu5fRENAY(k+yHUZV3O>eP;{`8&sH;=!uTDEj~*^ zT$5Q%n5kB2_U^IX?4N1vOFB~)-Rz?AaT#9D*wETTL}$;$#^Ov-9i@r4o&PBQ`O0yH z#`*7cYzY|}Wc`Yg-Oar>=I_IhnO_=;hN7w1vLOHH8HqcYkoN=wn*b$JHio!V*o5Jc zMwHuyqnPOC-evC*Tw;>`^y%5<{!>~{riCE$=K}3D!3gyxX`fZ}I|HHX!{>H|A|bbO zKeMI!{L7;L?`IJ|EAYmi&nUuFm|Fb$0>^i=PAeHHLwyw9tl3?OHV;>49$*y;AF7#> zW)b#1@H)HDUAsY}%7(ywuIq+KfvIlT$;pS&gmU2<9^0R*x}|C-lV|hNf760|rxxX2 z_#Jska&rmJeU8P-4%KT^PyIkpW0p zJ^RH5@_|{!paFGkf6VR!qlbv%&o0gk)j{w)iIz(!rC;jA2cm5)C$|?iLLRoAmjiL9 zWJIj_9sJ+txj`f^>5jL4{L#hJbsCUJp4*=il!>=mcX*&T(mAxd)h{`;ZSNeP3$T_y zw6lSvcjR_c1KK{2t(|O@x3q?A-Rx&Y+h9z1q8n$S9m`@WhELFPP`JlWjC+a9?)$WH z7N(riJp3#uf!hWtU<7%|bUU8QeZqLD&ySp7FH~fW_iq%>w!OW=Pzb+gDu$Ci4#>J~ z6IFw&%UU9omau>0T=*sKkmhOYnfq~6erenxZCN^F)rY{-J|6*oNtfCP^;`kJ_{m(O zk)$QQc51%1pJsQQuHB|lXyZ4NtJZd}D}Ccp-g>c|vMkxESlg%BGmk(WAD_x>O`?iQ ze)yyxrYM0#$fzmL)Ig%9%H%vtVTuoq99mP^;(Vb-5*b|VJb9asvTV)hzMP|^=W`xC z!R&lHv<6{rr@mgoI>9bb<~A4C#8MTil!S#lYKbF|#o*-{I_dk&oeru)_7y3djl0#< zI`t|sv~iz;RPBoK&)Yc8qYq?p7vu4OrchsvsIwunhm3d~g!_q?7wAyb-l(w!XmiAch5 zXvq$F{MC7o3bNmds%q}0m1n)&yuPW)TD~-gU$#C6ck+DJI}K&uh!S&g7atrJq>|>Uj?O zw{EM{iI`-EYq^(@-te@R?-Y&$3o`ZWBy#3dCrM; zSE0c@Y}WCwA>g{-;N3GnSa}p=W>IJ=uTWN=BQcOGdLQaMkfw_YlYCKUxBwm z)z)SSQ(Y|G#HcLK{RgJU7egV_TBPkXfcfhbE3L%2M1z}lS-F@;A1(|v>GK0|@lZw* zUCdae^{C&0q1xUn$0s=`=h0cmwrWa&x-xS)fuT{e{qlgXjU@J}r4A7?ureRkQrD3; zX5G;`s^PPddylB6NVQAMsbFbvt*dV3SqC9%on>u?vGGo~oU-PZ)=4CpeHS~~^B^Kz z$vfiZi)0*TvX4dKV$Ml#juhE6bRuZjaq&>(1zE9m=tEY1*Pa8T`Xx!EdA%8D)I0 zv*|Ef%O!iwRJx*pki^v+MekQn8pPdkc2FK;JaEj&=d@L1Kz)OKpsU`mf3+Dy2Ph-) z_XCNRx>G;3A)}0sPK+Rj9vNm3UmF|pf_0hq|>$1W;hBWkD^-R+~*s+QozbVrV4aeff{iqnTw@`DT z8^*{}(3RF^F#W&o_PhXk{qf;rLX2;hxDvjnFoS{e;%)#2~1iXKO|4 zmBB|Eb0shCYA$9P{Tiw5#3IzD=v{t4$MJ&NN{f?0uHd`?+?(zDKvE*1^XsGea?3n~ zuA(I#*jx24I~(Xqv2XOPv0_cG%k1AGaoe6{XoCfKOc+8CW)6|1k8XJZ7TB>$=s(Gt zV{8e%SBIgABiSvW%1;9$VKvOUD*>DA-&Hg~@J;O&6YczV5hd3W8}D%ccX8vAl>Y4P zbK!l6{eP=YvlY(gd+a^U)l|~(?asK~tLvb4*Sgt3LtL`VF@1>O#3pp(`^WRvvRW|d zcZSv~mXK7Y+`r>WV%MLmg8Y>lvBE8gcP2j>w4`twF0fE<3M#NrYW$1#89TRIov{tZ zE7utP%GM$?>UOg8UXRU56LCr@y-Vd*1?EhXm%S-evDmDSCq){ z-pQ+Ww2~DcGloP~+EqK2!7|IC!@=$p3VQfJ<<})pg!;7P@(T~OTv>?>kMN9X4uW0B zSQ?i9P%62WdRdQs{N}K}LIM|x`9=dX%4z_szC$p^Hpp#xw>Ni7cO-?r){V6oLeRt8_ z7S*fY;)j;WN&TLtu!daPf>aXm`i@t>Y$8EB4A3kHigM?)9{(OGyA*L^wIsfexUz5r zvBs}`n3N<1tw8or8HtFBFDCAb6fNXV(dld}R&kE3WE|8NUXd}XWuQ|C#&N&1XR^2g zqJ8M&+Mrj67y6cGzmq^BO&ZulrYHVd77Vkg6=`xiT2tXFU(fH{!1G_vc(6ctA!&)F5QK?+W&y-Drijl*AA7S zg~d{MHPqFNV(Z7w4sQI$ZFN(H_mqUoud>(xf&8VZA9~#Ww=e9h&v736+9$p62mU7} z$p*T%HKV)n0m_Bz7oP7Yze+KdMLK2}CO@Wz^@8W^+nHiV9DIihWS~Sb?9`HbmXxVb zY-gh~XVN%+wD`$tN!AkLsHC2?l<5exl5r}%hejePT%^H_GLAG2#S^7#=&p4P4~RXpAqK-X=YYU`!aATH2`y{-6lrEt= zc{8u9p%m#`V$_i-+)6UhV%sEAyGp_g6h#n!QWDQxpBvx}^EsF683yVUlMaR}mkB72!pQS>NI6LfW*`g($ zZB$nF^}D79WF%BtSjsIo)M;)<3pod=G(&dj0q!tUG-7ab~w)L|OX&Ey|AuyR9qf9h9|&@0(lOWVG}=T&-GsvlEG zFY{SHA0Bu%vzS(2+6xVq>X%)bZ`=IxpZN|%WlPtU1^YfN@^VpNXYf{o-)`D{bA`w) zQ~#Xbp_CJ8-y@*3-w+%a^(`eC;!i#ZS|=ryK?W#bz`tZKiX=feNk_`|7kIn-u$j;Z z(R@36Sy_V7a$^hG>_(X-OYrEQpg1POv%^ur0H$kFBe*GgPoME-9#8NuDXr8HCO&8D zf0+~xf+`KA+7}_D4L-Up<}S=5XJF{De8Qz|QrO22m)q|@FwtZnB!ucs49cnD4L#O) zXr(?`f$et;50@hnNnVn|$3*`RHY1kSzfVh;S~!VxU zsGHrEZKK(sA+GA=Ad>kTUX*_Yyt)#DXnq56sCbmO^cP(OyCgXL4+Sl1HhIjkog7~4 z%VqE$zQ0Ioa4_Dh4!+|jHX!;R8U1NK0iwPfOV@*EbvWpv%YG3kDyL@WJxx zuoxJ8><(V!Crk4Gfg?p=(BBL;6~ymB*(&zv?NJ!VxnUpuvfOxQZ1_}tD~wp{JeSOZ zkBfx`Ad`=M3Up88x++{s3(Ne!xx_2L+L>=-9{ESs26Z`*8ec?xptOsD$HS=&l)X>Z z)POBptj+^5;Vo;_9IyBBT3wotz3`9a>$(4xYFPoCH|E=X2IUrB1|A7S$qIMZh`dYz zH#z90CtN4bdp*)zQ|v>{^dzgF)(TrVca<-E3{#j(rE-#+<8prkdR>-dYL{Eo|HWb` zxp~j*pbs$o=`VxzuYi{Yu-U$@gXYIS#+T&*cE0XST8zj}!Z9yOYesVUo)knp%%miY z%)gfuHg>SB8R}^Gx_+3->#zQpV=3SlQ?t=rT-De$OAH1&>A!A}Uof9AVowS(1i4iP z+HJ}fZ`!l?5}Ua#wtrJyPOtZP)n!}d=V@Wq zASG~D)HjGT-P?%h%~es5!;RWRKY8p6G0`2?)Cc3ui(D+kOAQ;+HkYwAVWM zJQG0;9Gv1245%{Juo9HamU&PIXz@>A&C9xVvziZJ2b#PA`R9=rnH@y-+2Ql{xoF`B zC8e<7R1fk;L-7}Ti|i=Y&$ByK(vSVj*D6Ah$Q)$T$Su4%$Y-i1{Be8+i|vU)+xJUc zt6Cwv*WtL6tF20=EItc&NFga~C|Suaeq@B4#jB7)oDb-3I$O1VmXr;wAd-0e7iOiE zKF{0Uh#83s8yFLU&gEC7gdUa1+2zgeS8jNqEkl8p#DSG2owo-@pwp@gs=5m&Ta%HngJSuh8i$R=Yx+{vt2n4-sQFY{qMBO(6y5qg4D z)zqzhf|d&Tty157wrAVEzuUgnv=Ur^!k54-hbb6WsTF^okh?8w1wUliU0P$)33X?b zJMx(H<$Eqd^~uVmog^l%zDya?{QEOwe;PnWA@5>O9L4m8eQYo-!Pswi?o+ebEM8S# zT0&-`>A#PgCp$SH04C4VdhE4r6;%_K`gxY7eg8??263*tL{xNVcr$+ZMW)Pmk--=G zog=&K-#IDRw-#QiE_(ep(xSvwF(Rukf1U}bS{ce!1ER7w<~_F2i|>DbP*MV%DlOFn zWES$(Vf{yFWq^yOjmj4j6FNe}|8nE+W8K7@jFlFftcwlTVjR9GGP^DFkipO5i$kgr zHzK2unTKH%a-ru=eybmk)8fl%qiP61G4SmoDE**or$_L(b$FQT|AL&+Q8U}?EHkLJ zpAjWe?g2c9U-(!~bV0TUwyWs9vkvtB%wat9a-+551M-^t11=z}S7pKD7OpNbGuIi3 z+y7glwrrQ&N9l|yX8;f8zQBmq-YS#VoDZeeZt}=rP`&Ib@nt)8i6wkX>2~VW zI`aLw#H4XFNLGa&dNGx38zailnb=T>N$iBy?$6f=m@_z+!xGQ8q)sXP$7r8%E|8FbBY}*2A*dRJmr+VyYJZ8C z$cJ#9G5}Zx^tx0|&3frz`etix1az(cfSM@xL*y3PAbyZ#p1x-usUgDFb5QsChAc?c zkRd{3N?Yid9OuJQJb2?Xman4S(x7CMH^O#alN9>ii9!^gOUK&2KRH^Y4dc}`j*2X~ zf@66I75$nsnu3Po9{&JsaE!Fyw zOOu-Se0y)5n6Yfb0LpFRuELHtQ+{h_>99US+L`S|rQZ6@52&`z9h|*=J2=^Yw+(Kz z4f+0#KFv>$fq=Bq%LfE#((>}UfS2!Mm)I;2`M)ssh6X}VL&BfoymCu~eE4qW+TU(_puR;ROlNJ1bu|@YR%W4?NY+*`L)}bI%(HsG=z7aM=bnhqD#56PclQB3nb%I|6C9E&5TtR;%k%==Q>&f zYNNjunE%fliiK*TYFT^Eqe6uufSY?!^VCJzjkTpVL390GK~{w)>!0BlwCy((>ZtA1 zC!i7Dr#`&wT&j6NBIGj-CB5=y*wdf3W&57RK4?*0B7}3PEuHV1&tGeiZ=vsS`7>!9 z|L#UnX-xk!QCmO8WuUxMa-Bn9AEavU_MLs(Ar{7Y$O5N_*{-)_i0WK*ov-aa!9%Tc z^@hIrdiBHFA@_Gi3YWVMxN5gPaz#NWSFfRQ;sS~5QFD{Af)AC=OQ~n4-!!v_0c0!Fa-7C8p|C z6`yWN0JN<(B&h+N(ezM&ahpA_?<#x;EQiu=`^#L=PfWu5gW)KvchO$#c`pGU1m}n7 zDoEc5mPJjq7QMCk>Vl#3hh1?hLz-pfixN#4lzH^~F%#qZXt(c1Zh@;UF7-kNL>#2jqQPMi=G+MW>RwjW)Mg|_Z@5-$v99lUM5*HHM$pEUD`db_D#c3 zt&cFU@>#G1!FJF{$>m@0y~|N0W06gmWZ;Cnsm21(zAZy&;727zM$vfvwVX!(5 zb5I)l2O77}wIu&Eg?-xT3hCOsQ=@w9h>ekxng*}~=}sjppX&n`O_W#bTg8NA)D^4TXd1ieK;ByJEY(bwEkjRuF~cnlgp&8s zep|uYP*{ECgQSsYT8qq*n|N0v6V_5$o6R50S#B3rp&rJ%M>Q*LKgxHX1ngjOGU*XU zRAzE+$sHhDnqDJ&@%>*8%CFh4)9}c(c{4cd-A#x{`3M@0=Q(^;M=?1n)ZATuX`9kL zYDPEq6J0*;S}JNJ)X4k&5HJw};=f+82s9my#?b zYoJ~pLbBto0Ch19ExkLcD>^6LBV{f{f0Y=nXg^iaW4f&W zX%o`MZtGvWo;_;HY2)7;L2X49s_I;$C2`HQ>X~U5%5qD-#{IPXWYD1RFn(8pTP-Gm zhk&G;Bl30flM)|qsVS@X{I;cL&igYr40+6M0v`N!?HWJB7JV<)*tmb3C`TP&BPvX8)zO zjO+8%VD5O}X-(Tk-Bkgp>^I3Uq@dfXw?)PowEjGKbz3w7Zv6CA5ZiJ*lp9a)Kosmi zI$yTuo1Hj%oM?8rq{_OyffW>7xM+=}-sq{hhN0$wp6G4l)mBe-_$ZmS1$$i#)-~m6 z2*F5}ppl$4Zz|^8kY7bJ-oFZKBB7w*uD1ejAf&X3{K>29^YOB!iMp(N^%J_Ah2>BY z*^6z~ms}>AJ6Fr8A25IP7%hG{JO!O}SPYd)^Da;xmiwQ!pFbDI>H(r;F`wM=%89e` z3GCHfok>e-GLH}7ZwxP5W?B#>I{&?X3;sYG7bu0spc2BZF*Ykve>COT{O?{ICG8f! zDLt*0K`;*d{YYM3y_vGK_uOeWXK&Nk?;kKfI_zx8chdB^wL=0b12PXjg4EDK6X10@ zP{yc(PQkf)3L<0Yyt$W<=bv{&ubaem=0BWywSM5A=Uy)pcapwwphW1Dp%u5maqg*4 zlxtzvTbvaW#w0m(?YPsFlR_ImD7oWETkXf+O@;3QWzrl}C$6m0&L+ug{)`IqTx9!? ze{%cdM>@w~A$Ey2jgqZ0{_Sy;X!;TD z)pLx7T_J7V`?sc<+2zR&Wz+*eE7HBrsp{){t4&P{v#N_aWTgRClVZ@n3IS(5JBqG#%2+05kmpoWM> z)HV9#{+MQHMN034k)IyaKXf|t>Mox~&0mhW@3jjj-vo!tQFmR|U9gp^TIS|-4jPfq z0Ik_d-gzVbb{sWwV#>m8(SupJDDcl+5qy65A~wCjw*Oo@LlECrt=Hv>7U@ZU1dgaNqmA z7F+LJ?z|PJmmX2^K>zQh*a}#@Nu}*fbH*v;snY2?+;4GoaF1j7U*GS>wHtNW~#u&n|9ag z5vpgADN?~8QfXd(=%o7Cjt9$39hFoQsXf8Zp9*=WL?*KWcB z|KV4Z2|Wet)7?m1GO<@lijCzAa;N%yFiiT-OkJ)q_a=2; z|FK3)j*%W_R20LV;A%N^6Ipal9yHYkVCjvNsaDcmXRJG_ilfY};0{k**D~w;+Py!& zXzW^U50yBs%nAJ%xX6DbfXK`vMKldTYo=XQ*qY8=DgC7IAkNgSHPek3J7WW(^m(tZ9QUcM_&WHK*dliILIr#N&p{_RSC1{2HWa{8pXU062zPbvBB$*GlN zch!b${A!`UN6Y7DW8~W$8%TP4fjH8#aN3TYt`~p7JY}KjJI;2_Mq(!O`-r9p+#Dma zoBy}BA|x?rxli?NH%rX#M|2?_T0r0RWZb`ht${Y-sj9B2m5lg`71w*e&+|H1h3?@qlr>C}~FZ(~}8%j&#zWXSK}M5Bp|ScWM^b(PXYBNaYj!zTFiRM;FXQqJ6% z?IjS@2azIe%?$}HQ6lfnZdfDU>`$Ov_%-%rH;k)!%!E9DS4 zMZ0&W$MjX}YhVdpg(et|*a!}a{KheaSgI8ceGD5G&HFdI-$(DTr)_BYCy8C2rO}yd*z)jR3wh2fCnk(s-^hhW zZDU^1I5hF>&8lXpfF(W<-0{Az0(z9s_Mw02&+V5>KR5f4FXO}df923dGFrcbz-vn$ zvg>E~YlLr!u22C2($Nwc{;DbN$+5N*_q-i0`G(~rndCXt9}dK(I7@>v4haddQoaQh zJ{^_G(zRvroi~DaZgl%eg_?9^yKSgnQ*|hI=Bu4(?Pl(LD{&IYreB5izve)=%zyhc zaDVOZEWdx=<`BbL;9T;-#KWLSX|`oWTwue+vA0XYjwQU_Kxan=Oaay>4kFnz+E` zAQDde2&^#%Uk^nFZBvx8`h#IHo27scMgC3@cx-#T&*t@P!3sk{T0D7O9^jFkj70X- z;!OJ?V^yaNpJg+n0{K7R9Ssefke`PCmp-$WkUtgqthiu1>uoe271YlHSI7e4&s=GG zxda=*t3ax<*YVPX5XP@b^cwWqd{syhonsP^Hk$gn}$slGruCwTs2S|JR$@n zSa$~M{C$$m)kRbAb_A%Le&2ufc03Fo89f6UIjAWXTJr^^Cjaf1msaa*ci35K`m)(` zDq&`R?FQz&yXI&Dnma%yOH&ZgmLyFs8{5s-vV`vHqL7hIe3LYLr^?qF3Ncl)5`!*7 zZWU@h$GW2pg*MylHsdKb_$qUyY%9(2zE=`t=~ohv)&u|&Ug=rT;z_eB_Pfr%yDgsZ z_HCX1D!f3(Szb!JHP)uV|G9}IPlr_~#)$0A4|zPRpW&ZBnX;2F)+3h!#>&%&e_0^! z`D0GJkf7d7Ggh~IgEn$xi$1+Z3(XMGr%7aK?D!MibTyvUBv+(Gi|mb;;Op-{6{*OV zr61)1KZqE~NJ_i3G{QP_^GqbMJoE$kLBYIr0yPLY_T)XC*W2(O1w55cuyMnCZTkp* zGEb|!X=Gbyu0B_An}@5J7zjnECJ|R!grMvB_anwqQR3or`YZ!6{OW16e|Y-WtI zrGnJ8d>iPT+I?*XL7PcIswTmP0@3a<0 zNLAT`9kBVz3AJTo2wDn`5+Sj=NbybRj4uO(ke0{hFG(><#I0lAmcwtD^unAUz5^oJ z8ZG4dU+?XX2J+o)bM^^-b(}B2z!8Jto!w#0oZxvoZU$6``t_qhY2$ZORnvW@r8ZK; z?c^Mh*eccXQlJJNM2jT2Xt(&fcX4jnH#Ze#Aaz58sOq9Lt9f#ENHb)gw{b>j@#X^O zE}&54s~3=Yvk!atC+=Cn``Kgu-33prowgT>A&A?QR;l4b{!)P}KKdV)3J2R(I%;2cIC2F{PEG!!po-_50bg8B;=IL*8wf$h zBf$Eh>M1~FK*Y(&%iPY)*;-ktmg?!PjTBk$z8`CHQAiRoCgUrRso0-~J&ATjnm8FVXpCy4t9I(K906`dUTj9U9G|kb72Q_00 zxvSB$N)g<3qAW5AYcDbd{F;qs4u0ul&e zLHaW>h)zL!9QYpL2jYjOc`sQzahGNZ-g91lmPu^R^On0;H2WZ~vLm3Vbj;@8Qoe2#(v$s55sR z33XF@>)-=UJc3N8@6(~z`yy_u4AfQB|QvyJJR%(Iz6xcF=eda#&0T_r6BtF%+D|UcIa}nh z^G(_2D3SWJ^A7Z|<~XNS0`{M;k9b-fFok2U=4==I=*hpWM^1{}A(JQVf<|s!&-9Ih zW#5{;#Sg2C)(Q*^L?M@vtu~<9DbCv<*~Fj-={_5YEBpM(rC~;VYY)-Zlju1g1}grf zK*nIRMAZiLB*3Jbj-2SK+ROerm&GRGJ;)Cc4P||6wt+P)w%R*)t}0 zPoBTG1r*0jG>U|${~DVHf=9nBaRF`PXT1=D`l8Y-S)}SEzKd4st=CbV{KIoZkY=%- z{|>o&5O&7A{WWqHCdXn2WDFLAK;HfQ=ji7=%TyFD*27IkUZB-{_p%ki31%8IK*W&0 zf5oK0^Xe#l{?qaS_hQrK44upk2_b_Wdsi+kcXh~XQvG~n?qRilPPR$35F#Q?oQz*V z(@mpQh}Zsa*j4#yfWL#!TU@tGUoR|#B-1|Y*CB~PgQvR&_NEYSTEzDVS_9EE)SfqAg%z%*o#BT1LMN4-)-z%(K5* z{|(j&b)))@j}k7q*l|}Tu8D(mMNTx%ATR!474!8TPYBYdf{@aRmkNLmC}cedFa{NM z?Oj4+&MOu5-d`0>J5}z#eL{crOc{rnI&MbjpM7_NfzWo*D^MM$ zox~uJGE3CETTa2uubf!n1Gu;}#C~nsRILkN>O_}+xmJ+r5C8|$m6FrZ;2S8}soCeW z@AX~whOGwQ;`={5LO)WWE)p^IShgkodU4j!v-(m{q&%ZIdRQ}dWX*wz3FE**Q%Q;P z`_vXs$3VB0PLv7+oPa+_a>T#}pNc_nq-jCcK}vyUvA7?RO0f*fVj-h${54a%~;STDtMQx}0M!Tdrudw56Zv4<8R^C@Sll!nYN&fLYD*Njc=dqdj`IBWl=xHPm{U;t_kJ~{PVV&O{7(0BF zt2~Azwe_Ydi8~mXqtfh0lRn@04=Hm8NA5{nwSDHbt-x)Cv3CnbNB8}08) zmMX3nDvZ1yk$CLMs3n%=lQ1;~spGe4Sgt#_z>zy7%_b&D;~lo7hZbve9_7|^#c~<1 zD(M%l|39pKc|6qn_y4VKsT3+v+1hSY*4*r7l2*!+Zj`mOlC6=!7*k1!P&c8%q>{RY zlzkXiwjxU?G8l20EHl$sW-w;vcitn?T|U3NeE;h4sF?S>-sg3e=Xsvzp%Shh&{Lur zV$f=NzojObLUAZT#PIZqyiXs{90Fgkweg7iBFxoAotT3=)CXwKuj}k=sFtg}LFhz} z@D5W6cF?-}h7&hI$`aO1VXwg92ePT$;uC4(3WXQlOJ$f~CInu#fbqhb-$fvbjbjwX zsR_Kxh5FEgXTujADCCOsi;(_)l7Q}EU8A6^7Amqv!8G0TG4ju>oq|>Y)fy9KiAkw5 z8eLo7Q@^~>_c^z~N`WW`Q&8-fmffzW;QzB6G}|!m`3YOpI-2e6ie)q?pw~+k^&hZC z$#4t$A*Mcmn#&x#ccvv5t72dG2~3J)!PTE>cqO;=RnjbCWgY1{Iwss#P&Bt=5GvC8 zy^S|=m%nKg)|S(Bq>us*K`Q@*b7;`zoRy8OZ{83#tBb=~pOzVi+nMURd=xg?I)T(0 zLL)Yy0o_dW;zJ6ZG`inUvA!0kVFmgor9e*jZeW6HTc5jpPrMKy7Rn2Ax^IMYMUl?W zi=#+pBdjNpl>Lu{UcZM5TlO*^X8zQ+>+sme##LRd89Q=j6gRvA(Ty2^Y9Y)nnl25_ zz^FF;`LO_k9_mfJGq4zE{1%pruv{kCq{$e~SUur|7wRK2tBS*W)Gj9hq|P%~juih$ z0Tu|Uy>JXH6n<`M{MR#Ypm_&fZ@ccp1A87_aot!?-;sCcz_M>nAQ>UtC7sczB_|*8 zC|&sUVQ2Ocv|;rv;5IwH0T^i|!rcM&f%rQ`4MMO@xQisfBNbUrs|>tTB;T&wl@zoY zad$%9#BT=E;!?-kU8t?kD_mD%7{9<)m5ydpfPHtMpdr6(VmMQ$Caxyzh^l1g5mvSL z=~&xZW$BrFhuOK;e}59du;zU=0J65Z_B~$-VEhm)kLJFxrAw=RQ9by| ziuB>#{+1D!D|KBcuxGVyfFiGfEIGmoE30y1L$JM`DBQt2BEQ{lWdiD%VLI`Ud@O3! zs)Sez8Q~B?CEP??1Sb5m$MUm>4IUd(t@e~Fkk)OIlZmkJ3=wavaAQ9V*@!bMaYkzZbPv(RZe2lAF7qwlu%2>@iVQWl7dG- zk4FfUAkYNw&#&~47~y^TX=Cb%hBCLCk^BTq=cQ>RL0G9rv8sZy%XHx1^Z@1WQsn)nbCuiIygv@lybu1$&`Af?$iB?zC za7ai$M@Htqp`UkeOfxLWjL2bo?(utZ(RllXfe8C@)B^qAqWZK_t_C3EACt#rMU(3q zS3<^st#y;dSAkgiwT3OsN`1UaB53mi8 zTK>WVtDCnNy80KaO1%=dI`!6`Z1n!MO3apF9%J14`pM-#k)zvO>Q*=5_ow z*N(wNWT?0sraml7rUj%<$TY$AO!Ci}dkwBT`EMoUL&tVcE)m!#okE8*1?K(nO_Y}| zT=KASg;?t%SK@%#L1fRiXxDJGXZ0{tl=b(#f1NvVFZ`-+VWY6h1;3fygLSWigXP0s z2Y(+Hs;iv3rws+QLHjtNw)52NWK_~!W!Vzg79<=x|6fmYQ`Ru7U;e|P%N{=;L-2>%whLAn#5^- zrWYDaC4s3;=5B%s_zwZQk>1K5$CZjJToq_7dtIO8%b)3IwRkC}`ELEq`wn#UHm1f$ zDO@YnV(DdymkXJ7-`uGfL}Hqe=lxdAF8UCG(-n zB_mC#S%pG)?mi@c!+#zro(&Wyf#{GdN&dNcOWa|qh<$m9PdTIQUBw@srI}^~+`b(@ zVM=6`q-NdZ9zt@c>CTUPv(d73LAUzXOQ%qJBV1ft{&dNb1t!zcn5nkOAM>DoYX{QE z5Qsur*_~1Op+saq2Gt+hx5i5(CCQ^geIWh8*K60IO03^*klgnAMI5z3>bp|c655VT zMMv{ePRs^7c_h1<9=!sO?Y^tkE%`0e_XKPz?n5|)s;dkCl%8)VIorjtaGTw%$Mzz+ z+sExzh0cYZ*8Djv+Q8d3({KFprkFP#N5nL-+-3UlBBcTcocw--^gQj_-BoRq-qUyy z^7CmiqGkXI2vA67!B7yW2A=tS(A;7Qb;6YM69&}jr+&Hg_Os?jwMDKCst0LO9kko2 zj(y)S%_lwpniu(ygMbBnI}~;|X?lJl{I_XoJrXM?lcR}>NeG$#GvBBVS1X1fRZQGf1+sZ$>#kAp$y?O{P$BZgrVV2J#(xw>Ba`D4(oRp4gQ}!@=#k@ zhK+=|if_gub94xkN^J^EYw_7wW|o~6p8J&$_6P3?T0+x&qHqv&dRmH{)3j*X0 zH#1rC&3d_uN;0R*O9NeAM|v#l?*upO_`RuBJUQ(FKOIFv*15HcNI>C za_?7=z1Uc#A{X$?>$Kxn0RKN(b-^vE`3VMs=0uH5rpI35Z3$>yVZPE^|Cn85{5Z{- z3CC@xmrKa}#n&uSH~2zsQjmxY1^mXDs{d|$$ipJd8)Ih|Dm5?r|L_#9C@wx7Hj|rR zrk)IpTW9@sQ&v*K-qlxYeD$H z`q|~sebmaLinD0b`!a~{TroCn7ec|{$cK#{h5@VivW{ut=CJ)6vI4||A};)!ST3A# zoED6b*F6=nBOii3iIhl3ABzVqY9wGj3=U3du3wxw)q+88)5(=4bMELCM>I0tl{@s( zcY-b>uOP-TZCzRENOR!giAmTm6XgHEww?@}noR>9BSj$We+t_dNK{^!&$_FZc5197 z$oB$>>%DiR#ac5Ba9#Qe?>UrfLabv-*WiD~r|GWWkn40muh#A1^ajzycas|S=w6_f z3xfXRkJ&d@WOY2APbCY61@Fm)H|-22Kp^B4k@$6LT9G(W%uS22h})XV%&^4ME!0P< z-jp?E<)EoG45EJ%T`A;4b1XobUej@;EaRMn`8p5_2t9OtrNf%}3%LmoJVD@PcF-A$ z(z=E+?Qd|JzyD&#l@i>7EoqGTBP=BX7qxzZ7RWq59){0{dE6qJ!C~t<}T4XDZ%@!a^ zx1d5lO;+LWUkYKyG%^r?O+qC=XAD5BEWKjDXzz zH2m`alOcX{1O0Q-?U!*N+n{4SoKLK3-4-RDaJnIjwC>xFo~G8`t*dXoRp+;q%OG~e zx{FFR7M74x(j(#1AQGW*PRG|ETb-K4=bRjCxL|C)5%^OtLQ+?=-%*x`L+U&2ob$Oy zv0_#mC8L~ON-y4qJr=DD`cBy0{X2!LzkU6+Bek-^I^+!mhpH51^hG~u9SJb_Au(XA zD3NFPjhu?-rGfQx5LwW@#7mwhTlOE+8)*Y=w15A>mJ&^>p*WziZ|<@Jj_963K~WhS z$I1xm;iMuIqNyB7rx>or8&?P5Nv|teB9Ya(HEJnr4u05aY@h+U-+i!b$HQvL>=0*e z-oJh21-GK-Rje+0UmxCcFWWZ{#L-&A>9M|>wgTk^0fMJozM?;kRh{4czs1)lc0c@c z*AGu2=0X*(PhKXwLYwn=p6NGvzu8b(P`uyXvu_$)IX>kC<_%Cp?rHMu@A;nfAo$E3 z{Jpytl$~q-t>MAkUK7Xrg1!tz!&3QC_&nD47wI{8bK@L@y7u+HZA)t#Jx(5YcR>J~ zZDDQ0k${jo0KX3yEIh2ox7od_B=+_I3^<6D1@8km_n$jroF+chC`CzKW${IP=|$ky z7HktP-h_m<{d*5|^R32!VaPZhbIq~YSA?G|IqP93Ex!z9X%NeR|Kwz+bPPH&|2z(A zM`~w@e(1O|2d?$~@Sd&NB$;|QQ%u5b&u=o+qIn&!=iJFH`d_N+ITjTFd;N>A7t*u(3sZ#D1QLu^_K zZ;1!2ks4|&tNJN=Zba)seX^h#Bx0(UVmnd`c1-xryT)D#2??%|hNTaen25|8(IyC7 z38Uh@)MQs=bHm>)NPm|ft*Q8V3rzM_EH)m`)k>8Nv*8#Hx5qZQpWps$;!7HX^u8f2 z+Wc-$RGm>dq)mnoUlMtP!|C4t;T?N!@#FN`*c_6lfCM2s!<5c^L0Xd+@c@JoD+I2cd_+>cb0iVN|&v^EzuI zY`a4sK;xTYF6OsyZLGfTMSe<;VN58$2%A}dQUCnz;gGli6{_AKXe7_RoT(8Hr_pe^ z@C9zqUeUSfp9{09WWBT*J%Hn|bB|J%B0EIjY!XdG1l>$#0<(G%jjj{Z$TrXZS|I!t zWbv(qOm^JjKjb@(m-WGLl(z|(K5OlHx&~;%61v_pynGm24Mk{RE znt{K-5Pnw`2~phja^4;M`qu5iLS6FMI-|=E!?=QeB7rpw-)hJ=?95AV68;FQ|3JS- z9&DV-I#P5x%q{!gkE}HOSmrXTKr~|kpBDIexDXqXtkY6!UvyJgdY{@1YRDVqi(@F(qurtVHUb6C4 zIwbFib5BY2mZ-q6Gj?hHq;Gy{V4kM!F^$8}cIU{-%)Y2gYr>-)$GNf_B1<`QHY)sM z?=yM9CCoQB<(rT1Fc`k8l?p&v{4G_B@Y07}9KlP+Rzn6`#VP)F8$QoYHjg*9Us=^H zo&MfC-DO<=X4pyRx2{*`Aq&X12BfsmwoITcr*49Dew-Ywp*g3=?+o`4YzaQ>*mF0a zK)0hMu>bLMR^cO4Ei$=yXxiG8B^m3^ZpFI;$$bo};N}MI4#csHy$R>zFVKXN_K@c#@%aW9LddX zRe$>PZ3a9iJrt}<*8UDr?jN-|1ZyGoaF)0@yU<=s(74aNf3nSYM`JOz<{?6DHHeQ~9 zto?cdj8xC3o)Q&`da`58x4IJ#xlhkG^}vcpD!pj-(D}y$;{lI6pDm?*7`XC%CQ+p! zknBtw7%-4`nH@Y0Z~2??@{7f$_78J>q~E?PTL5Y|OBDFk42|I!Ey7Nh&aKK@<0HaJ zKQ)%UgGs5T>_?eDws!wKyU_YeP<)~y7iBIPYpWNFOI4QU5C`-V174aL9DAt6h#Rn3 z_W3c)&rJ%b|Ff&Uk&=%d-ytahYncYAdm=pk)MRbtLasehn;Q zIle*ardeb$9-FKhZ!0^O=8j_=dDqjo@R`L~g)e61*5i~%1PET8W3apegOQx)(#gKX z=g-U3h#3#4rusM4f7zJKP zaP&>p>b@N|R*50i0mp9b7e?Se@?7mEZT>kVPX;2UKg&`t=lN6#`>>?)nC%q9D zcMTVyt#5+RUQt&BDHn;Tbn{&qglOY`>2f@OpyqLpytNz0xV-^2(A0iw!5HJn8&g;@ z*4xstMF(q}6qMF2b$vo5A*6n;|tfzWY0e6HjCv|9IHo>e|f(VI=MqMTLek`{$1e1HzkQ zjVq|enHnjOV~!J3BYQ%=!kWK(2~HmaM!RBa_@BiEpEsU~-y=YW5TetCti3;POy^wC zmhV@MBMCB4?vO2555)+aII&|GFcbZ?CUffXB^_^@b5y?OoV_L@U8A}cP#1sv8DdE8 zXp~#AHwBOG%tbqf2_i)QIt^Rf9i~S1Wfv>(KYvK$HK!Tk2CKBOYD#AL0zS6}#UJTB zfgXDsse0&pg*0jIl}7!wWhFIBQG)ToFzL3KwB_8>*GD*`u*jPH!DAc~KnE$R!5&y= zJxtAIQ_&?H6t`dVhl?}*=a+>Dl#@C@=MHPkg4V{p!q3*&)OT`tlOhxlh6)FWjpe~- zym}MUlYVcIkU_;Mbf)s*KysgHY1!@5QzFC&x3YD-7Oi&^+yn%+t0tRyWnt353H>4J zU8o%Y!lMRV+~(H~8@Tz5*(FO#I>s&6h;|5kd5187(Jq}D#VPyaX^b`R3R)Suv=u>zuz*1-$zKYO03i%=;7?LGup?n9rUy zQYnVwjb#g9NZ+Jlc8aAetlnHBB{1ilJ?Z7Dt0h%tR;UnmjHBXX`1=d3BsbpcNEn?b zf|2^G-eBv*~;oB%}SYBAIwV^+zoU zbG4AIMd`q@bX=XVSY6RDS}bBI`ksY)a+Cz@OXmNX;d!6tMg<{F?)r(o)U#)FT^;Bd zqK;d@Wm_z|HM#uVdUM%F>!ow$56wLku`vCvkN*kSq4qJ(?WP#{w(ZpwAkGB*%7vif zhOrAsSy%1lWtJAN5UKKpnwQ}%(V0xStmCm``BPaL$Bo2mOT>6_nUL%g3P&7t$>Va; zY{Tt|f1Sq%&C$&`XJ|4@fiTXpHaB`?n7LAlUOrP`(=PAq%8j-9?vV-~vh4I-1fr5i z6lG6oO@K0d?FzRnfgGF1K@U+^k=5{Dx0O#I_C?=y1Tk#Xmn!IaloHK(zXlMSYU0HE zP1E)l4Pi7}vW}GSTV13%}M)IuA>e9l{+Dt_35s(-u|6|0e7i|yD#n3S| zH((d)?5vJ?v%JI>)?a99Nbj&69el@rfe#<(`&i3-0Et-3T;^F4BuT&{=nugVqGX-h z79ZlPz8aA|C4s;NIrj!~#8@S4|Wwx0CClj+#aSVIsj&N8VbvDceH_p%zvOkK_G@&G2uKJ@9u8 zq|3L0SGOw3$K=&GhYfWaKWw(oH4d2j6XkecgPZfX_nw2_x1GrI9gZ3$ocL1RJgPHt zt2VHT@2OsAYaLn5aUKoE7!`z2ah<=S1`hb!q7pY&ppxuB^Qy$2P$dz zt4)=osi;w>>Q=Uz{kY)631UG8UMK5SVL5DuS=b@CV`YWwD2nR{R=y*gz(5efV*{J{-Tv}j3KUpxt5ZS^xeOk5@{`hCYNE{4`Xi+>(y zHGureuuXt{+o!{QaRb{$;IyU7yN7h*aJ*;h_$YHu=28N|jHj<>NVFT!Y@IuC>J8*U znjNOw%1b^$-};VTm#l1|mCIn<&G>R>#w@(R%;9qgOYgCY~_Pfm0X)}Yf8F5dI3-GpH_P>-+(Zg26*?dT|ZQxV#6svtQ= zYS#m|uhzXgYGD@bzH1CLGLc!;>m%(WqtyCee2Zno>{se{h4<@9U+#!VuXl4%m>;cB z;&>+JL(WQ_R_kPJ73{FRYs@rgGfZu3lYNot6%X&lCQdZDyy5*1v_hMUk~D4;=P`|J znq{w>6cij~e#eiq8{YzEK8X}BwARbH^j?8o4l|Ai7GW3?1QcyEtfI?0LiCDPDl6?< zn12%Ha6MI|V{US9hxT2`bYov3I+N%>pzFge^q_|qWG1IasJ5ig_P^0&+f(D5tCu)d z)x4{ABnDslVZ6(SdQ)u{SdLzr}ZkuV~LmAj34*oKJLk%%qeG^%$JiR8Atx|Wj=C}*7yaaWBi-kMp6Xxl_ zjze8p8}%d0fo;C>V9G9ua(h`4H{xAR(II5|I_+#g88!0&dgqj>{bCTfW5iyI=P<4s zhS}z)S!RPyXJ#gMP9EEJ-?3j!730Iz$Ri!2Azk$@EWsTQ@@QS4m`a|t-nFE!*~A`~ zq@a|f$!olY9`2d{R+XB;SmF^~NvA1$hif;~yNTJ${=&P;T9>D#6(xN=(tbhHiLu+0 zkX7waQvkB-TMW4Zhxnlw$0yJC$K6QgJ`Vpt``OLftjB2vm3x#kGF}ahlb|np-NM#@ zx_VO3D%j58RWV%bT!FvYC~p3NGlXYOGp9@g>6nTdd`At2_CtNLbOY!yr`@D-?*qrbD}Xe4%#)qQt#wVk;JFeW9bfS*k@1fEjci%2@A-- zypt0T*ik`|X#U73HtoiNFoE-(Gib}_O);CJ#2%zg&ABPxFhX_Bv+>~O~))J#IvrOhQ2%q-I4C!{d9!X?)Dl|6VWrsz^EkDK=$#?F_hoJ0B?SoxIL@FRt^KUl z`2|Oxptq1tt~&Yoo`~`s`(+I|A{Vdi7I~}vWJb33sqJCPdcJEmEqJxfDD2^?-s;se zH7+bSxRoG5d2SrJGlS&3vDHlRc;LJsm3)n5LEV9=?wQq`T=RFF{qJa)UcX-)Jbd5c z3#qiYUvOv1G;YIVn*C*%2idufar|?`>+b93Xx`B4kje+JyZ+;41=J{`KKEf=Q8^@H z4V~MIww1y7m%nKH2OuSB9kkMGdaMdK=hf{!?Jt|9n?5#4ztpG8yl36Z({?{W595xE zooTskK77(tf;H!eN6RYu_6t^MmGPGl8wQguJIY$s7*G{7m6C#HWJ)i>Vjmq0y!RSU z>%cc-JUrf@Xs2;73#0Y9KS(X2N=K1XW;Y{g)+2~U4{yOtR?wC8a(fmf1?_zAfVR8e zYCSHSk@lp~!Pt(>Fr?|)Ly8V_atk&L+T|~4)QI6ONOvM>v-hXhoxQT;(r+c^#dEAhCQ{nA@nL9VP;4- zh^aI<4t9_WoQpKDhfks(_czsTDZAGSfa8L-sJ%G9J!Q#moU(-=g|fw~twqkfa?lya zu;ZRG@p5Rp!c*a-kGEAne~>jkmQG=m%x<9St#+No{;Zk&#oh*-he_3JuXG3C9wVL{FK(i)?Hf`Usq-J6F7 zhDUU3dQ6n$u7L!a#VuM#Jf@9%u&;JBV6;hlBA6JR_X)&TTuwX|($~eZ>&R)l;@Zo~ zY`rX%VyP>lf7Wp+?cPiik)Yed*bf@fH9zz_Fzr?eUxDncMn?kg@r|mT3izSlNHM1G zI|R~>0j$S7L>n6$6Xe;WZ}X}iCXYO37*N2Z)+I-Y`8-FQq%+_#&$8Bf_S%d2G%qer zG-u^7OYz*@&Mdpg+?DSg+Vvk=ERyburpS-s7J%nlsaD{}$?>Nb@N%j%PIN(_2~p2H6x}((Jm< zEBv8011w>WD#pu#l-9aBu5I%xAIbrP$kvgovrTVI*u~uqQZ8AIWA6h`nnZUwQ=9w@ zkm1DHX6rSMhjKPkXthh2OP~6sWu6i9=_PNd-aZ=)|7>_+<)KR2;?#a)QgoEqDX~*jG~{Fu_lTU#U3Tf&vSjxRk2*P* zAq9Ugg|M9z>E3cxB{%8cQvHfOlU2<34*&#P-QeB1QY>lz%Rjts&ta- zl*0_a$d1c@pG#34hV-ql5;6?T_-$=dh`3uo0)N_N%S;N|7;DPB@2#HurqqZ|_DKq= zC9~}U(@yB}<7nPL2)~K?qc<3@Pxy1YJ?xUcp2zEE;&tb?$Tu*1>qqnrUrsBQ zc=hMEP}sK3^vwdFzIT&}h(2hLM~Riec^c6#yxUB?h$lKbIKjaXhp()(^tWN0aJx@9 z==(~^+3s3h(aMh0IhD>D4F6ggZRAvxYi<-=u*TUocEwAuuTW=ssftw&IyK?(r*h|Z z#<^jmMTs5EhJ@nIsvFCe98sub!ZKe{&1>P6nRd<2crB!bO0VjlvYf8II$)0Fu`QOL z7`;6--ytjkn&#kegDMQ9FS83Co5t-_d|7DDj!&@fSlj?jM-@piFf5N zpQ<$tz04fksQ@89ret|k_m^OpD(sWY)FmCv)`&X*5yF$+jjo$<>o_tT9SffFw++(y znT=a1u0LN+U8CyANf?a#@{Q))EU22Z>H~5l^aT+ zpep7Ju3hI+F#Gdq^B$)=`{AlZsC)@-7n;MITv4-e;aw=>{19e%UqOzCbX)@Wf{tpc z3P+mw`K?TL16hd}FyjeWR)QZt%(*w$dJIWi)vvLSbd$9gHJY1XDHAsG{N0A3$c6?5UdBL|*NTXPBDI6Ec{SL0^}9(}@e z93Fje(9bU!&hP4+bULq_o&BlYdmoz1?FR$92MmLZnYRv97n!wMvvVIoVskf@h5ca0 zJp1MVEm71`7}XZ_s9ZoFKM!Ych|}t}@=Sk+0EzxyS2~8lWxJ3h!ah5{!RJQT6`8dx zU&XgS+5(wc_IC+%x`_yZeS*OLP!q`33=Qbo=-88xwkhT7X-O9!R6|j=r*hd8a!zEJ zZ6*4$v(zg_eW1y~me+S7HyAaD;2r%u1DtgGJ7$aF9LK<9w7Vy8S`H>8i`{(DOe&Ba z+`W;`3~7tSo3&=C$UW5+$;Nuej@R1?9nE){@W+pX$T-=8D6w@hxjK(ptvwD~*bY!R zzLkvyG=t=gPz^{W1sP@1YMh7KB2T5s9mlVHuMr(daHg@4$@aCj^5!4Or2_id&-f1B zdpy@gdbvWLas56VryW0ec-@!{54_bMxsKas)u#sMoTmcQ1{4u=R=>C@X0|jq$Q(8L z+v@~z&_XndTGvc(92;vaB#%+IWbK{EliJWc3W57V(90Z;XVgew^3tGaQitafb#Yu! z9!77=aZYLVV0Kt2tJ?>vII~XTX3%ScvvW>9lf}Ij{eCwDjL%)jMe}!F2R|6XHaPp~ zKowm^oV9Mdhn+c2l_|6)ini^0zQZB+C6TQ%niHwd?+j`e7 zn<9tkmq+^Vi2o!su&-w%)y!E?e4CW8$86%t{Z=fF4(Ppzc{7tcBFXdNZq8DclO;6; z!fCAy6=BKZjyQ67+Fp%8tS_#77>ybTiyR7ckEN0)bErqXrJrwWcW*@w>6%-1jm=15 zi(`MG_PTAW4(n=lnu$t3^j1IDrLlDO#kSaakWSq*G#`csK9-tE8)IX)K_;2(j&td8 z8pW-VCutvFEd0Xf4#6L{h2;1-CL%Zb?9u(NX0q5a86L8_3J5$My@lP{rh6E>~6TrymC> zVIEu*%X##rfmAQMY|jqJ%^lpOLJG&?`8mk(9Pvvygwc*=FNu8Lc089S_g3hs6AtOw zpxPMN*AM>ug5^AIZA_f4Yx$#1gq!pt`?u2#=PJjn!_{?&oo#UWlBx1FI}|^B0>8QT zG+w)3pB>Xoycy5-=aV~M@N`4bY46+{DFlO<&7JRGHCunN9`;D?CF(Ky<%or2nwCZr zM|70>lK0(_q4a+{23ny#^Rci`} z(y29SygD^Jo5v&wST)dcK+^C>_94XKBfhycwK7XOwnI@e5bVj_-Bw^OslOv%Bd5jD zI#P=p&HWApT|(*37{CXyD?&<2@@~S%O5UJytIM z;VN6hSaR{vtCQAIztbPR(Ju1X2Pc4K(v{@wEJ!&Rda|QJ9oY#zG?N1X-A$iRrRc9v z=Wx!oP*BB`=kwu(tA6?@f{lDELhX&bBmBnCZQ>L=Tl|RZ@$BxskxlNodxf{?i8m0Z zVQa`#UO4MGIHie%kYT&5ft+O@QF!m`t$}RtWZDde4g>$5^1{19r|T*$NEi(4gD>3` zi|Hq61_;ZpW5lpfFwNfEfiao1IA3vflwGs$wvio31%r(oVL4(MRsPN*~EF%jLE$EfA}M z<3nQ7v4k*VZd=2^c;3xv5qwi*PbS)Hc0%@`7(Cr0+g%;yeoKh8l^3x$bZ_YHGYE^f7d$Nfl@ z|E7-oHwW}WG9qO*4X`O5=uE1LiAaH9j#1gJiNk?~BY~fp{MH+SP5W*-^4>Yp6K*_z zx?ZLqE+G=9UD5M8<3xX?hYi=tWWQiT{n_27OO%wLsB$v7W7>rj!Prvh$6U*KQZx`d z)@#f2S|Ti48v|lD%Y&7Tbu(c1&F^yGjbfe z*EQS6nU!I@iQMo9Ye3FVaa>*5N6F5h~ zmHZa0&5(B$M}LZhT)${`Cf502+hZv3E*!O6Ko6&!s_7BlpuaecP0}Knz)<_R(JRF> z*M59-nvFoM&6YAxEs%;mJeo5Ze$TUMYvTq2I}s;Z{fb3v%dt8pXkn6Y4I0gy(3%1q z0nF2=YMk_xqeI#(rfniL5yiC(m(LBmd#+=tc+TsPdQE58ztu)}CUQMkta1f(W=#qo z#nOrKKb?z4Jj=NgI1>>HtbXz|nR2Z*%=d(3<>PMBp_jO-Y*~PI>^)`7a#y~y&7lOk z^-)HotiUQxBnUcU$Lfmtu}~`7S-e*qQHI@L@yVTW`97zirS-oqibkUp#hhlZ>VIea z>`73HnZ&ai5^OtbSb=*z`cWFFgc^-T)#7P*7X9Uz=V_J`y~b8{u+l5J?ciY|n1|0> z@9}(vK@5eARsUR*0`j ztqg7#clWYNnu7F9f65-{H_de5Rku3#ckL{<@n}KJ=8>VtaM@F@H_7ud3uPj5L;QNP z=svDBEEd-}yXsdDt*63t{!b&%d%MDxQ;un>d--}lf~**nmve6VsXFwkJh;j^`QBlP zU7$r}vzt$Jq~Dj!U(zM5>vE22q0f0d7x0Rat4x*hgfl0C@f!je6vCYxnZetoAKVVF zlI3a^u0qUbgrppZYTg(0vIyv|NI2Fg*v{9tl-1b`C$+jDcq|QuByTX~W5m(ppC98( z`HjDU5Aqujn3*!E=s1$&>o1-dflYO@y{}nL`rV)qQGI%iLw-oObY6idij}D}V@Nfj zl-$fROor>Fb~k?C;*)5Nfc1kHT2{g_wO3MeaCXCGdw@N$wHec;k@V~U&I?ujvA`=) zj*>v>-1{?LdapSMw?I1>blWWeZP=rU2vEF%jPVTJwZj*`+|l;FeT%fCEu#;3#h>g& z7TzeF3QbThy%}_18mIU~E^1uXnT~bt#6PxaSx(6aQ;+Fy0WJz>~*?fBEB^7~O7ynCjx?b+|4J%WTTa&S0{aTJw!>F#o>)YD& zj$p=_JDF+>igZ)CgJejUeI-36^>b-9I4b4H$I@+|B5wgU6r3Twv2*4zkCsDp-3+V- zAz4nQBaSqVe@jt~dyY6Rqv$j|6O&fZC8b0hQ^A1+$iH+U*>ATQ@^35~55{X@JbernPG}3|bcz>=OQsN)y+DoCi1zzQO zb<|@MhN@Zc^RIOjz^#2XxQW{ixAQnuL4Tj4z#pri30V{p5_B&r&DeIU-2x#xmDglv z^E9w~Y}}ZR2+Z1 z?6DdHC!H68zwr#4&aB=33{d*h{ZBkv4O5T2J>}7I5p4K@VIjLr_I|-$qE4WuUeu`9n~bhGYUR3C*@sLyvtp#XSAO{Aw7hfG7Ef zY%6R#u0WT4s1l#qC>dFhuq-+aja^5OyFHN;1pVUhX!(&vQef3Xf_OA$^Xo?uI_o3!~=q_qkR4!#UcZzxbrz<`I`R5|&?9C$_`|m@`50XEl9-g9R zaUXpp>)wwgiWdE(WlKB#)y{Rga-}NE!n)fU8$;Y`a%&|i zfNm~<$F40FXUKEJ$?p`jVa!ML(BPb3(<2N8r5KxD09=~f%yreunHMmD3P5>o-;@4` z;3r>=QebY}+`o<0Zz|4|s`b}W0`(>fcTSn-n?XgjfOr7!rabYvWV_MKUri^n-}WpN zA*)fc(a!oe2O+oTZfm-1Sa@f%Nq*W2LUn$Y9QsLmO)EEVY2@|w35^5PwR~;9K~Ee^ zLV5x)X) zU&T(#@sLxRGQXvv(tTVMuVaq%TKn5|MVF3P_6_*fj31>#muib4jMA6L&5fqDADgj? zv8R^Oy{urhk-?5QE+?LO|EkpNARiE17%Ec)_KTf9l^LFU1bmPXVu>vYmo5d%N|*qD z9T3IMyAj|ig>4ig%X{jdAvdSum-NV6E*|+@EFQ`C7T7mGH&!MW9_?T*>I$+cDGYGn zmgFeO#Eo$r*RdA~qRU%GSMqUkRzue+r{T64AecAx)|d9R4m;6RHt)Xg{VdU@Ay5Ck zrc{m?%n9^@cH~DSl#01CO(c>ow)waYF+3%p0BWT{_bmUk1f>i^58wU3dke5P$}wRF zq45zdY&-658PT;B{syQ`zv~sME%geI2YC&ruOWyaD}L?~Bm73PJhi5Rk-oz;>bnlc z3@(4+rqDT;7e_7%zQ9iJbX~*8y`CPk?0edOo{U+ZvwVn;IG<$O%|wuv*e>OD;m!ey zs1MLrg+sLfP1gmL7|#~`-GafQ^dHX0>k!pAOB?`1^$Bm%3Oq4Hj@i|7?hLH@qDj4g zb*IY04B&A~;zaL7OX47@wwC+@GZ2?HP0 z#w*cuLL>kZGN;h9xgSNK(gA*fKMaC~29FnDK5NzxSeYXNNZTOCy;lp371$eHv0ly~ z!S)TX1=HUk==PqikePfJDy;1PaC#xH_D`8+4y0Td#Xs3@?5gE`xXH zZ7qkk57MEmtBMwrEPV*mfWs3|-LP=1?CjM2+IIApDL2>x3H^}A51(7Hi- zHtsM4MSIDXB}LwAg6X=yxYXxE_S}qpG%WS&3wT?U2|Ge6U!?g1S@68*qzg0$zcH>j z`7G9`b---y<$S}Io zm22tD_s_<+F@448BL~LqoAGvldL#Xl*8M$OEKdWt3zr13YYU;Cg$3xv!kvd3yJg$M zuM`IQ1o(j@$;f|&Yqcw_0ESig`#1ppdCebwY5qv zyi;1MT%IkuCuW1xvn(_G)h`B8ZauIM-(LNZ-ur_8XEcf34bMBmcjq;z=M>c+MvA3=eJAz~&48v4novYQZ15cAj@ zg_s9Wrv8b`2^6`R;*na05?W?L(}v8^Za#ktJ5FSY`#x&$UX4q!q5?r}c%Vi~FlxKRMF zC9qmH5#hmt0x#1_R-*Rt#|4@%SGk&qpMdjeLi-6AtY^nrz17VxFU)@01y=+5L*Bzl zv$m%{0p+}Z`ROlZu+lZqO)%tCw6Ywz4uqEKGpBQyjPJdYgMAQTi(1M~pwpet6h%85 zxq7sq=!z#eWj4Paf({50?X4a4EF?+u|E)RBtuYXkzRb1x4DHmRci9ZptgyPR9xY}q z3lHn^$v(9&9Vjn7$YTt)tf%SG`r)tOcL;ygKl{|B&H7St zYbrzw%IDHsB^e~JH)tdSBrPxXObjZEiF}UZSuKBIGMSmtAe=pHKtilA(n?%^JXJV( z;)E5GutG#AC%|(22(^*@kJ!{U=l%?5@z&)ZC6a>fpBYXHa^!Qdk(b+IjRU(HnbfwK z?;)H|WK%giajCHV5DxZB9C{=iws|yl$5yVk0_yujC%c_3#4&}y3Bu?AvcyYFT8sUU z(xPLLAY&0Dl(C4Hh;}C8YZb6W-!#sYbZu2Og9pSNhuUr?YS^(-1vHN?Asp-Gme z{@bscQ|`)awO5x;I@gwCeDCP9mYq?{{5!XT6ro!FTcLb~W$svAk#-@D;~RqJlnv(( z2azcJ=KGyM4k6|fur0Tb6lW98U}Z#Y#3u+l$P+bV;_o#@{o7b%C=X-77jliU>vfLi zMt=Sr#Lua1qpy^;1>rKC4RC``N^=ASj*Q5@Q_|r(<|S&ULO*)FRtB*f-#_y`VKkZN zWevFE|G{;auR-OIoKm%qhiC2jTz3aBR=_rd$`^G{sJSG7?TYO1Ju5Rr$2lt7>|Laj z_!ViK@?huCg_cqwc$M|v!#WXX{lRCk3RF(L_+=9kQDZjJ2{6ne~XJx!94 zXFb51$3F-o zK}#};=kQMfR%BvNIeu*aAVMT`_Vru!Fh2))o>|S*& zZS_J*1tjz5?uH`=3JpA|a`T$E7BMuMy;!?HuwH*4m4h!^53)J4CgR{1wo$8h&$(0o zG&(}@)gjn9c#&xPqPB42W~t_n#gm*n8FLD}Qj`=fwsD}qn+&`B))}k z|DoSh)XVg)Z7Xte>j7@lQ2D|?yk}UEu>7B&dHf+g-_I!|3VowoMyIe$g6ZwaB4;|v z|E19RyR}E$r-r&B$Ou~9sd5V(0%?pK1O|}Oww)ZvTu-Sgyqx_us5&oHg z5onR71A%bzRH0P+%EOubK7X%hYu{ELEq2cn2;wA4{?Ktkzgf%d!@7ZrY|VCipn^_T zrqj8GQ(x-uWobHZ0R}4;M=5Zi5GcO{6{~-Mx#W^&H|&JK$Cs?~Kbd%zx93%0UNA94 zB`Jf!0D|M;Ye0RPdeBK)GSW<#JhT4=-b{<&iZHq*fi(!B`*UsYW6Lil<=|erGKaHU zktT%@di&2W_nPH=-;p9(R}dbH3Q2Q(J8(Qf|7EIi<;AuM+USRjKU6$17A=2 zzgcIL*jC)m3r%Eg#(uSYFRWlyU&Q>4|3{wD*;GWW5YsYZl~H=h_f1>Q>i;8e&{Su1 z(sTy&wiK|xu-!$}3O*=U|6jQ*43w7h6RKmx#z%3wKyUhG)s{~K^!5w&WfhdaaH<$eDIT>G%`|Gn${k?+i1P6B%a z)T=gtLvv5d0-Z9BACcuM`yCT)mvNdWd%nLCmFl~!mkKtY|J=>JopjJUADFq`RNt2s zeDoUF>9Drj^DVBv2DoDb)^Mb#V)c9Z0JsVyPr5#?&Yp|;^n3a2c*lh^s8r}IFS(#C zx+VG6ZrL@0S-0_vrWJnB9MY)040SN=fw+yk2KR*c7g7dGf%<6IXog>(5P&fAarW?z<;l&qx(N zL5CJW6ttO1PV)mg3z{fmgMn4mc1W1THRN=mYrRaeSg$!@w(%SHWkPNX{j|Tf0TUOKd^Q z>>*>6BxDXHrPb)8a^S2%RG|yJ>pFBavPtXuj7BzW=4~`*6R7`2b2el%`~O${zstv6Wh6r9Y);pgJ#t7@;^zK;9~Q5S z2ip35-`&5mXTR0`+5Nxf)un^`f9{pHiDO|`?%sLy%+lujKknCmxoQ6+zMkvwujBXs zF9tcKn7!vo{ja_DGj^4HSX}>q{jTigH}2K{d;32QbkVoVyYuxwKFZtGZ9HO;rYrkr z|Nr~y{rUw5U;I1$|7*RyW!Cdk@&CWZ*MDp7|M%geQ~9wEAN~Jdp8VhH^V9yiPwIE) z^GlrM|Nq$E{`cwqe@?5rzm4hSUb^=)+w_ms^&bwO2k!Iu_vmE(+?}7E*Z+IoUoTc= zuWT@BM)a9k|3A&I|Mlan2;0ZW_En$a)ArSTS^WRP-gC^0e?Im+f3tsm_n(LT^7>It z6aW5}|5x$qx_td7`T8GU-lyxS{hNJehDG}Et%5TS-v4v_v^U-zt>OPz+Q%4zm7uf0^Y+0xe*?noR|l^@CG|r tA`QH;3p;o@Y85PChG_u$0Z$G5&n_uaTm96ysh{JBsv?qI8Uav_x=5RH`CPnhcOgSE`f{5tMO6K&hbxMg^q= zB=pdP5rhPSQep@Yl@dt^5ki7VDDTRg_ndDt`~0!rf4^&AUYe!El=a-}etzv<4;}2R z4j(vvKtx33@TH4CUlkEKfDjQ8o87+`_?ON@@Fd`2SHx8-OA*Yl+#>Lo6o1!CFk4#@ z4d8gc$evwNBD;l80e*?>Ixe#3uVWFB%e!R%=lJTblmGWQq9P)x0U~1m_c_;rN8#aR z;UE9=xVv=M|M!ZeqW}Hu1BlYy|9vbrEBt@C{)5MW2hgpHZV@6Pr@Mu}cb)XRP7x6~ zCvxd$i)(jwEluxhF>vzHUH&CUAU4Cq|vVEtP z^bR3q{p*?53n0VHlVU$u>^bFJ`Qaz|Mfq#LUtJ@H5fV^g!vsh?D`Gfim^W-nWVccX z)r~W5SQwtiG{EPi&;PE8`M#}yjDM7#(f-R_kzJx<`z4jm{Tl~*U{Rb|Cq-HjvG(3x z_j<$ErsyB=jnq44-(uHhSsi7Youq&FrT%(5u$7ad@+lyIS-QdmVQSZ#>xOlzrocx_>lX?7s zufUjQr%-U~n8n@y_PGaMDalP*l3!#Hw@3QlTCM+j?*~UE`EoOZ1^*8V^Pl$tYUDrX z?LYMoe^ga6dQf9l{beDI%r;XnOAh&=v(SO>1RCoVhd{R=carz!-f7_H{1@H6`* z^J$_P5dO{BPf7a*dnKSA&esE8)b||^%0A*!v;lpSj+-{4T}AwFh^|jEf1~Sc`fJC+ ze+Alyer);Ha}e*+#UBr+gf!=fbxKK6!tvU8cPf9PekP6qN)lgSpGq=TzxDo?qdCtq zAZL5D3{o=O84>z^`{uOU59^6=h zLp6bKXdU~tEOqcihS^nxgl})c0Wh6M3G3?E;5K)8@{e5oX_u&@R10Uqy^4KSd6`c| ztWUcvx3h;=Kl)~HmgA?GaWP;2S2&v+unUaP8BKMW-SOFO1RUJ>a$9{s^XbF zjr*1DL$Go04`8)WY~j#j3$EJ0R~fINvu`T@H~9W1w0wgrmI8qjy#{u`;kvZ_e&v1O z;j_;Y817UJ5CWFlXY$<pV}uM=@5md9&vW{k$Ez52w3LqaL%|( z-fX%l#v@|LU^l!)ZX&Svu#uMOa*J>5yr#FCvp0g{V}zZN6g9qJ>}(86hMfpm)*w~O zrEX;U2$;>f0OtuJ!wN$NFKgC}c??DTZ?yxvcUj%=#7C&5y^VM{@U?&IzNB7!4b#3l z_*6gKM97g|Z1724I=YD~kM2V%b~n7vkgL+fwEX(qH)pnB_BPRcJv*qjqp2v4{%SmU z!aja-bkHYN-dXBHu>=@vc+5=oU(d5%tynJFrP0Z4=NHnm5=&$kQnR0>n^ek05cE}> z!;x$6t&S3xjctyAWp6gMF4f})`=MJatKGXL4q;JIK*i|M_6W$SxQ7B3Nwx6uM`Bz&3%6Oqbz6P&^%qU*bXRSIH(;MwZ^nuxg29YKa=JJby^BiV0pxN7 zx(arEIJg}&Slhv!oafXc5#)f1V&Tf4xfy}};h&0B$i&&gRacPLa@uirlUkKC$%M}B zP1gFbA2&tatgOvbpjH@sbpoisRlN6B{ZyiWsoH!p5xA7q#PX!tUhTGp)fdY4yJ`dd z%UzTbjD6hwVai0Mqj5lMqM|7La_on!l51Rm<<_qbYOZ4#z$IL6uLS+64LG@#tYVHA zSa$P>i^r8FN~o#t&BtG!>^zHFyG1S0_->0au^mh88Yc=0Q$TL4INiZ|28tE}#qJ$D zVC{8v!ab~`eq$A{>-9oB+Uw5e`{UZT*!#ZsE5rw{&_X@i^LwO4#d5DCK?PW-rff*V zd)FHGnCU2MQBLlQ6W_`t5HEw9$goFn^e{)GIhZK%k8t^RKD`Q0hNC^DPT~)tnX0Vf zTRkABk>KRRhOX^sZk(62k{&{ohas1AG&4FDHQ$%)=tIDMMwSA2wRY^h)ITpmcl$ri zZf~&h$r6yWpEKwI1X=3$zEuJof~~F})F|GSCuX`n9Yd{OpNY@I>)@&n{3+}L;{@wa zCPWvPd>MGxd*^zPvSIDFY2VM1LkaiHP$O%JB@Cy~J)SsPEG7h$S<|(}_fM6Nei`~X zaoCu44Vu!jy(Lh;ri2xT!IqA;f7;^@w86q}tww+Q$bYr*WA;Fem>Ie6sGV-nBQ4** zJi-Hg{B#)y$82&xvfvRl?xdbDk#GWwT%g*SGOzupef=5+C_{~FSfJk;xUh?WRmB&b z6HVcC`D1P$d~Ol7Kl1(&=aankYPfmTf#k%`2fa@MJ5+71!^LxFptsJ>Wu8xgN$vaHM3^fOc$m-S;G0fQ!>K%pXVcig$XG=ktTq2Eitf5{e5EcEQST zggVOgjK5xyuqVj_e>yn|%Em?NEg%K@B5eVT5*&FmQS?AcE`Kj@B#Tf zAf_tBy<|@;7Y5f~GDxPq{0Y_}>qOuSo4;P3nAFA;9F4e4vLSVSf2%tPEcD0N=_2b8 zhod`c`M-*Sed)#*VlB;)Ii9ogD{K9dlmdysO=48RcOk&lub}yz+}MR#mnz~d+9R(o z<5-=6P4CB@-D{SBb~P`t^+}@AU5!L}p6=oqK5=KXYfb&;44kQ^PXe`@SW1#CLLHo>>(pv``AH8wP_d2k;N=BfCe^0q7KqLqscGm z@S*5(doKiM{E@&f-D%JhII4;=n!Nd}Ua*XlaDs*C4g1n>FTTigru{NZ8`DRC`{A1$ zk|GY;_xY){&M9NA@iLj!CmZGwJsD6T>_8ILflaO4$)GBe={QEsJkWKjxNKpgGlBn_ zMZSxzU3($6q-A|zF5x$1UtgO9+$`Olw3KBX+A&wPhUdFZ2nKnXE2n9w7w$?I5EDXZ zF(7f`0tery+5ZI)K3@($Op7rlIlKAiyd`C&#kQ8rwdGQV>`703k-#T&nSJ}%XvURO z!{>51jc0sA`_k1RD-SzD6q_JjTb$aeXs>9pzbaMz!-V(NYTo#rtIbBFEMWcWH6k+C z{rtBUjJjU+G2_|?kL5)BRRR<&UBvpWF1^OjlzgmWK54>wgr-%(PZxhfS>0a$cS9BPZy# z0Xuq?$(#p~&sqOtOqtX%(6@w%Gym4T+sXT3fO)9d;wAgazS1iK1U-jw4Utg430z&y zk1d#O<0YrS*8D(B2ve1m<53l{F=>r)kz;#FcB26TN4IRf2W8?ADftn%KZ zYbfcD$Eo`Py(h8pK=jz>k8M+Jm{=PgC^3PD_r`?o+xXq3{f`5(XOh*1IH|FaLjV3X zO>f@iRsdC7i=MV+jhXvY1Yugs=USc*5fRG~cs6=_Lu2Wz4TV2p{5eJXGXMNwb>7Fs zujdstK)c8LTepHoW=~ZVT?IO2gDT&tX&c|Qf!C~b>Y77r-vMxesrD2n` zE>+J1;FDJX-$(j7>_{rn@nl09If7ERpyvIcK~SG8H=es-oojs&0RUSNO)( z7u~Htj$>O$8rG171SXy4rn!YcYq#c6yDp#G_*2?mYkP~e?X~Bl)X6aPQ8%!Fz?o~A zO|Qb7(2XbK%Qt)c>v|$*ZPmz8rAf1%-5%|;A^BQ%;pA{ z5~iz;P6#)!gUQuUOkQ^MDqX^~n`N@B=3lqCfgL-30q|ai6j-=wJS{3drvzx8!E?gMLo;!N|rl z8(oeZ4`x-cIP0D39rWX4!FZr~x_b#GNzPPKy(fmKZ}(d>LvRUk_;P%>_8p*1y!Je# z?80}N%x!wi)+6GlzE^eetLlNKto_3|`37`CZjj}Yv`S6h*-ajLJC+Qr@6n9S*3g76 z(&(3oCmm|@5*_!QW2~h-bn;lU~Yn<^|y$6RcZj84eoDP#xj{#X$k8C zG0#(=63~rr8rMvvN!(~MfI$8kBPZQgQd(=3Os_ZG2{;-+Dh_!vSyC^9bx?;jccrB$ ztfnrbT&0}fDY6TZBlaR^kaNVe8=xaw-}|9na*(CU^YSVKii=>%_+Oa0h-OxyqBUx; z`x*&Sz*2p_`c^mb^~P5#Xr-tNXRDBL22Da&90sjzptsq`K`eT6NyG1bRM4k|CMIpv zDSEQx9P1@sSVvFmyv@(UsNASR=}#BWYQK2|fQkA8m)AnTWC19?T*6Rb+4O3+_OrD& zorrPe35%#pvCOzJ$_sn7cVIW_O9UN)lEf>g%3b9QToL?P+O~i#LZ7N&=DICO`Nd9- zmKcWLGF3(sN^|?$v`=aJk11DhSKi%u(wP-b@x9a(qQik;yToFoL~WIfz+H{&l&Mf< za~7j_5&cXDA*jrle<^T3$smCpVj7LGmg3U6L&n+cpL~N#XddznXNZqe4nkL-WMn6c zo4h=3`T$hhxL&L`3x}{#sy#`}C^~7VUivs88l=>@@;IzI(LvvBl;6H@38^oJAM`j~v&&oG%G19L2toX4`aU#@%S8IgVa3Ze_^Z3TDcT<6VDOUIvvj5b*yv_8*{ zMU6bIm-=LLvQ7M?#PzPw7>s9Y=%$eOSSP;BW_h6Yj9LE8$+NNT-YhL&fCueBHLf>A zTOS9x)cu6@*_@_sdKzdtZvxd;x;PnQ`4YKFLV9j0cZ!D|a*XEYKv&@xUJ=A6i!37Z znGphV$_44jOUFv9LX)|UdFT;54YJ-E*wkymFoIf>LHq2i$e>}rzUnKvDHqd?7xFbq zKZr|;Pufp(0Gwiwx-mq(hW#VGo>{s%fqoo!h|Vk*=p$BNK1;8Jy_9@N(S?`K`+z`QBUe-aiuiOoNTx zbjymxQ1ux;_aM)fC~acpKV{Z=xS(xr>Xh*ZLBqm>k^|zoEB4!K_F1{bx3wAm(eK70#+u4KPk~7p!1h6# z_f-_Q9{EtD)!-7}&|k^lOw%O0^!503S&uD)7;2x;H-7Ew6;mJWlp}Sn(!&}3$6yOz z#SVgpALE|4rySenhHXdRMjmexkBaYP7qhOBO0RUurn%E@H_0~H&Mx6PY}03jU@h>% ze16&_Jcz_fT*r-bbel2FJN8^5{1v)_1?Zh%`$iIIZQ9uwIbxU3VgWrK55StUtBQa2 z`xBS|H6aAMdnOh7a0@?O?2$nsZhb_kNIo6$M+l}n!=1xot@~3 zEbS;V=;NUSt$uy47j;ad@M%uo-+PW;IDkDv$?kmGeLghCv^pk2(7I)YIPFttmQLwh zdq}vK2QmJ7EFIbveF9C!bGg>4GOYd(h~uwF>+Fd1}W<<1x~?vBME1j*n%Hy+`-?=67H1 z88Qwl`21lk%mErQ@jgNntewfQ@OIUk4dkzvs`|X?yzAAsMuTqi{$ke{bs1l9ITgSi zAn&ZO`Nx_KgsTjsBh_IP>2$hpcg2(xKzX^PrEy|~j)h~)dtN>C>I4=vz z%c4&@hMeu$em)%1A6%l?Hg`;|y-+WRo=Eo3i<*F71ES4qDPO3LM>$YCv z5aZVg{L|UOrUgN7U9-#fW^UIuRSvJkoJ9qV2!W1g^67&SnAxL%eDVcli|pbcx56f$ zhy-FR7WYMjJRGX>;#8^*7Z+JA7BX2@?!m8r8R)_UxzFH3NVB7jb5<6MNY7o9LY67i zsjB!v=`NNpZ&wkC3Dlre(@~?DP5kl~)?f^v4VtXEEbstx_ z&hv`fD~hIJ@Ddpvzm0EU`p#%BEQQtKb=w|Lp2M9fqO$$*gV*Q56u& zuc=q4o<4NAU>O51iG{_!f8;eLTX&HRFOG7i9s0F5yZD>kpxxOQ0eSUUYOlCywNSR{ z*a(OTttfOjA3qQcntGmQlAU*&3kcX64cRvwd(a3UGCprH@XHULmPIE+e;3Fly zKhAAs7Ap;uDp^}5f&P?>UwS=H+}spmHkN=89OpEpj9T^!_~zi=?By$Gwa^!;FhkYo2|YZ zR3-%8pWh|9tG69 zJC5*^kK+3s+RGWh+;E;&#}WD)FGC5zXwYpyL;KQE zg={2Md8?~T8dwK#sRoe9q3&D}H-6Ye*aH~-G(Ez|f>=aF^74>wdoJT9YMNIr-cS>h zBG}zB#_A<5T6=3Ke(&B_iIT)&_ANGpB2~{C-{dXFbWvo!54-ZyW7?kuuP9=rn*{M* z@Ts9CiV)m>bh8Uxnc2E+fz@fXQ@=Cew0+oUs;-@8&N|f!Q#psKFPM$1Y4oD_`uXBinEJLqSUm zZ79Xv4S6Z6>=PN7mBxKJmLz$D(a< z|5?GX#H|9Kkp;-V1jhCrjNEp)`-bxUZ^ney&7=)|Q?SFl#3WEyE++7E2ON4tK{+aa z=95Rl3r7H*vgN?TvF2MPI`70}PvhtOu>V+)*hZ~re8#4DPVOS6$kNXRO_IG%yF0?3 z1`>b&@lo7^dJMBba7kY3>!pv{U{%BQoKmY$o;t0 zLA=>>G%~k14iHca!#Z0da*1M>PoMC0$<(8idx2b^dxCQ1fCym6rW!-{pmn;pr90>h zxAl4PqJ**T2X$~(AkKc_$8JtAB1@j?GaX)SclL7%$f887{zuhVh~l13QhVB}gt7%{<&J9%wCEi}I<{JZ5&&6#ZB? z^Bi1|fahqD8u{K{G1?%#u9cNi&qP1HuV#bQjjbgOL|z)mm?m8kOe*vAnivYc*IJ^I zGHs>^HvRm|QO|2bdgLPxghLsvNA9?odaeBKa>(FW=51yR&SE#SRMd*z{CxBzC>pjvjlQ_~KsNN{y;d2Qz2s(wOP8Je3~xBbxg0^M z5J$bjhCLv~dAI5;2riH^wCe5BLoofQ*9N{}Y}`3W7a)w9X4pbxVFLzV0g)$YT!KPX zFQ8ON$lv*tG+zAywSo&NV5xxG7ADQzkip9_olqi=e5Cy`=^y~PpUT`+XWuMA^Ag9G zwmCeMUZ}SsiuHhQzmxO9t^)F$S`@Qz2iVMYOG;6Yx`wYtw`$+qI$NS-Lw)LS31)ry zJGks#ieWgOVPTQdiD;zSd(z$11eV8#Lh2JkoB6Yf+jky9yWz!Rmw=vC|Ca{>)_eNx zJc;1mY8waqiHUJ|OiOMNnm6ixYn37xeK?X z+HcYfR#ZDw%!$DG_YH|Pu>b+Tz46Wf=egz>>WEIq+K1tYJAg4=W%<0<3q7hnI*}F= zJ2P$-(ZMW80{hSLjcq<*L$CEc@#vL)8YY;Jzj!ZBCRI(gm%cfk;Rc`%3E2MtV^PE146>V5M}rTiCQyggJH%oOtzrD& zR74><-fm}&>|Z;m#q&lBNNmtfK&Lb!b@AIz*hiahy*GY2@@1*uQg!2{BR4LdR88u# z2NbdK8tr}ef1yxfI))DwCc{umlQ3W>!oa{w2P)>GTM%#ht7<#j*b-}?rNh$IHz!o1 zz8-kP$$`FHx{`eC6i%})^PHtOMbPsSHKYTUjCM1mxI^t*RKSTa!*>Js3^;~t=8CFr z?4_uKoGbi4OCiSNGStaLV`pFkwt}NKg{I0LPc>6raPkcok+!g|#W2(lbmtZJf?j-Mm?LUpgW&+H>{;+{zrr#CNB_DSgozNJDvhLpyQq}?KL zowQbiK$jbTgl)E?>-|9g!SGcWtZ?Z--W;3P>9=UZZ))nWc%P_&dMhnt+ZSE|@+8-rpn$>`R^s+!E3x1M9LIzBTQ0 zv-NQpE%G2wpaH#m@mRI)8B(ZQ-`dpg?FjMtjf_DYc6o> znC@8aeOd6RKkP(s>IbRxOQs6IUeT$+?WE4QI4>0}Vr!Sg5PjM>&F=S3(Vn~%=}C%; z%2S!@3y077p=TAE+aM7<{Ses-gnLM%lQH1Ls1bA~j{aQoy9I2)_FF4M?p;X5zM&FI zOwjt&kBLCRFNFw%=oHuxS++O5dv3ssZeRGzseH3f(PPuPG*GVhRX$-n#J2#+>>lx` zVbx!)O$%B~UPOEPMq!Jg{P!7``xc>FvrtTEvbh`8wguY!K9ptT8^;|LFEqf@<7X7m zSTCv#8qu6%cMO@0DiqK^By2AfhLzkZ9-_%43v&X)-M$fmPMZ4~p#c?iJH(WH`{r_d zb`;jLx~->x7o3eyT_Z<@(r}k>NIzc>8{0YZ?4B|l|gr90a=&8 z=b2X1xnl8?W$A?9a}jC|SN9pMwsm-MpSa1u+m^Dk>a0jM-k?7?_r2_h^Nt=(Dp2V`w0pT0^aCN>>Ngl#A^_}Q=yr2E@!1C~PD zoh8MF3@$twqkgJNM$5+7bG~9`h%@P*#%^OwKG}R;eLGRlfmEieg(5PzLvx%{W#jzg zo}_@H{xsBR2*l33-j-?rG(QMqYc!f55PrpeljUbcXl8ha+TXgI+JXHDyE7}|I~Ohg z={fl=55TpxwzLDqBA7wmRj?IC!Qx4uK4AptKr9nWgq)c6iqbg3JcxifzwNy1PZGT9 zJ&UbBTp4egHhPrraooQmEoG$*#w&PucSI@Ae-mqgsGn9Z@BK-cKzNC|3&)13&jF5a zp3||vI9q?mPVFl-%7%XrXPe=_80-BN98$E%ipM{F*7QHU z>92z%)IVGk)nY=a|J2KXAKXwrs3Q5qt^Kt7B%myE!{?q@afjzn=Puqm&2+CJKKG_- zc_Ngb@S|UY-71z$If0}QL%dxvA6Bqb2s%x1y>RrV;(D9}=riHx?vpfHD2_Q{YzhY` zVaa1KXFv@h%@uYl1!8{SAN#i)nd!h5>S8k-Oia{pXF*~-;3ww@)gepC*W?MYD%`!y zp{Vj%o<>#mhgclf= zn$$x7j|f_SBnd);qj`~e{>Zk$PQk{O!%(lk2~(mevn3;Nm$RkwO)jVU zKlmgJ?8!xqU70tf^6cQp8e5S>5aIc`dk=uN8fCmTa^BjKz*NQl0yxi9)*&krBl>X# z2R_Ql*Eh$`!E!%vk9{zjuyG*|ZLZyS7tDl(nv&U_x8S6j26ajv5r1`4ylJE(A`y1s z6XEGJ1bVdXG52pCr;pgw4C0t7W5=lxNeYm>=%Ciz*8%jI!Nm$RMEPtkB~?u;3)Pp~ zr`RnBt);}^$(`i<(^-o@SjoLNeYAKoQW%XmG7^yb35H!7gO-(HWX30(_b9%onJ7M) z2_YXzxqR|D9{EM#0ul~HX|#litEfloFfu6GD{k?He+Q6Rm0M_MbqXp`MwI5}A#FVR z`=9F0V1c-MvY1!6>Gu>`D4?!|FQDMg&5rysk|SNC zjbF=%w-F2zUoBI(e=9oW{oi$a4}LxwbYe$4B*-MC?2Vp@r9C}o6W`RG(iYmj(x)-f zc&5h#)O_ZAsKM2Hd7j86?~ulElVr_+eqq#S&j|*?X8yINTvk1fo?S1rw(YG&l%tk4 zOj+wgKaU@k95BSbwoyHIMwSzjXB>zQ4ucLe2$_nQn3;x7v^1_ro#pNjZKTsp_^IjCC=TeZGh-LZCG3S6#!uDH`&-kO>} zbM#M}4%{&A)!2*L8vVk{ZpoI<5M`6E8hn+5DKR!uN($<0WbyOFqrIJ5S&C*t{nY>2 zW5a^n^0i|0Xnlm~j@Tyc-x@;O)O(#Nf?i*B*z4erhJrY3)WdQ0zHlnp?ZXku$N9)um~p$$# zEcvsMl`7U+VjFX(v$h7YtJfz(05ENe8!4Ma>hxLfa3JA;eaWPMe83Vdlib!vPk`oM zu8V0(KhJj{wX`)16Ul+i;X5LAce@jWK}mZnYtJ43egh@x3drK7+xaDZ+C#Uxg%Pzq zKT=*I%7}*zKbA(6SEU^1hhCbVS#PP`ms9-7Uw%+FW;-h+PtPj!{G_;rid-Zxv>E;2 zXEh4#8pK|aejxwPUjw*huNumNC{r3Ho3A>a+c%x}XGWE;nl#5yzpRi0Dd44Y9bZ&fz~5yJY<4`l7W%Fa%xGi^S>&dW=NPVC7NeEl zTVw9O%>b_7Ja7rc(zJYdzDO7OaMLNVk+z-~Ko9YPfA_0zlMuuX&P0yd!e1=4rGpTW zts83TYN*3l!R!7cD*q&@t z7{%T5H>R}>H-b!2J+h#{n4xZz$u_`Um#WzAKW6_8}BQuOa* zU?So)I9Tl+y%R`-w%r|}eT6mc$ms|N9=;wFs5WHFXyfkPPd;DC z+>lDg3-nC40X{vGEQzy3Qvnfh(1zH867g_3_jPU59=%fUJ9^Y)Xerw>ms}on$@PmP znt$y`o4jhu`zaTfyC>1!k?(&wzn)C+;%#qHx2s^aMa$nl2`~9d`}8m2TRd9K-VQp1 zJkBamb3~L-uBm_h;B`7HDD+v8(se?CHQ%?1aqzPtNuCxvxiwY+X#}Sg5Vg9Fel=B; z8k295SopZbhHggyBMdSN0 z-|3bit;VElCUeJ0a3VlQ=mlr#JpGqi{}P3Nc5pu;Jrs8EpINq~YpE!5dmZULvyvW@ zO-$rs{l{dE1l3PH<;`W;FCY2|*yf!5nT@0pJ@@M!Ayi*WHQ>{F7%)hB;4%LTZ#Uj| zY949?fI(g-3gWmWgrzb|twM-Eem_~((rJ>xPV9;D#M)^;u6LvgB}yz4v`_W<8JoHj z3!vOhLD#n6V_Eu`>YFKyq@QRI>Q%MooopsuwSO@JE%1n9torb6I;QI~PHhXbh`gEg zP9x=$Dbg`BCU~8W<*QvC+EfMYk;7oZYT-iO2LW z^FZ`Xj`wy{ED1N`6z(pF?#Ad|D>*Io)c$1HN7Zs(nH9+pM4#U$KPAA(s_1yQNX3JBIISJ{rQuXy8kHX%tU5+!okZ4!Vo#ABpP895vEhinyu_&pdzxXZ?OKb)<1t|ytIxc4al;t>@z&*Vs2CrI1eO!Y7D4+CI?wCG_V z4pW~l{qcDD(wH*#gX-OnB_RzuP~|dT94`wKhiwot$x3pqn+8xpGjqfYXgLY0$=L5dIaMuM&>MTuxKzIgkb`- zrg#%dOK>@MGrP^duQwxM?!lUba52ea4=g*>u39tdhcQ_c; zx|=cPXkR1xgp8T7cQo`tEj`n0856=bF*!Sv#pi*}ukt$AUW^BO4~f{p>Ib#5fM_8G z2yd&F5iQ<~iaCozzJChr>yF;9Q!DK;pvd+XrdD_5S%8(wJ6*gHQAe5vZElevZw7@E zFjaDt5L>cv^6b)`vZ|#N)6=rE*9Nynp_`+0=eNvywn@mP(sIT`Re-f<`s<+7m{IT3 zn$DInS@$#B2AOfX*sJA5n9Pt3X&nTGGUGh7|9WcL2bPqX&3^Fb+$-=+dS$;K>#geN zn!h4#fa7rcNI}T@0HLT51z?Jki&-F;?RC|$N+y@>O?X1lIk%xhDrehRBdy%T3%;Z-`;x)=&@>Iu?11Z zVMOW3L9IVM!N=e>7vJ%;!(y$;0A4wox*QJin;PI{$~ffp@${%3R>0yhJfK2u1_C?4 z+6StuS`R0%69zsu08=t9q~{*8h1PxFg+$E*z@PI5wXwEjATPLgIJoe&n!n18g4u$I zv&~^h00g?9PLI@nWzV;DX#vyhU?amB2I8hmbs!Qp&TMG?Zi6m^mAJclAnSk|*;KdFz(#m)<-~Kq- z@lxgH)FJ(aJAS#;grkeccb|@cVH$xL8oe6^J2V)Kk)cp#qte~)&08tN`i-j>lIA>) zuAT?_5#mmzHQ!CN^3*e)Cm6y0KAWK%1`Nto-L6|k8?(YUhiqOj$u@;FxM^mQ;ljqd z^|Cc`ur~`^bvVF%@D-5=#KF1_WOoXtS(riQWZ&OYJKZ;c-uVUs*)dACOEl*k>5v_0 z|D3nwH4th}Vh)5@kf4F)RlA>*`ft4~f!>MPbD*BRQMPfU_RiaeDK!&+m2Ww2VCm40 z!Pue7Ok4Vs_0?29OtLigjxg+;w5uU*dTL=q9m>08;OmdG4S<`S#$Sj1cA3Y^>$bb6 z{rch^CFF*^eqx8h4t7ZARD8-8feCc;{fkgr=lJI0nF-nqcpof`d8e+Crq zZ#V+`3ikIaBtQ!9M#O93^Gjs1rE^2Q09)hX0!O86HEr>f5zQq%c96jFbMUtBiUoE~ zSE2Q&agnTW)Nu zWn$-UmMA}PC$q`Va4sze6c3{^SHI-fdH2KsuH8)cH_r*ro7ViF38NWJlYv!%a=*D? z@V9IF7j*K(64ejN!EtLZggP?4*7(BE3x#CJ_cs#Gn`eC;aubdY9Lv`4WaJyJ>kqP* zpokadKdOC<8@poVgwPZ`vhI81kqi#v05Z-MghWVg&bn_N<4QadAXjh0m&a-FdmLkz z4<&qo*CdJ+4~jD+5-mutVDkTxlW5zM0>OE z@hv-bNlawbipBSi*yOUsAYh8C>7h}tu#coy-t;HmE{Yz0Wjzx_!nJl0%JbbOrBd>P z2ejNk`}{66iyfR4JpRQ}qt!oP(EgZV)Vr(kbFU49a!E5TY^)i2sNOLV=710OLCI)Y zJmfwV84+^v`#KCrrs0bYrmAwI*r_&41TzZfrA}xCRLpBhvtp+TtOsp2rB*K@>et3k zU~QxC&ld;^>Fn%c2x=?y(~utG^e)QU0H&LBjRC$XNbvvy$cnc6OBfuAi+!B3@zf; zm(Dl4M4?BhX1LXw4>x(o#HzURgyAl~zRw=mN$cU>4SaB2>&&<{u7o3qwfRn9Zv$MZZD}q;P#5nCGOY8hBFmS9yNciIPv1q9!RG zSG{0&?T)p1(IMmO&$8EQTsl{CoRC8qmnNx{`pp&^+*`nCi&goXd$9MQ7WBI>dZ)Lu zOSJJEtjA`gQvBC3mXRXk)&=>GfvF}147H1y?oXiWtwec~Ya>&gi6zi?@P)1vp91Tx z(vr9_u%-bt2tpN3?#j-5dob)?X+#Qw*gGsBSn=KlC)}vP$x%FzrJ6$~GPZ=@ki47I zDSiC-Tp@>EFOA7xuBSYR>L|zn`3aMkjKdq_Obms*ygR@fCV{^#YiRR9t1Gc&(@znl z8h$$UPR#fTgh>?YvN@f9Mp4=?bmlT{t2!WyhQI?na;(x`{D2`h#B_WejQ0xNXZ})G zWh}?d+6oj%lVh6Laf}PtCzOl%W{ax>`nA2Do`On>7j>pNu|rb9m%W56B23!|K^* z8PA77`Lye1H`@oc98>w;P0Z^hn@eyPtmZoB4IyDu9l5HW#U1s4ewg4Fpu%yvDFYGs z<&^b`csZ6vSZ8})gt0P#d;6u}y&v^aG+?9A`?Yy?rANBJ3Fp3k8#j;g58B81i9SFWi6dz^ z#_1*zQ|?*SxbXcf9m5a|cR17V?QO!LFwZ2^aaHBlS9Uw25ZcY*it}g3=jZkNmm~lJ zg9nn5)3RqVBqB3P&~qjkQq_#E%`jFBPgK2g_UrdX%F(QtF2S^;xO?L|PSP8=bKA?d zlF0kz5!P!d%t`CMwY0~eT=}@KWzw+SA+=BfLtCNCoG*+K*8|FdU0d964se5k=Jt8* zuu~}grmE#ErV3|Un8~dqSR{2c7&Q0TA@RXPP)Is%&U65|&D!puvmD99mU-elQ>s2= zMQ_$0D-TS2>MS0ViHK;w;8t%dO5ojocaka%-uq%3U;(-c#~=#jIsxZN1T zT5G8!8qE}7_(JRT#s@{{VaI;LwGzb|m2V+`6K}=(q;JKquS^B+yr11c2PLGWeZKaa z^pvgepk-^_j)pe>TQH-7;DO*vAjEve0!V#6W98Z218svZx0SYm;nt2eZH{n&PB_mO zc%dG{k$!MlkH#Y+6Aex{-2(xG0Th*~r#Ecx#R=>0a_Hy68}`&{zahTL&8+S~`cuj2 zA2U)Tr92i7Dx7H!$wTlLD_s1kz$`1+E1Le~R)}SSaH5wdys5_YFbD7tz80U-OiG_J zHs_VTxX=xq_SSrpB%uLg0_707b26RsgA~rai3->-jNC zPtU(xfdhEQXVkO|g3Sy1)reS6%QsQzzmzH>a*I79Yzw@cN(|4lzojE|^E4L=ku&UseKi|G|=T!eI2n@lXZKi=yIZWUl z3{|iLX<>aY#mf3#2}?EL~n0`*k4-OvOq9exqUkn*r?F%toj*gOXjI}|x05h$5dY2KWf6Ryw zFRwMH852)=1Kw(-d%Y1Om&oU$)@Mt|*CVzN>o}@5kbUH>U0BK~|H7c6Da6 z9-;BnLwjh)PY+7C(c|V{8xCu|G{*3}B7ret-PhX-{D5tMVK{6`s$p7z|NM< z6Q)9)T8yZV!mT--vjvHnnQ@qbScUDS6TS$(ONS@IR-p+>Gllh3@a%4_4ye13);sy? zDGA-X_8r>Tb;+K!^9$^*|q67q{0q? z9=OFp|F*zN_A@X5eU`tnK9jl5R@}FDdP0riy2I?a+im%ei`~k|ufvM}hrPFssygl7 zhern!TS`e~6p>Ovx-3vB6_5r2>5|R^gASskigX;1l8!@2qX>w^p*s{L4h`qf_1hnx z?>x`Q<2an(yVm!8*SqvDmhh=N_PzJDuYK(=N9QGYe{=QzTINjOX35e;a^{HhhmDdt z?e)@dkrAf%&qN}9^TVtfZw&r%RQk2)ww`Wo&aDue8~p5mKCCsNfKW1iQ2Ljl5ag zT#&du~rcI3@w6!;{*}U($n>4KY z!TyEI9 zn#+5s%|Ei}U7*&}njqL(d6pX&I&+;KyV$~YEY>TFsZ#g=pYMeny>YCw`%=agOGz1# zk8{8I){fn{Y~G<)k>F*P7txFoXdO_OX{@6u$U32|wJeirFt>Cp&`X7xbB(Ez+vKMG zfL&~gwf&yHR&CqPkg@A^YcFrRpxgUr=*43W$_y@t=L{CytjN8+Y*278W|B{#-)~Cr z{B@_cNuREIsJH?}*A=FLD>?RPbn5CH)k`Cb zC-eJuaF@mW^laHnAfgAN@d| zhj*U_$L1sbbl1k?E|mr4vOk2!*ksZ9(sPd1Y8CtAn^yqF+-com->W*x|HZuh_e+*5 zRS{U-j-`X=Y6sBytrIlOtq!4`x|!V4^Y(oiGPH38bjVGX=UbAvKMts>jIOJQNVlC& z>vfwZ1qa{3PQ=8+n;uPo8 zVl~dQGQuz3*xJeC`=}x(;f6 zX573>br)k?{|;=fAusCkps%M`63d*oVO>DAqc9ny#H*lkzD{~ZNN4+lS&_d$8`RD z;CxeCfO+M^;u(071C2(5QMK>wxnx&pTU#AA=hR7Ai#A?lJ@d~F|0WIht#RD^ zt8>=?S7F}Xxfxg7ltlXGS?OFLKo$5n^TLmB{cQ*%O9pAzlaaNXjQXY=*04{MsHChexGcq=dPm;$r^V@-ABB zqgn5{78^Bv}0fPEnd$+`V=K79iadh$!+V zXEokz8B59OOiWIHx3~H_%SCPBJ>w9P>OeBMnQ6G`lB@#qM8@q1aE+jgAJuGovjc&&Htla0Ugg<6XnwkBGJ|fz3OqKLy-7Ii zT+#OJ5lv$TX0nv^O!21r2a#Dm7xBAMy`5>!W(x68Zs{z(VA*g}3BeE7ry>=oK9LUa z8lC##8vJ%~VFAY(UfL%2U4^{wKklC9?%gPMuYqL$Z*KL2B@FUIf}nUDHoWWB+_$S@ z)6ji0QLb($K#}2`59aP>l;8)?#jC;&i=Y>6Awmh7;xXiz-|^Nd-}p8h{r>0w>u+i- zVUq2gf00N3=o7x3h@TS#WA$@_fWP^32Z09Q&mHv3PZNinfj`fn?Ib_Xps!AXqIq%a z1|nvc2Xd6vi27;b9;4(8F+_w-$-F$TAqEi-?6w>5KINM*1~Q#;t?G^wnZPpiN2)`M zGn06}J#)YDC^r4-oWyoZ#uI#614rWohk~>}f@X%ojP0&ASR+Y_p-Eb4@%vVpfR_3bck`W6z0p#Q zowAd_Lf-X=bEAN=D9}MAa-O1)^=7~J2QJl6Mrqgql)6RLslgy^A7$GJb8$Lmpi0TySj4C! zijA`(>y;a45cPwPz@?Kd5n`ePRmb+xdAzmGk$I<v>q=6**4Fx^k6RJ+Vd(m!;}czH7li8jgBb5}dBAX?oYIW-zk4>#zSig_9or8gkcI9W}n83=qk2SE=_8F)<`{_gb7b~CL=Nd}_0A_r>NZsk+ELNw9tas zA)eVl7dd)APb!?+wcfSTQ^1{*@s{er+b>fLyZ1zib{>+y(G??gQzbI$_RH5PooD6y z=G0nvP%Rsw^BmcCu64bBDz_u>aeZ9O4du&6cS~QV-tDToZ%EhJYsR{5zL#Ueey2@G zPw+*70X{uFU6Ebhg2S__b;C910@v#BawnF68^)*(%k9*uDPX2iQBf&o*6#h`D11A@ zk#`~dCBGRVjN)GhuH;2sU0sTslN}f)gsP-(V4&1Kw5e9@hwCXa0J)tZ*kKK?=g*(J z-7(-_@O4=oXz1w3ORuHfx*_^qG-XGpk(wrF+X4_5E8uXO2nr2V;jPtJbklgkl2&M; zi7M*lvl(wc5;!;dsc@?S@;wt0ZqsfmT3TA8COhery{Ob9{8Gctpg>#k<;$1LPxn)E zzn{qMqch0D&J{|^$W%^Gr`HcVI3MD*-Qvq9M~ON*I>tz%>xYl4|KVV`3}j*NUw0i~ z#Wg~O)tiuavk$cP z&Nb?z@e((8#Fq|pVg)$?5iOCGm6gar^I4vba9+m21TvfUyH=$@K`7428!*$1t<~nIXzZn!wHtVxUH?N6R5{} zAPij6w>7`DH8sgnq&IpSBrbAUj5v*88fSfo0Et@TOY`ZqW})+hsUXnV&HLX!rJ|$r zY142KLIj(6Er)8Bov!k3RtfmQ;779W7h`N;SHTBp!P`Qu<=Dc^oJdJN}NMOBeRK^hLs5u$rW5`~m{eX=!Q37r`!=FB5QfkjB(@aLBhSrqpnn@u&<;`65Uy6CrpE z7to0t9rXZ{bk)pk%Km2iw~U7$j$<;9>&uIR)}wMYH8nA=>>E=s1sjgTP$Sx@<(?nW zE41UH!d@eG;isf|eVv`hL%B^nxVGzs>=aIkS4jzwTj+m&)kw)UW%)JT7R@@i0g>xF zV77vpG*nbnbZhrG=!zw2ZTaeSQxZd-n8ExLmal&JVc)tNuB(7F<9XVkW674UiqrxF z;!#ER!-IO$4*SBvKnO0$qabEqkV7%cUCFquo+QY*#^3_4wpyFL^|){Mq1TF{BN(j< zJO1um{NBhc!K)3KrC!_mp#3gdj(9G9%zr&{zjy1uPQwQc(GgA;OSY{ZYsgTYr8u;l z93=TqxBlzoiLS$|Erl9w>R0jYO4)S7TN%IC>y_W%`@=ul^sTdyZK&x}1r?7iU*+ln z>yzcQ?55@P!|R`wz^knfDQ`+?{qU)BtPBLD^T9;S=Py6E&d;s$`_HYj`{&kSaB-bq z4Q<}?8OoJZigB;2Sa7yDTjwRsGsfInZa>l6K+m&=_C8DH=TT!j2>_C z^c%bMX*wfgaf->ed@TNSTNy63gU_ef=yYmN_N5Wa+SJlcE>=Pw_oVU`Jw8#{y)`v; zg z6sodfWXx2hKwAILDE=7+PSmYsUp`& zl*iH}9EcfFhIE9AkoUxzh@)PV7DSa=KVNG&MZ`s=7BJm|CtHb`I_opAN!%%~1^&ap ze6xb8!9xo4R3sv3-rXDp^GT~trba{}U^gpxZ>sEC#IU|@xr1YAZFk#hGa2Wv-sffu z_y5BQ>I^hX{|{;M(`Az4k_{_C`+K~)xiMyD)#h$l&(|(y4U;z3#K9H$a3K?<&6Oj?8v16~g_lD^L;dwpSIWC^P4U`xcD$XPomyk9DdhltbOEYzMp;?; zxW97A*6oNZ|Dg{&CzPU(3Og-681E_IL3#%U1qU;?!mW?HOz(0z1o#-rTba-0B9nC3 z6Qp!z1}e47tNfU>4Gk091IfgsB=(}~`tD4FI^2{OQkT2x-?^!(@b5~@pXs+YUYBlaIxJ*zZE|}znM>u4pu<7oFjVJ2=C@0r+UlSoigG4 z6h87c)z+BWQBs5BxPj=^JGWk_jaDcAD z3?y~so856Z-1WCuGp7OSa`s***@<(+4rm!{odU}BTomxl%8H79M43}?c_jM0pkU#- z@D*jXNA#ebs`D!vk^mVx)dBr4C(}yS9n4AU2T_v4FCLAu#ZQC-aqpUI z{s9!y(8vctCqI|DGwwJ&rWqU*q&Ol5kdZ2=;m)itl!ze0I$EDFv9Ynd6RUvR9|f#@ zV_`tE`=KL}F^!M|m#&^3 zW!(w4vt}UvGlIiC1s;IDrlYzow{iO|8gr|K$(|YNk{Rj@fYnFF#yq(ihx>H~D!j`X zMl@E-HC~e3JxIfQh*_|+r>E%6lg*t#e)M9lJ(8-}WC^#(|b7*Y!Mco9T=xmedz99`_ z&h;gdk4zL;g_H59l#~?GOI7RZ>x-cBcpPa9yzjsP3h@?6M(5OWq~R^)O19H|tOe>u zQ%SsVeEsn3DJdoN#0QdFG5C{bzImc3c2^P-jVOw(YQ{K+I`-^b&K zy~A!)>fOYT14ywyMf__GRVQqOP(xo|iJc!&D-93cUqs;WI~(X%;W-0MO#J15w2*t% zh-LW*6%nHXOZK;e3Nd-pR+v9yD zB{2k0RzCLYufLw^GTAg6f8E3X^{X(M$TcT{i6rSy@1Gu>fy%n~0Cm6Ei)hXvEv%zs zV|A+Lbrt!XSM4~%9$Ws;^@a@R_#IMjRjckdyi?w_$IIKNV&-UNG^WQauea`_^LJLb z{oXJkM{Y0}cw-B-oQvnV>6m1~yz*`|xLYs%@}QAqcXM`WFsGh&g^}h^o4%ITNoj!a zCor_Zxh-e*#{|C-;1B646F?sqIo@P7iAd2LVqVO%!@*(&)p3FRf&xl2Q;(IVrY5k% zG?td{Xr;_Ag_C@>uWv+MXJ($hTX{&De04^CkC(46WuLos*2O+wkGnfQF?{D`t{4ur!i_GQF=JsIrcOm&!$bZ z`s@5eMvx5Yo)X6A)8VJebVDEfwp6$;`^tY@eNpc19EIHEB7B16eoexRx{Rx0DI%8U zj)mIddYU_Ti|njvRO*2{y%xKdGcA;$@Ye9lSr*JRZNepamPeLcFc21pf=FpC~Cbh=n)}A9V}mgs^9B3`gPT~&3ai0 z9m$k!_Fu{Qu6uVltoVpaRO`FN(x`ij9+z^)Uki;cqhCn;Ec+M8!gb${2tm8E#S7yd zyaK|0nkYTFan9*(HvIR_}Q%-kR zp4f#BAdIcWj|#;{41(6ntLNG@=u4JDOO6Q%33>AHw5-j~&r{LR7$;g`45S`oJ z)1yb~*2Gb@>Qm%U-r_H*p&z%P8BRMkKxvdDYfko;b9#=UzJZV8Xw%^VKj=+hFQnYX zuYIrVG1WM}vgqIM>)^ojs%4*0OTD~YToQ(pn@pY5o!jcJT-{sbiO-5e&8C|3k6E90 zto2tJ89(F1PIIP^;(&FEQ$2qdhE8mSY&rSuBZt0XlsWK-$~!R3pQc+g9Tz4S!~2%P z`$KDTbGZA80^Yg%TVek6d0+g#uwzN+zzgf;`b;|EH(%lt{~E6vo?!TLhqPS2`Szk= z@9;Z$UOptqFk-m18qwT$3lq3NYcOA-47*#@fBU+w;wjbGawqm=xnyP^p5N+ss8>K= zo0IFn*kTqF6ANgzf|mD{NV7EYkoj(6&YCy3AS@<5{$k}w3NP>Ft| zJX?EZenKo?nJBO1^<4;tZstLMVs2qjimZ5glqsM&9nQ!_=sG$NQsHu^(vbC+;B|PoA_z86PZDQ)m+z(dv ztxxn-K#p0<)YSCSy$27{sTRAljRH%DA9@J<&HevlF#i3iqA4V;s1ZE4)ao%^U%uQw z|GuNK@yJ4-i%U0m#|X*Z7dNN$@=IN5FkPQNF> z1;gBT?0mL84DX7~ig*{s$jDgUHCsOnS@7F5hqF zGcV#E(H}pqT$o_z0>@87SC>j=QRjXIb$gxcYySU$;{CY`kmb`FEUM;9K$>&&Bf?~) z#lH=c8per^purMQ#6Oo|d>Hp!1kw4;lue62G>YC>LZ|VTJI2yZOigK=_fj@cBZkZo zq7s>U1x?Lk>Sa}JVPIy)@mMg|v`0DYW-%~O8dJI(8yXIG<(hHiR&#HC3Vs)jLzZ9- zp@vev7nVJ9y}@QdgdYuibCRf0k&R@@kN)OQ1N`s5=>_dqr|Nf`ifg~cTQ;p((H;b2 zoQQ2WPsFE{ssLzXru1f$lE=T_<9k26RsjQ?U&O^P_8m*{ZH}42LyPWVQz^ug+(>zb?XmxIWSp0MN1d#t|waF8ag%bh)UQAlB*Qv`KrYS!xkmP0}3UKs9BKJcBK&@Jk1_{rvq+eF1|k}wgR zgYtjE^IKV8k=(F?v(4p*Au2P-!rV#P%l`*a{pTmW_6Ee;gr`v3I))}4WWF!Q8Pg7^lFJzTxD^OfU zD{;Rv{jXK@-}}gQ2Mqa+y|%~@L@JS-j@KRNZnu(zRuK+2{4iTVkQhX^J+d6sOZlry z`cDHY+5(G1zKNL_HqQqcm^2Y1wr`7w|ND#n`ag-trU{Y0w1t8A^D=)NW*oH}Qvd=cz@V~Ru$gQ8D;nkd5m5Ip* z$+5vwzZm|Im{(f`&;pNXJ@W6D{pTMd4Poge;>d`~=rTcm5uF|IPYdj?BP5Sxbfw}M z{`rmmZEavp1KTV)o0$CLYh?&l4Ec8u<3lgOp*Zw&>-^k0Kex`$t@HEL`FZO6FP%DB z!)+qc%!=Z*AfBCv9HzIYf9 zAfYEf4I57pj0^y+3=znvkfLs`jRM_&qARx&)OwFY-J)eyR@VJy#ug$xuG+Kpx1z9+ zNV8k~?CzNxE$&NkCBT!TCeL!1B2Cvp#CyaZ0c2%mgyd;DIOisusx!1J`(p#uJ&=|_ z=51=W-I}qA7oLrJqMsdfrke7PR;Y_$JWI2KA`=mJ4ZXtj}J?L|T_<-i*v zv~ZHoAcZb#D+`r~mOZcC^gZ*vbL7~Qm9)&`d1igZFG`2Kd3venSK2E*T(JPsU;kWR zRaMn*HHl!AVS@G+QGo_l^=VFnEHjoO3zl*A8n?{d|LVm3cqIOlzD62P(C}NGL8=k! zqGDrDndal-ko+}LjZNBx&&NZ#>%~UGq-P*tYwtmQj}~f6%pn3}@0H)@I=`?`*|%ET z$GXQO?vQ@S=o+QV45jyc-qji+YF%zg-!r(J(S6t`%f!KfUmaB0Pq{+5lx>BcFg_SM ziwGlF7q&=`qlj?F{7YT={`hHJyzpE?JLGzAeBz{S>*=9IcpOe3q^)>lQZlL)sgL(e zvLHhHh9UixXpaveSgc0!?T0L6(viBl%37X2nou4yAAo!_w^fs4GsOfV?1zH;Uru-? z6+lJ57J!Np+rvE<;#b1M!^`W35pBTxZJr#(R0x^F)z$S>SIV~`>5tv&0McfY5e%Sw z!n4nP$sVDRI--FW66{L&2Vam%F*k-$!TZO{G11Z5&er8F9MD?%I-`Fvs1cA>N=iwU zBgSINp{;--C+{yAvwvI}XP9o~Mq!&fkY*Tc^L*Za1dFY!R-f|b4*|u(!kBx8#|JGh zNTSan_XFhdE@n`p+0+|9 zg!@a@Izx7Won5`qWp#1pg$Xxt!3h@!vJqf)ZH4v z{PBM0)}z`;FNgwti#NA4te!@GBVh`;(>kL9UV%U+7=d>_e1yp zc*B45AlE?%*{iSJRZP1!f=NnBiU9*NPxOF;#ulZLCJJa$_hlp`Bx1S@19Nh6UR=@B z)xDY1vH9(PG?|L*_kyYhPqHKjQa`r@3jjQ@yR@s4l^hyp!%sTfFF7cQnD<{TvbrV$ zVeFx3ZEd#=kw+ht?^#)~Hg0Tmb=}?g^-1zd%=+>nDr#CbtM`8Ae3PCh%Y+FY3!jK^ zx3@F;A1;$VjVrKk%Dcyn2dv&)a^&ZR#5?JfI8(la%Xn{Od`#CxvNOU5QI}+|G|b9- z`}$TU_{D2EbmP+tpQXU?Dhjv~lzGR542CH0X*r#g28P!8Q_Ii(oe1y13~NEk-p$aT zUYL*XaJ&rK!za(@BPovoQtNzGT-?&pZW!}%ms#Gf%XtkVSOvAbfu={v4S{63La1e{ zXtY@E@CSbV1i7TNNBF zv@Sdrf2^(Bh(BjECYH4EDFbIW-;xj%G%HxM+&R(RNLRptdy;wpy&scfnzh_iy2ts+ z%Qbn4OM%bdIW`6ui2VLRy&yxG*J{7-qvbb0{3hPAYJlA(qPfza#Q&Sq|tWw2ZMl72~2z;WLfKYCqQe=uS=@?b~RLhl* zrrw?jx1b&@uFvp&Eyw1o=7i{AiJXfb|87u;;=GOV??r zsLt?ZHa4(TOS$&$gH~-AgQmD6=(+r;Z0)$|Q#UpDp6`pkzkXJ-=B2m*=ZB}bX}tZ8aspg!~2J;(K6uA0{8J30*SjXoF8+{>m!N+64H58aEs(o~?w z)?+tkYZd;a;c3IWC+x-bHM67H)>f4j-l4AwngWBB_>)56)M?pGn4@rxXzjPuylCV9 z5;r-?UP#|Vg^IIH`@^J%rmM~Ig0gZGzFs3dvv8z*So5kMMR;Tm$$s9)7njMkINyQQ6XXm}|^z=5D&HL1HB6~}bbVOwtDiTa+4cWCg^ z&8#cpSC=ny3eEo^jL9|4qR216lC8gK(=ReDT(1+^B{kJ+*Tz0Zw_t^4<9Z(PhdOML z$;q>;w>2M};};;Q%S1LzK>|zt4vz7# zDSLf~%>5f{@~FA@QLNic*$27XGG;e)38*cek1WyDLDlj%T0TgJL2dT8mfe5bWm)=x zkQ2v9*Bg0Znq1?KI~HF+HsVfLqf_?5m*6G%H$N{XQ!`n5@yI1OOxsi$sGFLGHSe0(Y9Tl?}Be{~#c%ZWX=@{tR$<7O}#r~-yTxTH0Kb6cF@j}TV3&AhzZt6<|#(c z(A3c6Z==4BCT$uhdr?$M?ZQcjfF{}%Mkk6W94K^MdJ5@ld*9Hh9Ew*Tl)v%0hRb`R z|CH2}^LTqkd1#&E2J~BDe|(ja-m08jfmPj_7-#erLgZkTz*TU$Ev2dz1dInk=n1@& zSHFG8`x42cfCQTE2kF%_$Dlcfhe=lsTZiseau0t3wzs4q*(T*r_9-kZj8!iy-30i6 zr^;MJWRRPYBMhm6i!0oU8n(}4pHWwY4CN<`m$(W$%J;gKRqv&~KK5#iWYYIf8&OW{@E-p@)6E6VqtQtadk;qXZ!q?rM z{2LPKA;J&7U%AjL$ntnGfrN3Z!!GN3Rr6%&tJxYW*)Jvx%HBTOMY$bqAS8u%M&uig z9b?{u%E`_y7l&ZL4tPS^=1(kv7r#Crvd&nH51)E}jZ%#a;Q>sFKww%3G8KplfZW`> zOwF>#Zw7n>Bqxd)`iYRLn6^O}0#1Ym@d1Pp?0A4VEYInNY)Woc7Hc2Pa9y}3(xy&l zK}u5cB@LSUWs-4UmZQe0+Idg5-*$qZx&l zs$?pl>0ZQLqB5!S|Dw-=#DGSCKmnJKV?LY4okfdSoh7nKe5JLmEw)c>)(irZLWq<+ z5wzly>n;!1#`0Y;%sovh-I+i;Uj(8!9Z&azD>1jP5cTkuy$E;LJJEi$IiV$@+M`kp z)W3AZmn3)eg8pu0*+CH+1CRNoQ&PJ`czJnClxN}OH0y@G0_M>Cy_~0Hsn!nKL-*GN zbDpZKHG2c3Sd-_>~@*ZbUZ&tE4FCYG4=>T9%islQT z1u-DA{zaxgyG@v87qq9%}{G#X1FipFyq!nfe`#Fs=Y zpIZpmrPCqNn}}yqkzb*!&{B*Ni>mRmX+KD;# zUe{}X`!$9N-ZzDVZ+)F52H9N1di@gLC%hSt+4tKYZ06?;7r%k?b)9x97 zQx(`MYe0m@=b8`BTdSLV6fxJtf|IDKEE7vihI16W9VN=UZbW2aNdmyDvNRC#Cx)o} z+7rAtHAV8xjjDeL#4CfB7&|9-oQT2xIYBU3|DT$m0-oc3e`*123Pt`GCo8KhR_?zS zd5GdC7aKaMDJMHFZyD)}8JE{MQ24k(oCiKI-p2-QlvRU=z4oG7G1^l_I{a!jlbLr$FcH~j5E%MUkcd^BZiOqR?wPJi@dl$AJ9%w>tFN{rBv&(Wpep?>_1KTdwp7 z@GRNSzwZD3_x}6CzxiSA5QL@fi5G~8l_WsXE$@A#2Qe{OGc+0&7TUz{g?%dhf22_z`wTTJ0dmFD<>xdZ6QHJ_yTv<>9{Ef;E)~rB1vgCuF-p@BdQI7!XV^oY=hlx&z^Pd%tasXX~Qg| zrZ_25+DT$+V?^CVueb|qi(%2wY8MBDpqo(7M!sL9A$+$RC>O`$NmQo;e?0NQNfTOPtPJ7QH$e9uXU6uknYGOS3JBXp&G?8+F zth#QU*S$e%v%=?YJk^e81I+sh?JY;Armz9>zUKx(C|uF`Iz=*wC{s4s4~GX%0*vWV zDLq&R*l1Il^IQ^Zsgk<1k&#h8(4(68hQs^7e37G|V6RvnL3t6KwkiW?xiZ>wD7U_ClA@%tp{7#^H-eY_vFUg@$b z&TliWc&&iPxc!;)>HvNGA|0nufZ5?fuVdUBLuTuzuf4X=^;8noK*%+r6Py7yzM{i5 z6cbNMhoL4I4S1xU4c6J$rrawAq()rAfi=NXu<_wJW5)%;VX|yu931J>w}Qh2I~w?e ziXq!BC(NlUN6o3f4kR`EPJkTcKXKCep6W|0n9R3=Uh)+^9c%$lHzv$n1ROWyOy4(- z7uZgvp3&;bx5jlErG@iMAKN8%#tK$_$9Xn3HuB*LYoahY&%^p3>h(y3wX5c5O{xRf zaL~>4y5%Fk!OHh{aL5$2oQF;kms*me28Y?^=hxR(k~*_>YeW3l8|dC*<&*Bp%O7o{ zijt%L8_WM6g!Vs}*r)s3_a9q!g0D;LEUGsS8QfKAfy-ERT;k^E4-sCSE}K=;7_ou8 zLW_Nd<^xQmL?s~+Feq7IY<74ar=ycLE1VH4PmVaRZNMsTMua)r%_u2g8L}8Z#B#_!;LlZS8*|roF zg_gGfaVK>yT(bdD3L$xuj$)MKau<}dQ>JX2CmX=mdevmhzcx6%o z&qgWY_>B5{&cWZjlIsQ6awg_#>BQk$AY3Z2KBBW`(s$pE1hZvWD>8QR)$2NNr;XX+;l=WY{7@eD&JG!XI?aZ75v^A@ZwS{=dixA*| ztD!iyU_03zyh9M_EzKn?95FxM;rDUOJ6MQu8fkD)#)`&RnEuVje{ToC_IgvVS3}^l zO}a?v=;+|feZu;OQZ}A+k{29+riJ2#qUOFnG@_92aZm4$yThcZ6*UovIn9TH-A2gH z6yK|rCSvV{$VYw-utjs6>*(oGlihf49A=pdw4=NNY212ewowyOj~jy8w|x+ZrD2Iv5ArC?1v;4{9z+gU#j5bnZYW z#ZhOrRX6tj?P6k&CB(x8cNac+MPqTq_4DV?R*?iuTc^c~Uxm7@wwEz7#%}n9OlDe^ zeER(P`S?<5Q7uCaZOK~}8s@c)abT55?;W$FZEY~68DdlqQ|`BZk+Qy>F}96n*Fn0Q zYP1#i*0Mbn%IfQ%_4!$j*bEAEY7MjBIX0wf0H?zs*`L1X0M-8lRsDTK4e`JQi*1W>m+g9K4jhH&qf|BO ze#hPCCNUX}RDw>6w?-vt=TZ~T4K6G!#Lv%ju583S2K~3Xx>oM+nBkrws@dY$H$|c`6oIsdh^fp5y5v1 z-j_}&q;@%v%p@e`6IaiZTc&^PH?G_vc2k70SFCC{{=`Fe_mc$XIpATr8a@@_-xJnS zQubKl#KL0q!+O276wkX?{-`BM!{f4d=T-8OE!R{QT+gPVcUSjsEx`6)6!*Fmg>{gi z#5fI{LpxK`3JD(&cN4~MUfj*WR;f_oxQiSW{o&>0Ty-nw=Tq981C;~c#d3W;MS+)$u)9K3>dKZfIV-=^0g_Gx4#`* zB3qr?aX8<4?2dwhLT9?>l}_;Y z9M+eHn}9j)|51;A4gS1Q+gB)ny>R*+r&hv3&}Yt6tR%Y9<+r%CS#AODQ52(a@WtE9 z9Y%9JVGgpCsZQw5>iB9O-?xp%-A(c9Q65gp$pM8VH7B13S{W?rVBL-;#1gP#k*cMZ znt5Vtrg~Vn*n@Ag+1~EjH`aFZ6qQxJH??n%U?$slBr!O$vnqyqc6KK;?vm=Cx_|Vj zx)WLU!LgRfsw!rV8)I%NcTEe$4fh4OzTsmzl;(GzN|$fcfzi>W!ctSs+f#Y5S&msk zuDOEpb&sr)DT^~l3~i519Cn%yAKmeQ6UDBQV;pCK>C+y1shk5OaRpjBPE^Dw1;$nw zU&Bf6iiPNe0Gc7ZJ5!p3PfeIq?#M4-lHhs#;0r^2Q0I_u~Vf0HxWk-a8*EMH>aoVvSp&dPu~UGKHT=v>=;t^-1puwFrR2y?#O z+=$YT%UsUPtX6m4x4aCr}|jgE2eg#TA1 zFDlw4D`8Z4+IvC5WbVkw)X|CH;msQn9i zZDAyGmM2|{i?76%XqE#%>OY8*vNo9BdpU)2Xm-Nq4CpeeQ-ePWBf2*l`eJ*pirt z#xGqCli%e(@g$gQ*jY=P`7-*vz%a;=d^Eh-VKJ$((%E+B55eKCyJ_m-rg?(_WD*4% zK+e#p zT<+1IS-Y*Na-d2Fm z=lAt3Bd=b4Uew!qjYWjoUEQy>s4M^Gr9eA&)f2{dZO2~8*S86*R}Q~>m0f5z;a%Dx z>S92XI>M?xko_igdlBkrg{GRDFD0FJx=UmU*Ukcq*C{he`piWfE~~at6=q`zS?SL8cvK?=~y;!eDkibZK3v}6&cI%rw<+)9wM!}$*>E17bo4<1z zd?g7nhYGUh{13yPoI?*~n%v&Edv0vF3Cqat;(Cx>JAE?$_M)!{Wk5w#urPJ3#PK-V z3Qb|P=iYDU7vH~*lo0MOPdw!;OtHW&mXVs&puDnyb*s>fo~y-~%FL^d*p}tnUXdu| z@e`ut?(V9wF0mmGQ;lRJ4GmxA6%Pv$mB6h9a}P{6AHvpSz`$}6I*(6ezROQx>qXBv zwdJ)|J(Lyt-BSMYK z3)faPhUfIx!l_dpKUUdwL3f|$v-?XnK-ABIa*|m!(q8t1ck5`VB$Ck|b8M%2xddsN zrZzTm%m*sSPPCL8XBe+R<&N^)tN!WFW#rr1%3lsrZ5_*sTp~HnPUvW8xP1{n2vtZ9 z_*x#y7J-bcmF{3+g@6DT^%Cc`S9Kx{mktYt7uDzCYs#LM+A5W3H$H8n(w>n&7zBi-_FKqX9U zS8!$u@bI_UK5Oy%XjKI5uE-x?Z9`x23<}vAf*$eO4fjWF-E!SlwcifreTD32l z?~LvtZ%aCWLbz<>u$FavOtn0ZMr!hkiBXKuMP9w5w>JWjUm}3qY79uk2IuHTRqb|0 zis1|(MmCt}%x2adT?L=vPEE`)PO|{B^PIzL3wR8*A5 zvk^%pXuM>WKNMo5JGo4Z=!fF1NX;wg!=tOqZ6y)4A;&O|Vs@EMo*i5RM#UzzAs3bI z5kPv17bh7~R8)i~e-3@c66c{oRXm7MaT--aOs$ad(uqV!TNmqxpePaugta`xQxg12 zkQij--udnOsh7PfDrd(JXLbV`MX3nZ5J1p9D-9#nk4^^JKAq|;^y}YWYsVGu zwh&W+wq;>E#IJyMLQ=9;Q` zJR7QMoXm)E5Tww6Dl|NMmGcEJ-YID0h}GTcJ*%a(w8-B<@wRriONBxhrHRKOLnkyW zJd-9#`-c%sN9TCI7ukP5SWbV2VCGs`m^0lDEkgYx)^-#J7+`Qsj zBZR9uox}k5XDAiBkBi49UTBx$hpoxr;ctY5gp$Ia`$U{+B$S@O@$I{|c zRC43g3TzMvh>M#$prW#pa}p9jWEzLpyCI#H56Cx?;`Vc6!;ku+R_0?7Fma+gk8}UQ zgYkf%jT>26aT$pNm^V5;Uh^&?Ng+)dP?D5yR7K$9OHs;=D>t@-d7F9Y`33?k9+g zEZ^=4xZ@k!1#zyQ=`_0FltoFb;suC$8eI5&W)I>JyF-LWs^un%jW zgQH^;0`(!RGvE}ePPS)6uRcSVj>dU&fRpi{CX6(a?2MTw-#5;F&As(ZIr9|MtV-r( zG_RftRDGd!Y(d|MhB~T6$Zts^m__bwsauZklRSV6ob z#VBjZ!>=eVf_Q#~(LgREBrHra|P zcqi|G7P4ua7t84504<9&teUq=E-4XND-&70;xX8ymlLFfUnto?Dq+PE-2Qm zRWYG!-o*{NA$d+5(%Bv%kQo{ovAF2Z+tl1b-a39aN{peI$Tu3(I`DBN2;on_=DI7_ z)zcI1JpG71hLoc){VWj9D(*}nV7bR9oh1s4aq;v!(!m1hC7T*i2S})RSeQbKCqmH_ ziUZor*fZ^Os@QXQr&^wcj&io4eEAYlbS8=;Ku;U)GAmKL=sBcNJnL^&d$BDBXsJil zWmp59hr?~}3aziNOzBA^DF{vPuz5c_{*|?bppZe~pE3R0zA6!a!x8>Ia`%;o}bK506nZUg#QL)-YlRa zdNd)l4ups7fg8!ZV%u9XuJaBQ>(@Rat?F6~pm~w}3Pq=RtCF5hbxI&dnefrmCZM!6(!7Jd z3ktkj2B4u=?fgV3J3#~~ zDdZ0FrJSE@W_v*gj=99Ku?vMl$ya)LdC5&ASywt?+{D`PdLk=I9#e-Ey<>6E`7N__ zxVc%*(9BE*@&_0VjpfF`MrfSje>X*0-e``f&IgLf)A6+gC8H4x{7aNzd}5#2)g9Xt zUDyXj_Z=^$Aa{bNpr5N~10qpoC3MvB6JB2&u!8(aJjjk%^jeM+0}0oV|H)wUJk@B_ zfn+`om>C%*A&W*a6E%tCQ1c2X3ZcS3D95xXgl$_1DC)?KHO`SzOkUbLDE!F_dC;&E zr$y`W_OPi%MZ-0@+}$W?kn&QgQPKSr&sI$xNPf2rW$Yp^-FR7UXL? z@UPeqo~i@n#u^c^!sf=t3oDAZ&1!1yD>u(`=oE;HiQg=*-LTAjId_I7_W9@M#$nkx z6fXO4*jREv z!9-bcR}x@fZNK;?Qi}HJdApGZqmx`aNju%FP3A98ooLSPKh5HlMG&wPr1bF)TXsi< zdQEzjsA``>v-`C8Y7NFcy{!%GBdwMTS*0tu^86PSrW5TWirfd{E2$^3Lf-q^NK!mo z#^MTZghs)A`+COqU0z^1<8_wh=NEca?TlwOp%IE&wqRG?c3w_J>By3 z7|)0I_0#bH19t7O_F8kz`J0+-oLbagOD=gOpIr23XA-0IFyJw~=eiZKgD)MgBc9B_i+w94ASpI0gD6& zTel5LRKv}Pk#su0*Meo+EDq`$-*-?c7@lkVOmm|AxNU8!p0BjeEpgY6GE>~Bh4bxK zG^l;^@g!eSmLmj@1PorC;-k;-y^Dh(I$~vKGkMAB!Ev={4|an0*PAZHl?&11MZ$}P z01keFLy4^_S$tm3RWjW*wZZMU@%@eB`Ht&p4~?9ro}vcWdp_~lxayHsW1F)rFy4WE z3aM%4OEqPuTAR7or(E6FPd!nqdHzZEAG{>t^z%s@b2)&doI)y zm2}lyT0`22)>b_(BH08Cj}GZizaJ5?Eb8&1M!2l6KI+1d3 z?Af*G-tWjBRQ)!UY#Z1TL`j}B(o*Dl%=dgsTTH+>C7kwDu^ggXQ01UCbpL`SpiOKp z2Zp(?-NvbUVBfZL^75*hiZ%4q|VV+IFM#U0o<3pnTnl-&Htdn}naXsr^_LP}TR2MENfKDpQ@Ek_O833hTS3 z;m^5@I#p7$4`u$STpk>5NOy(db?Fc+s)jLeh6hV z_!Ln>MoL=W|6pY&oQ>y6;e0d0rrF~A-R*STtsQIJ>R~=SR}=b0`;EUnKO-3@*VOq*OV`oysTdoF zW}5oP8pklPt}NY$-VtJ^-lk7ROI%ZKWtxjnOf+FS{ zYk!9aX4yP4Y`vZbh(8<)W)4^EA880F>UG=KT2%9OXTuX-UFENl+0XMPqxv;f99xEY z`-cZ-Iv?Onh2FrPII3=9^JSd)nQz}>dRtqg8j?&?FyEYcTkneYdx3c?YWleX=F{5i zF9uvupT_$Yf;|{Jds< z7scKUGZva(kGiG8MEy7; zpnkN0J1t;fpk;>PW4lLzSZ>BLPFqe{*>G*Q{>OMb%k$XX4&%z{(^J@}?NQs|X|wCL zZD=>zV@Yogw1<@P`aVt*c`0Ez_}bd(eDsCBK`Io0oDLh0-_qk$(X*-ko)a^zUR6eN ze7Lfx7a|pt^JPYqsz*Yo1@Pvc;fu=1)8oIj;4DqmOKWW&(@914*oc#7Bv(c zJ>KamWLdM4VC^L28gGrYVA;_!@_CM5SMcXj@fds3alC@!#$t_}$^tZwgJM*2UQDnIbEJ@*}KZuzI{f-|+sTr9GU%O4xJ zU-jraQGsjdk36R;vL4z0?|O0mV%l#)@lO6+hgS8g*q3a9%4+QSZo)tqjG z_r=+gYg^!#{9wSUS>cXuuTRlVGH?8~c=*pFY&1i1FhuJ>m(oh!1-&K}X8Rv@Y#o0R zwjLa|U72WrrH<)*!kP8d)KmpEcfd3~A2Ni+og+Kbi((-4BJcJ$agpAlj5y;xq!A8B z-;UpTL>wJdgXEcT-x-6G!iQ}Qwic_%TR{v6n=f5yjL>{A2%zkKB!F>$7I3*TNbKla z`LZyOY7uLe0wkUqXRz#SY^c>iSm}=Y>K7!Yv#IANf$Y=J*hQt&j8qOr#o(fC8{Z2t zJCK5~Q$z2{N+;%4II?uOpqe#lud;?s3MP<-KZPZu@a@|DaZraThP;=4Mg)-+PfbG~*k9G}v0l|K zzs|5aDNb-cOT;#aDnJ4gv)PESz;}^)OIS##1x`g6I(TbOM=ZIYOt97P;Q0lJ?ck96 z>XN}obT^VQ#Z4s!8w}UCrzxkx>PtB4y6wV1*|ZnA*v&Y|oYW_DuOr!0s0tQWw=!NA zSOEPh*R%fci|X64Tr7sHlVXnk-6G)1S*E(KccTU500Z;D@xe~eKVr^$VvJTm^+*ei zu9!}VI!l_^_^}iH>oflNqCgtb;=iPjaucaZksmvj5}sf;e>OMVeXDmcBa+K*-gy<;7 zN58-V?#iHJX%Q^0`&S{miwPnoD&)b1ukrHKD zE@VnwK+>1tRy{s$k8|IkY~+VB{E*>A1nY! z(;gCMd?=Xv6c3B_vl+7>2TMj~ySaQ+k6w9`>2^U~Ozek$&WLHYdi`M+aN8w8IMwVD z9yH{1^ogT^4$`V8ZXqIs*~_54jFI<`oIZ~eY@InQf8+6$@=zOp8$G2)#co{#%!b-b ze$1_?XxU$X(L@XG_g6o#Q(vT|ycdkz#UwC`ohy+`{PjE1(*PQ$%<}0S%l=hy`fHI^ z;vA4iNV?6b_uypXiGoART`k1!!3ur!68PA@TeKSOVKnLnenQm88cmMBKKy^(p}2Ds zcp#DvQ8)H5o-{Bq;JVF3=Uw)2rX`{A@oY{nPvpNIz&|s!|9XcQ@xh@PV68j%aPOSH z2T+3-{VC0RxS+YvIK`(PT%xsyGyMYGf8MivflPmW`2VWMMasdUb-VPf?g0zq4b^t6 zj|z;DLhFAsCHprPSB54tIJAb2(TqKyCx1UW@xLFP-;d7kN9W-0x6bdk&i>zTo!=8S z&)*Za-xId~&#iSl*WP}@BU|{ul9BcD@-j9ux@d|>g$TU~w%g^vSmhw4(bCnW($Tmf zCMp`!)6;WLTU#6PeV7;<8~gE@9Q^gU_#IkL$){qBJVo}{4%~dnLJAmh1Rw>+dOiF~ zxh{5g_BI_I-FvXefR=Jj6I8e9I_=$MV-n^0!3y!8BpD#58Qa)gAwdHG#bB;<7msc2 zgj8p1NB+pRk;$b!9143A?7tm$+J!`)Oa)DxWErb)ia^)30K>Kks zX_+a&q~Mq-K7SeymY39Ub6aq(^?ML@E*W&!oAv_J2jNCclDXZWF)(0mNE4mei%3lR z(=PRDo*M{cG5xuzsmz*Im4}SGbM#gHYzE8G#vuBg5w)3WO2D-#DkLNFXp}|q`;lyy zekv47Ay{Pj``ZH_R!w|u_OP<$RzQN6TQEV`zW zpR|2@6R1RY5w6*hFqb&NM#o=XZc01`3!R#R!{=!yH)t49y^~uyAKtzK7gr%n(?Jzb zU_msLlsAzuX))Zrbqcb8nw#F7jLYm&4i-LfZwK&IhW-{{D;eBubeu59&R*RMoao@$ zONiYGF=@ebl|FJ(_t_eaMd7(m`LTty;AxqLjB&aE$GjPAMU#<{J($z}K!(_kW{?tFMMh=c&I^$}>Y}sG@>BHmj%G{EY5+tdPbSDZ; zM%!L?kRIX=b{o- zU~YWTrqur`R!?uIGJDu>CqHhTyqlma9&zFGLo>a7KC3PbI-ECX_dk`O4NZ5Q-}=bP zH99}tY;=S7!3?$N87?l(?gtU73i9#~lnIwtfW?qeI|Bs4qtH*1DU-VwvF`i%CabG) zuq&bS@L~K!IG=gXX)q+Jhvawt01qK{Kn$vwG}StuavL@#CAxtZwoxBi36;Hx>75=n z8&6I0tv&g5A&DXD8tfk71NOyP=V4|p1VwhYwA_oyo5E85DdZvOFx$1ARF6fBtO9mZ zO&Vg^LDp*PKm@`hJMUq8?6}GiiT+TGFxaCuos#@h|R5)-0U#Rs(%B6 zpD*}#y}?mjBx+eQmw()YbuYu|o3loR@fZPp{Ps&RmN!3gg5aJjvDgwgw6*Pqkq@F_bgglm=$?a}T^lcUW5Y?a$D64X z89$_iHSFYZD4&~gmxJb z*!aS{$%YM`#U?3oZ*WN#@o@a2^7T`SW4V-d<>d<&l|-<#7WakH-0tq_Q46cNB{#R* zJGjf|2IvLb&6BZy?#z@sqSkOJ~>}sjIh4I3bvtk`nr+4$gW18cE=; zYC*H}NrYkmW>Su)Gx$EYm4;#P>UKDc8_?9nC$NsAN+7TDu8*1vyqyc zD+OinM8jJ+I3VcHuq6D$0XbnlzSyDFwKXe@Su= zuW%^31;(;wViJ>_Jk+s22Or$#@9WGsbmZcM#(u^n+u{9#>T#y7(-cgcOASR?_BT+a zoR#e*oj0$zVkNT~)7Jes23`g#+>D_AGl3suZMDJdj`D@d~WTkb{0jeL1L{}SgL%?QAPS| zmx)TRB}dQsicw7TKUzxY>moK}iY8gfXII~ql4|t6{z(;Y<<>$TCF*zow6pOKD%#UH z)oXfz%wDa@Uc7Ryl$fr6*2;vZdVu9@7g8adCO+tALsE z1^n&D38r?Kj8Gu{T*W`GmC{YR@W9=M@Yg&=I9}Jl<$|cLH zG*6Ea`oB8ahzcc=hh}ZsWZLwQYrdLT*lrMg7D`HvpU{#yq;cA_R>1Z}Jf?5XaDhaA zLnFMzo{B`kmEG=2eaT$PtF7iG`st^sHY+LYn!D3QS$3b;4ZSBDeLwDR{L%env)#qU z=10}z&85ci=(Wn)F^!8;wvAu?bN4T~+$WXd@*o2!Scr^xdfc0WaO#W6cMqkcFrf`q zBt~y5V4`<4-jS#DEO`);7YP-^H(%rD{7lGt<(&C8q<~#^Z$Arvin5 zqXS^$*Z|(bWXo9>=)6)5p_MMj1EXZ(23=s~JH+bIof5#Jimr32pZs7eEp4_ke&})> z3&*rS$=C1OyGfS(Te>>d?_?MI8-pm_Vgh)kPG0Hk5bq`4l3R#Pvk~N8n2J6s*XZrd zqa|&_K8wZjm${yUnN@2>uiqI2w|H1 zGVt6zJk^$JCBu#hMS^B$O`0Bg%vV;1PN+~9XFXVY&HMgkx!TVc<}qXakIaoxvt&D> zWEA)p>j`xPk^ROfv!FPVL+4|CFR%2@&6(F0aZdZ9@j3?e4??dhCkqNvRJ$A1D#m}D zixTS^3^n9Rt`)AveT!=^=jrq4gB4MG1WMe!l-)f0Da?u6uFo1AxfS3UiCAFeUUHq^ zfNm{0xgJTD73r>LSKks=wF$fh#v)5QTNMP?_r9+K>GTsEqJ+Q8&R#d1ymNBud|SGo zLFWC-I_AsbiPxJ9PxK{UPRzX$-PgqW;Ek+mcOH3$CQXk>rPhzNy4;pGx0Z)LUJ}dG zyA~!Aqc`y8&na8Y7V_Rh(c^NO#oH{JiM71%(^^VtGPpO-@~%}juC;rtFj&^LhkGb@ zs@TXCm-?zGCrS;}v~@RAbujH@$$nNHXiKiENHZ0g_F5(P37eOoF8%TbE`mbxumv}N zVPEpyviUCZs2GwJi@Bi2AiIapb}h8$Vd%}@E-7KID55{(6ywFTX_uUY@N z_)@|VV`pUXY$wa%bLmMs$x9!4UW)vYogywsk+`AgSZrd*R@B9BZ}TjB*U0f*0|_dLHgnTx*DH2J}NjFk8+ z#y8{Dl|R_6J1b&HR;w<&(!3G{I7r#jFsIu1F{}87TDul_%%6za4zZy_#c0m9K3y8v z?smV}l}@>|{Q8wva4`d0QYEfRGF(>md$;>L#)*C>crw0>H&i5lvajW08W;Uh{myy2 z5uPNyyz7bW=K<%XmaL?Mj(jE773r=$WjlP?Y`~WOUe)!PZ_QLAtpe?`Fn>w7)SFiD z-hBr+)DJRKGLcHmu>7{CA_u5PZ^o~rKGrrVGn4+;9nF8;68z`HMnh;n+4^AzS}rq) zSq;Og-}d8iuk(cMNpat;g$fV$ISmfL=bSwT{cbA`wBv}~iPyMJb$VvzD5RwHROWeB zvI8o%$2sg38!UVBj59W10_zKQjUxL!O`k}9&d<`v0P$=3L`<&-mZ;S8;6|~#IaTPM zp$}{5bMiG>Wf~qks|oks-)C+w3u>*Y8EO$2!);$k*+EEHl(e+bgbwFUXn`bfLz!{C zImH1r4**tU$QZ$1qR5{!-B+#k@3t=gYA>}M0MGFlNYIEPL@?ggSdZme=3yjBR0wXN z#%?ETyCzCnwK5)f9|=iLfp`N{Vu~-&7atSa+1*WrVEc|^@04jk9;Q7_)?L)YWJ4fl z5-{mP6u6?X0JEEWxKJ$se#RBSBt8q4(^)_qE2@5086i&C#I!!FrR3@E?G;`hfH}Vt zz~`%w!b>aYc`U+h=mP9%`g;QJvj;5M0brCAix#rK9OF|4&{t$qssqx*IeDzb9^{V0 zJWnK~h|u{31->5JH6FLUIP|h6XP3bu=Yw!eOMy*%E-1VRf-rWM*8$d<+**keb8~WN z#R2xou?s9|Zxu^w{=jrU#uO@&GVB1?s49Zn@F2RwNNo>?!JdK~#f=xpvRec{l{h^W z3$>88H?+~K5B$VI;7R};2JVBWNV-wQ)*^B2Il^bAEUAE;^b7w&c<|G zUYbSXylmeb@+Ap?U!unv-%w37JLG@e%IciQkw4M40 zm`v%)vKl6gyKS-M1@r7-+ApC6Oe-7}xf3AEhg}D^w7P&y{qq6?vc9V5#zb^KXd9X2 z;6gf5&}Sp-*IE*1S+rD5Q{~T5PDw;eJjquEh9^mF?ZG~mPzp^=O|AUaZ9}>~*CxAHG(>{bUeIoIWKrRSE*6g^)FsQdDep zUR&n-b#+l9UMYUqOH9vx^gfnLfSc%c^$9`4REN08$Xf?4flEm;`JRl23Hiddsda_x zd=WF*`9RMIuFQ?xdv*PegE8`V#ksjpsi~ba|Q?{WZK`*GJ zaRK5iMPKh;b#-x3x>8qEsykrKbB|;XAA`aFbIXJT=H3(ZUc{h-S4a-y>u+B?@au>D zSYmho6R{P3rTu$3y4=U{nEIzz{`xiYUq{t34u&ai54{Wj^*ew4mERxd_tlYL_A0!>p zg<%>$4iGK1RA{{NMw~9~!0%^e<0o>djsuvq70`Tq0aYZdVg2pQq{Weizg~m>QyTrp z&zuD&c02~zCkB-IQ>4d{Y$Jftm_6Ul-rgJ8+k*WpmOdz>nLZrvZ(h;2&cw`Ig(7U! zEJEcT@uhC^bXsp{HBy(BXJ``&SsE=q17oUyw%!d^*}YM>E$ z5NZynbepCQkc(u!?l+VAbrte>H~{Iv2w=Vi0#o>IJ*poO!9Y7^ixCr*U!p%CSpoad zMPAQ25NQViL-_(X<;rY#Zt6rCD+{>aNaE0Is+wnkseVqLW3aD}Rz6OQ4oqG%`Ooa# zfI#vE% zl)?=)?H9kaI%@;c=pvY+o`MU@_*;r|CcQNek&^{@b!LHjqv6o4k7$Qvcv@Q8+o`r- zTJwkt;uBJj+Yk;~gYfF^m2D`1Q#Ud(`QTn>x(D&)v*01n4Y`BfIYX{5^0_6r@3tZ8 z_)|*vV>DIbfV<0FsAU4uZSh15o20iuIT?N~`CCwl~32(Ga+yx~k;I=vn=^LtwSkdk<=B)@e`tRBiI?n-#vT^#yXn z#_0E#E(Fa}rP^y>9I|z&8WIky~zKa5NO4YxOiwODGwv2Cw+M2 z6OxmYg^Zg!0YLJ>mC%)~XK#G{Pg<}K&O41{rvo0$Z{*D$K0^9Q|bML)pO*02j z-Aq1;j_xB#d3vvr_o-hYOEdUQnlw`A59Y zi2?n~dI`SMKkgH9+U!M&Cj8J=r5|@~%~XS~imsfZq@k0|kclB*IVBdgVkRC<+e!Y@ zCj;lWc8&_ox!@q&OH5padny)QU{AGzk`v1;G>rDBlM7IITIpHY#G{|}Z2ME)14$8z z8Lw}7sv@Xk$BsP>jqus(`9%3ganXz%VTr+>*m+m zQwa~4>ro|-E_|&ZLuZBG@Fy&c*oVhsbW7{*|M>nr3%jzPde#gjLM7A#wx-XT#piv% z`BnZ1Q_&5emM^s-%vCo_{%-wp082=vE<;uAnC-BNZ#+iwv#ip0XoR%Y*=Not6pE`- zWUl)9&RyjX$M?_u(0Afxx#Z5GN=2NP?}t;jlKY?FP`D%|RkdI9s8L2Pdht;>)%qXT z_JJ=$mvil^XQy|bDsBA8*n7nU<9mOznpMwv@)q#N^7HSF4Z?V&U*G zvAI=mc=cmwED7@~Y?iSn`=OU%DdDwAD79sZ@G^Sy$Y$b|PIm1hOKP=8xCjnWkwUHt zZw04(H%-IRo^D9wyFS<~=cu% zb0>>)x#je0KwByHE-Ud=o5z&VpTSY7b2SpVU+SX(6T(sBf3N@w<1t{5E7r2ZRnOt;i0dmRrnNR*91I&|odW}*(Il>tVL!j4zq_Q-^;#>$ zg_9pcbURbV)kR9-8(E}COkKF6&DM65m`Zg|SXv{vl6Z|36sG2uFSr(|u@ib69GDp8 zio`>N?bOwKyT!Ubl1~I_=In0~>!KB>U|Kg~EjkJ=y#CtBDj$-oKUNF%-Q7KJ87kVa z2Y}ZyRYLvbSh!a3jMpVwQoP1oqFeO=^Fl%)1V=t@2?r}Gu^S4)j7{?o-R<-2nd8Ws z3Gk+*$Xwylc_NoxFhtnGYgbwhZ=(K;P9HXt)8?|+H=Kl6s-`#2Tj*luy1A>XvpK)nm<9 zU($Rz8mDXw%L^I+VzAh z<=x%Z`KUE}tS{9i<^Fl-HyqRmrru;?t$)I$H`hG-b~Fu zQ#9Wi5L;k+=Jhpc^2?{%5be-0+dGOvJ(DJ>MujDR0Q>XyQkt*o4tVlf85+AT%BmvX z+v0|0>gg5e(c)IQ%H|afM1lmqOqv(;htexFM5UP}3r6rUiOHFPb*Y861sM%(7wWkq~Y z+e5^|U<()j6Q>JxE5Q#JI0 zL1ELm!^~9Md^9(@a7$}x2U2T>JJY$D`^ z#iNv=!fj)By#j?ac$Zt`wRD?r?*Z&^Si?^O>plB=T?|RN04>M;1WffNo(owGN&Ga4 z0P&MMjF0e*InM2*7UXjx$Kd-shjW>-P4qz?DP8`-G7L3=#!bL;Hl(&gsuh*ZR|~?d z|7<3}5gkKj(%~SVxH1yGLFQc|i=Pji3sN%KgGuLbdJ;0BMp&adU_=@QdQ!M>2LN?{ zF25%xCamBs>Lc)BV)mq$)F=-Ykt+ls8DVCF--+P018hQ0JrR-3i~%hDJgq8Izs6&oB69^d+<8M zp94FI#29d_sl~vVbXitG^^S5qx-1}-=^*65 zAj#sc-GRMiCD@R4dVt*VRAmH`UX|&-nr3&fvD;u1v>vJLXRTQ^pKIZ?LKR`d{YC3u z2sH3mAtbVxe*4Nnz-p1CaGdUP1^{g&L)#1;!UmgZUnvjr{0K{`qsT;1GWyBDo{U1b z<8{2?iJn2CUDS`LQWZ1lm$>Qg&<=C&vgYB|*L@*F!+`ju&JiiI=YJh)_6~&{M)8=y z(WzT(OHR63ANKGFGJq)HLxvA{6RcQBXcga4{AoK5k~gvihY6^w&wk^X_*QiGCdc zx20^VqxMk=QvCK(um>%~3L@oCprm*E;63ff7}%&ik}tkltI%?3e6qsv9G0uFt! z`B%b7#KD7TgOObBy$nRZKhE!~^ZRDoxBvIE^Lu1t_&pT-9twUB1-~abFzNa|6#O0v zeh&q|hl2m7LxCFi9QuTVnDxjt0q5mM;4(M@+kVDmLc+pQ+p191gttBM;SBXBEs841 zh+khXfBu1Zooo#ZnA~93bQnNBh%kB#viToK@Uv4>HxGyvSoBka9(E$5Y^?{79(l{U z4+I-KutSwz!_Yq^Zdr!J>5qFXXDB~7Q?`e4BBf^%RmA9HO_JOR7EG)EI%4Pd6c3{IQw)NFAAw0-tTyTEoQ_}M; zzh8HZTuesHK-KSCAqv>eQ+S2wwi1UmuCXDC-xqeKI01_zpXLs=dNXogvIr*iVp8FM8P4?NSE8KHk=r z$-9}=1Vu%uU)N3;{E-h2O=SK$yHhdHg+L;{QAz#ZWPsuxXTK&=C1ke+3+j zV8{XDV@dlGfCROrp%zMkShS$;>tpSRB=!@eOcy^|qSKf-SbqJc{=?JzU?jz@(k^4` zKkEfd3|CtQ2I!Hk(c@nHlf-V-a~+x5D0=6ymx%PMv~U&zfK^};yBFe+z>>TAT(K0x zp1X=v^|8OxYIkm!nCAcWyG`O@IU+Y#pWsx85qhB5De!6%Om4>`qcED|z@%D)@`TP= z3T4C!!6_u97FOQ(R3%eD-*xpQ)}FAl?h7*M%vfY+?y3oRyJQHE-c(A`;*Ho#ELAD_ za}nc3j*;PVD8sN%9<=Y&Oh`*hV@JDx0S&$Nu6SB2F%i)hpf%l%vAde7S2ht`AMr^` z8$1n%+P}&F`g#7Gd;wt@alKoH4L{zG$EDW;7){vqpLr$-JIuc8#R2o(-ndRiC0%Pp zdQs!z;I|oG@fUUpJIQ;o!{kd;9YP0z>oSB{p(F1H7@lCfo5_I~aSEiO}EZ4(uly8cyc?s7=(W`|vn$55KA9Xda(h19)(;AKH3~OHJ7G(BRn7EU%Lh zP%aA%v7}#BYvdX$nuALg+Z}d!fvik%cp|O0s{Z!-51K!IWW2xaU71kX6rJr6wmkH} z<*w&ye=GbzPgNW4ZIm;2;^&@>Y0lTU%Fqe3$*zF*EY|}^)8}qAddr}qqiq1t;@n}c z;5|H?z&Y?1HGyC(^u)JIl2G!uXMCOW`|+{UdF1jz{Cz90o$2#S!!9-r3ZAj(8KAFb z(s<4Oa`p*d?+aYj%a;k9qVG{!$A>Ntyw9|~(wFyXr$Cc4(q4S zZ+1=%H>CpWn2tzX;ljbgbfUSDuztJv_%(C1ptD`Q?d z7Z-`Cdg9ujpQJ0*K-T;4c8V{}8NT)+ssf~tPrQI^X1G&aZwtH$CI5E#;(S5a1N z9N_ao#&MaW#5Kk}+#~U}?~*lvu;K{5A59t5Yzq*v1a`6*rdrM@%7`gFz*X(v{dR!*Antb_&j4fBiAw##ujYMU7z%l{=@VeGgp?%Loq|v%x**&5-doDtuqN#5w** z<)HohA4Z+TB<37DS$F1JPItrxY&jO$>YSTPCf^MdByx!bw8Yo&Qh||4$pd$G&9Ten zCWU_C-cMMhZ`=sIP}+2VR@IAz=95<4lUkio%0=gi`Usv$fb3=6X|g!>?Z>PCcx8N! zmT)UHSv!~H)L-88Y3pCP9iKm(@{1#>2_eoIxrkkIeZEm-#Tz`8Geh094|%ys80?$; z^7p*D=L3f0;uO`kBd`;S@8%3zD^d#s6XWd6hq$v_Fy7ZK2R9e5gu&!sQ>IQ0lcuHW zbvMlZxS(ZUYwM|z;D2r=Nxa;@bS|lzV|~Q8fY-ER!zS}P(VesZd1}I*Be5;X;hs(5 zC1xm2k83w=CaD;xL zfd>&T_)Et^uy^ck4Mo9mj(=@Y1zc2BUk5WYwpti zmVa*3#^3aui3)=q@@d}pCp4d%?bH9yFZ-d9qhF~;4aAA7F@Dxbm>JA-iUNmQe{28R zB>CEXXEKgtwCwNTg2LAa*^_OxZ4-HC;jKT(e!~E}1{2!$L)MMu1x*Va727%Nf~Hpb z>7Cp9MXS$yG1@sn5<=qQ*gec(W={Xa3bq)!?FVxwqhs|Yny_Lw*w40)Lb{-k=?lD5 zMqXseYU-Aaw4HC4P4ApN#os=?7l#OVT;Sb|(9=w&|)^WFzPnNHjEwmr>YnTg$q5 zBTUQXl49A~Y`0>}$If4W4*&4Hj^t8u5!*S~d;$Uky`iI=dq)%fr?uMZMs8C}Z!gVH z8q;fxfY79FUC{v47^_az2up{%V*&hh zwcy3itiFpsAv+oIaOt58Gh*?@+s{mTtE2pGW8a;VBP`MFg8_rfHyeOD~je zPDE9`S4^FG0zB4Replmwx>{#ZS*Q?gfqwY#A^X&d+rGG5?=jWP?kjuplwI=#t8*jY z1DMjQK*<=MJ1YNVMO0LjROqG7l@Fb_Xrn}a2FGRnE$Un;Ln5mtiyOmCeF9N!rt+J zajyRUIe(wB|G414dS!l}vfrod-;PngAIyJyTK>B){yt^@-P!rK-~0c~Df`S%{~g^1 zFU-jk8<_-rW}?Sdfrl`GASCWK%I`kfTmb*?m}V!}NzbAU1Y(b|ZOhG2OF?{b=2mfb zRVaQJDJl-$9oOC+@4|Wz);)sD=X%TMri3e2goDemCFQ10=qi4kjR6w5LgoE=p+9%^7P&9WA^ zC|+6Oj|aOk4dPCUQsVEFbUt`UQ_Q*YEXYx`RP#pgyDAmo4(+JV4NgP!eg0Uig5K4x zV|wUPc2bO_Xvx?xt+Vb5t6r|$DpjMmT+cG6z|DNVZ&G}hO8Z+=oD1dquB2?hsUoaT z)>w2p0~oeKseXA?esmH&@tP}c_M?~Lw&3WBas|G9#UW~U#CIelyJzc)OG@Y9j3b>3 zH`a3~9lL6W&dks+PjsAOSm+&@7urfKQ|{aG_1N&;Cmm2rDdDregEmyJO8C~8QFUE)Nb|nu}yb62~Y#)S~=XU$x_i}_>8<)5Eo_TbP z2(z(P&FpsNM%H8}*Rj4jnRwb^S*+;FO+45$m+#q`A8$#%m` z0)FC)$By~P6$i}6z~?qOm48liLxF%(kV=nWZ*wSa3csgse{e0v7yFvo#VEj6;H7zs z{qhU)Gknv=lR7sew>V>|C>FWgq(;N7Q+~KJ+!4)2?cDjO!nhxx7lh`)?S{eh?CP2i zSDgAG#o%?tX|^{pk=$+QfOH(bE!E&c@zcbIC)eNlvlqu_Xe@LY9>6k#vpx~Ql1k;M z{24%HM^>0|RVRXya~Ow%GUX{ClP>?L4*Tj^flsRtav6Im?L(pGpF>ISb51Qq+LV)Jk6NZt@&m9YO&zjYUIgDjN5n?Ct$rp zTH#CYGBqWujc+-Avqopo`j0d6q*aO%W5`_>91EY3erHT!cdW(pP#=0@*cW4~+>}+L zSxq(IB04RjKwTYpiFB!&@)0XF3#s@{uZ<(!&VjTc_5(tNThEnCb*JKvex{`>!)h*2 z^6QB+}9bn&2sH2MzN5f6902{a(DB)wAhtJ zD^=&Wy=w9pV{ZB!>9VOtvk)S3sgx(aW-TGhoMtUAbml(aE(;|Apq;W(o^w~Y5$?rp&j(#I=ZHMHrO4PIEnVo zHl5&#lslt`%ZdIHhjE+Wo~)S)=i$D=B6zXZU~4*VcRCTZ(~Q#mQKP7wMm*Kfbf5=S ze9F>&&CtUki#&CM+edbnUK~bT0)D$p=9QhC-h(>LPO>Pw6OIMl;+yUxxnd;q%ipX! z1?(Ux68oJHgkPWADO*4v4sMiIM5Dy-UfxeT7N&C`D&S}c1zr(;{_8@Cw|N5vUsLXg4F=S6RU&Sg2oMy29- zj=O+= zC*#)gQt3p;FDa-xx2VL#?aUemFV#|>5h4@sr-&ep@lhDFxkZ1Vz|XqAG7xQEs(z?I zYp0x>{n-KcdD}Q)zU;}^jWz^)LVUZXF>`_A+7Mt@2*^np_SfDXA(`dnk}m(2KX!z? zJ9}Glg-GCLXz3MlMlr_5XTBRnw<4;lhIX%rJf$f!+m82jy%4PJMDJowC-O8ZTVgPq z`0L58&`u58hFkcG&fQ#mOcBxYC8hFtU9%b~u5B~dnVQ+r^^D#SX1rB*165m~ubXIg z%XN_?axdTN^#@1kTh4Tse~n2eU&=})Pc@qg8S?Q}EY2@eFdt7X4)$9u_RZqh92!QA z3Zb^8-N&TmXAk7f36Kz?>MASW7Y#kIZkpt9Y4-QoxH%EO6fM$x-L}Rq|ACbim1>K_ zdbM`t)$-?9A2Bm~9jiM5r-^ON9NCY}d=+1&5!32QsuqIV`sxbj_K!=)ENBYW+=p+V zT%Y-Sv%3wGXgk?0eWU0+?{_xq!Y)OH+UvTUYxXtAzEPmtsxLLT4xeLZ9ntBszz!uW z5s_o$*;UnM2Z* zmLWCsmkO?WhzuW5P^oQd>=b;Bu|#9liM?aF)H=(Co6jg~a9N$Vow3h!?PnOY&7g`t zDS?R|dCj4oW5)T2(z}QZsk ziNVEo0iE;gg;5I^P*`qhaz`qYj2e}y{;4p5SeU)Ow@9G_Kj%)hr8_gJxjS=PEOsf7G=DeAgC@XWWMv zAD%k3GVdD_x_N4ydco|BL$z((R^Pd4Uq)qp!Gi<4YX#(b9ns~Q857dM)l~cDrd*fp zupbMKxfq;aZaXeQxMqI%)a9=gaqyjWNFe`vbyN4DNKTCKfDYZS4m z6;w-*P*hc|`e;kd5=v_&wPI7%)Lv~-t7xg(D`M}&rnPq@vG?A5(;vS7KyvT<-uv9= zJ?A{<7+wq;GO(TWP=4FH&$zNja@D4Taqs=#3jku?Xj0==o+1{mp+GK{^d%bnNG5?{ z8qQF@kzQyh{T=4EFOr(D{>LeyH8ZH{#j>m#8puM5i1G6qu!8<(AJ0KN1RhtidVJ+7fZyNK(X$0+ zMseHC5^Zs}T*AH}P;xder-eV`3u^sPt`^YEIs{#2cJvqJIn!m;ZpF07#T;zIUiuaf zrduhX&Y~|CARXhtszmJeY=5_sq%=kkczwY|a35A00nz|H=ubM@w7jGM2@|+1^56D| zJ90e-Fr#wz1_k?qA*XGchQ(T+=?K5j5t5;j91i6u1viLH-H}9qsH)M$-h@v(QtL4a zR(fvb(l#dc_sK~C9*q-erVF$lbV|??aiWg%=xe#2ZsVvmJ5SeLF`!+*ofPX4HvKP0~YP&YqEKM*wuk z$dzmEZ?^jGnikW*JpattyV8-SbLAc9L!1Jv#}uFG7ym=%(c84Hz+4}-3!Z?qfueyu z=2D}mj(T^@n9qHH+LI`9Ce7mgIZN-E`}<20yEjoIcfmhWk3S+fY0k0cC*wh&B%fob z+96bTC$3?1SGH#zYT~<}xJJPQjcHG2Lv-wfq@I6y4(G1$Cfs(PvpK;iHo{GRO=Dgi zW476X^2`0mC05*z0-j2n%3m*^ReVnGSd<`(dq+Fj;#$Z+&rx7pPo5?Ifb!c&@XfJj zaxKza23}Wb-=k~qY(0gECIroM1lqiS^h#oRn@I0Bfnr zSr{oMmMJ6_QTg7|Y-{tfeLCYY%CO|NW&e-^t)rai>vb(NUcC{%Q{Eqc(5TCd(QVSc zBHBs~szEl11ibr{T9hdx_vT#~$Bf}{8@zAY()+62lW~F9%}}r;66UxNVKHdb(-~$- zSHJz$Vla#p?smW|oSr~eF20i-ld|^xuRihik??C$W%P`srC({ru>tzUQw%3*xK+Zc zb*M0PiFkH@Rw>>#25-I7Qk7hu0e`o3@x=D`(vv6kOAt5lJjF537iwz<@Jb`(W|1KHvo(#iwfbfXZgK z6z}83(=PEKmtqSQ#U2VJA&v&#p-HcJqf@rAowxnt1(tTN#;XH+=UtK#|n^9VwAI!4UIp7>jF7AkC=rYcJ#>m&w%KCDkeS)hhyZ z+(2(4p(*3MX*9+X38rA0vK*QmO>m)u3p2>{Xr2!8a07r5H)?<{e`tpy7ePy%4zb7# z!I8=3wPyXaFT8SL-04!{V}7yj(wIvoK8p)jBZ*gc==kdAnclJW3gu$i$?-#I=_cU@ zU#l*wYNlGx-)9=4j4$hjjNTOw=y6004C?d;jTBhj-?oW~)kZ#Iby@#VvkZU_de+0n zJ>jJXZMZ7N>2c;B;lYWb%&ptCarKq<{iZ`1##emWxXPV4{Rkwn>yc$7;*KkSf4Saz zn?DyHgW|vBnU=%!HqCPeK*F4o!lPKEOwdV>%=ph_Aee?JsbvHGf#+)AoFPEQI~%LE zk>);!bt<}22qRHBuCx=sy&b4w2JA}Q4J|sM91KL!zmr(rCO$-0kyM?AsdwLw@w)8v&8T@KL zo7czpZua5yQbPen-Yy*KDcvSchK*xTd8qd#j^UlU>LTS<-l}tuJS8r3hILzncvhzM|anyXm+PlKDW=k^ctWruHCd}%UrXcN&30IXDz zixPvQN_yA#Pg?TirVE|fnctS=Nher0w9&b`gb1imyq4`JsQWoz1o4o8Z^|aqzCA?b ze60cZVpry@0(v^x>Y~9k3Na#|aV#}#*ZS_tV>Da1z)>hAYr(2plYpjsUw$oS2uoLl zhepLI4Zsuz&cY$ONWy($BD(_VH$HS%59Nq*;VJn9d1t9m($;Zi$ zimOf~T4NP{6hj-2H?S1SM4E_R!K)jpL9UG6 z6Rqs}J~vBvpYJ06v{}sfN6$Iuc)fm6YjlT&2UF_2+z!c?|!LRsKS zS0LsM7bB@}^S@1~P_?=jUldDm4)wPp-dH*uxX=^Tr{AWV+Tml;_uq@1&VjBNV&^rs zIWShn$@uRNf`mU~YiiPEnJ#scduo+q+${-6OX9jjRJ9T2)YtWDu+Nz|t=2Mw{^%W8 zX=;E{^4}USg+^Nkx=jhce81KFvFZjz)PPy|jJx^yT}ElwovF`JN}`C=Gnd|Cvq!*j z=XIk*SE$a3-6-56dcdgiWdd?1qEYnBR@N>gt-V1RYoNHF*UpTMxpmZj)h-5o$kc;l zlY-@A#h|pR6AYUq>4VzWPxZj*Z5me>*1qwEg{^2MXI9_2C5bi7YfN2_2l^ulJ~(5p zb{an+&u{Oh`ur*;Jq+7%7q90;nbc#aW71KMFG&9<{$C-~gUFZP#_4`Wz z>&@}9_d={fJ1}RKAq&7?Nv{h#rzHxqKgB490@6v=MNr?oWTy)sTPnTMQC9XHQhjL+ zAiL0ilq(BJ4Eguy7}-5qND?$SXF|MUUuOPMMSBPYvaix9qSvsa)Rl!u1X#a z6dGT43srQr;U5n2r%w)-TR8r2*m{dYkh!*0YzyuYD<9Z19Rb~|Q|+AIol>?7Z=`-{ zD<1bdxi0TDcKa$qV7f3 zq&*Y!Mh>gKC&ZB+DX@~{Ocgu-$oiv<+iNXJpVKvyx~M=d1scp_UUxN`XKwy zg9}5B-IXqkDb0j)kaZZcXk9DaeIfknVaE30qV4%<2DparZZt9`I=gRu9G7(%wevJh z1_Aowc0Ci91?mt4oLZilNo^0Fe^RP0P05*o29!#v!8hv?90vD-{Fa8j z>$JWnsCq{RSq9vo{ugql8T|J5TLU(VI4T2!kJ>OjaUf-4Fkx7(SXa&>z;!{LGlZVw zhKjw=;J9cO=Qf2geSKPi)kRsGrE7)GgSy(szoUXhwc8&69h*%(8bHV#TYMj((*0>p zM?}kV`~m$SNHGR}b^S8!8{%9{WQD?n^4r5vQ0!f1*BgrQdJL`2IIG$LAG!0ky*2TpqD;ho8Wc zFv%qti@|lE*A2{?%q_uw=(iXz@i@&K@lLL=-3Oh%~T>BZmRu;w%L>hYC ztlGWbAB}o^yIIVILA8bh%4$(W&A6$o?HBbtvea|^rtALQK<=$;`mI=4w~QaFPk$;2U(4?4srM+XXR#5F!L=C;Vjj zBY+Ce*|f5!1-^Gkun~S^2Z-rPeD$Q$IE`cM064+M!$@gId0V~=QfQq z56=MzJ-Uj%#pdgEvm>9hxgDB`o-LZS5(sLOrNJDy)plG%)33t!sXBCMt{^{jO)fb^ zf$SW^I?rCUA$8??*EG|RBKV-;`?HK-)(7&JhZt-#TVCg1!BnKP=>v|&XRz*LzBECk z+~O|TNxA?x+{$81rGe$$6Xc%`jH5z1QD!6N`wwBDMK>pBPS z`mAR4cR@da^krV|Ajzc;{hNMEa!fD8?uo$_rvy&WnpaYC5z`On_BTy%W`bYfsc%O+{xzMAR*X(lA|!WoT%Y+K!sL?ApMx1JAvhjY;ExU(%7MSknY72lU5Ga86LR z^iWKkMcrko90y3?}+4lHX~aU26h#_|&KDx~u;()9r(pX2~3p9DR;i=ms= zl~kK~olV$fXT6B|zL_C#Spw?*Z$|+tWD8}FSW79cPwKN@7zTDd)Dtxy2bh9P@XI1e z7I&AUmsQQe4`z0$OL;rQ%ul-R&kOK=4uCJkEy>L+_=DTwTgDyZnchEz5;3pJpNK{e z_?+)^rz#U^GAC_YQyIS5W8k2lGwXNXEt}yP)-`I`Wnji|UUxzE#Xn?FaJ!BB?ejKv zZSW7Z`CDEtJ(|Av`uAdzCQCmdVHM^c%dyRU$+rk5Op3L%OoV!N6;#dfS5<7odR--U z^lD8#_@Z-;ZeneuTUBW2^d4=EZO(!7XAtGpG0YvYyh6hgnI>eoz8aSYk||!ZYz1_g%_MQ9oEiJ;H! z7h7hl7)b4)->Z4$^IJ=U+)y5m7$t_iBe<(YBF{&laGNw$N3_O~67qZ~x)^f5QHkAk zmwC8=gn|dUZ+XeB&*uUH0VG>w+sBjt*;i1<1zr5@Hf;QT&I;ZKbeJNMc0PqTK1k^u z*XqX^Bse%2mxW4TA>8w!RqEZK1}v7fCEGAI_l#%wbh3A7`bwAb2jQy`D!1HQG~F-O zairyWkMy9KYVoH6>?*-Cf%hV{c9+czyo0Gwn3K-f=D4`)ES+ z4^TNTa8COA&@9e%>V=66`3sHUo%#FE>512SC_C0> ztBnO5v+fv|F6UCF5oLpof1|77)3q(Xf`s+z) z(mFuV(RbgcPi~@qJoA*N-#z)9=u07S$CQR_^}k8qk1{Gjm0cIa@QfNX+>TVV=rv7o z@`Me)ezXpqOUBfrrSz)Sxb1HeAgxDj%{0-r=cekQIiL=LX3wnagK4Xn#UGf-fVO!w4M$YKs zHgwO}pBK~jt;TEE*nON^En6PNe8Gup>sqclu5s15L8Gzil}9A-F;P5f>JIE6{mc!U z2>Q5F81$~`)Fu6bZA`h+yUWDNIBdS(M(~Ew_NYOsi#2AeryglhslFdaM-~wm8CMlY z*Ps00X^(lxS8&;qH(Lt2ecmZHuOy1^AHg3K6^hVtfFIuF!(YN4@DCmTjUkhzRgItV z58Mv7qEV$dA^-@tpM~d^XvpmCPg+8Sg2oQsOmQF($hf`!6wV0^UNuk;~O$v z)Ej8WsA_YqsZ8jR(HtKGO?1NTya@i#2Twp}F}HY1gCocWkl< zhPz2^u+-=X8)3-YWq!(MJAoukx#1MCmlNMw*DVR2htzND^)IyQ!!$T z=ThhDe5#|OLhBDXH1`w~RD{l^R?*98S6^yUOL(X4gNCwNERXFQe>oI#GVL3#@o8U? zdrw`)5B?6?SD#WV*4n}sfA@BcBt(}ugw-X0>T_uspV)RDehaMb!J6XRo&{{UwX}HQ zKh>8f?#nSIv@J{CwPV%i<95TIj>|QuVkA$M*H%%M?w44Z>2A8vG~^a>^!QPRS;HFV z*jN83V@T?loYAz3d5i^$hp1haJ`q~#tn~QkHDm2!InU=GV+QTZd{Dc6!{gLRo{JoP zyPq3;1e5Wy^NCXENmb2-TwslNTI{7aLn80`O;iw*HJ0Zsi{#!n2{d%x1hv%}A=k7p z`3wSW;!2f{h#yST{Yt0s>{vfWfCDoV7dbr^n75YBcTs4qZ@cPHTl7Up1^&k^Xqzes zriV=6_Og{UEY2F_@M=whjZQksmZK$(KNFtEh%EK935AttVSBd2OWv@&Ox0sW=Z_)A z@S-$YspIt0uCcQtVL>N>&Qql+yjgQmV)w~{1gv@jg*F%NKaYxEuUSVXmXS2%JLkmB z;giTfIv-oQMU9N!Tb!9YvS&`j=Xx>VKhKN6FPRe$L;(?e1RHKicpr&xBN4F-dyK0l z%UY)&d+h-XY8hvt$n!g=`3nLK05I`vac||pd2?H-v}RKOf-P>eU&(HIJRTN#)`bqo zqR=%mOU){4nsa+i6O)()JmBG|$?;1oV-fWCpq$d9E)NxUEa`n6y1zKu`5Y)3-k;Is1I=?GPA*F54LqX9(N+X1vxEZe|z0F;7U zRQm^9Ph&5Q8iMnbnQ29HY&T$e8;u!1<8)o5%O3DWheKPV=ce$v(WSPjrP2+ra@=+= zhGsBas0X9flW*y7D*lJ8{jDeTAPJ#=>q5t1eAlQm$y8f~@!!=YibRHYy#Wg7wzO=A z@PrE4h4X1VD)&FOoEALGuv3@fq+0nY$9TCzl6c_w!kQk9S|l}mQzG7`4d{6xk4z$1 z2D%%-2Ji2kkVr@ySlq?h8Lmk=_X#d{)8ekH|4OMRZ;q?Z8jO32C8=*U114GB0cv-H zaxb7Y8odIn7|5X6W6$Eb1#=A+v*YCUnu;zX=8}A{_HcCtLm;uMY9|i7fTlT-m3f@e zE^iL_D?O1*3%IR9sxUV+I4-*~&%0!|26y8aGJI}se4x3aOY#L*Hh%zm^!YcQMzcvX zO;mZz^8`>Ux+nDDMZeN1?x5ML(ItnL7Qr&E#j@x(Jg-V{#I!?{=jc_cHt7hZ7z?+| zV&-K2Zbz=mh2jakA>tq)`4RTtosa8xo>J4Nf#cqZk`GPM6a}YhusFttyFDFP`W^Xq z=?AL~Ag4WXe10FugA5IHrE0*5tb?k~8tnCA6IdZ+y z)*=_~^~n#02DJ*`o`uxl4DTG2c?}<(As=*73O(PJvlF7~=%n;vA9iM_%W%d$@BC-WAc{`~$( zqt?bm}7{4gF>h=iR9jul*mfppQQ9A6Q=3hHx#J;*Cccjvr>|n|>?pa-nON&x8#;Or`1a z+zAiOHK52en+A2b1ph4)eUOg5lwF}yJQ`we;qEUzeDbgtL4&4gU2X!rcCkFT3KaUz zrJl}4;zaKE}5{F~se z4O?p0?b;r)_y`Ty2^q5AHi1W!40nIT)$^mZKCN=<79T-?f2`S0?NXn}-aFHlwmsKS z_<8u}#(7o-U{Ez2?+@~o-@Csb?CzHq?NlC6gcvAaH`~ z*}+=UK$Y^zyrZ?yPXE2gB%U!ytp@l}Q@_&Y+mp)$X}mSH@|;!?xv7)SfkM#X}6Gq%x;Ziws?ND=B3eFY~B!Itv)d4zjq) zc86zk3Pp?mQN!>+f2>t@tvUN#V5~mv6ev=1HR;t4<%x2-TQnnjV?5o8=4w_UBR*o< z{5=#WtyAF9*EOAV$x%JhVke=j-#XZRce4Uj(`eMo<2--WF5@zs<3#+smDw|wT>|ew z2D_!~?q){m+rCJ}8Ao|TruC>_Pv`nZHKlnM7?nJ%eVk2GY%pTflx{%29S7!yh9Z?h zq!654pWsP|$JMVbp|hdWoK9GTqZL`+xS>);R^iUkC%;eS{wOuhKkoA zKHaIUV6Rv9I5Xp}elF)yRDNF84?kD&beV<7D*!S7=ZFE`hR4U(^Sq>gNVoK6RI zM!!-?QQifgtLQ_VD3{EfF>W-2kPP$ZqD|9{;aTV@gPNKsT~Nm{unpU3vTj*Se4`vLaD8$)puZ|g{CY0h z=^fM2ScfGW?2j53(}jJnQ}ou?I+rWc1?vS5pe8+bE)kGRx0Jy##_dO96)^9{@4B|+ zDUAQ(-E?4iQT(R7eJ2ggDU>5EYKvjDCnWo-SyIP|(cS1wY}OeSYH8ykHmh6o`o3WH zk9VH8G4#S9?w}=BdYM@r8!Y$izw%}QD|>@5=Sp=?QienhZynZ5CL}TaTj-o9Z9OJI zNxymM2B_>YQpYMDq{&oSJ1G267Qp?NP_EA88t7_!QSt9HskbXqXpR#fOvS18o^(5qh8e%pPLTp9YKYoLaZM2NQL za^mD{8sBqYr^1>{0Is87m4B0d0xaL8se$R;^tX+Dt=5T^Be@{d<%vizJKL z{JRr(0^7tQ5hB_H^T6~+z@08(E^b)A&!3pGqh^Loz1}MHygtUI5CAu-l`{0gN2KGW zOn<*qVCI*^UnA=Lr(>9L{JC-}0{*kHAlPFc!b+c`wlqP^|$VRyJ>Bx3o=Vjg~A;34j{q_(539XMI-ckVZLc+c{{kGnXE0ZHED0XQSD`FeH{J&g0 zdSJa5()r(7-8}!9Tn{TFsy?Mn{ZN82;i_ro*3Jg$=d_pGjpJs)*H9e!;MO& zDTJ6({y;rGy!kzymi%Znf9sGhM2h?6rv9eg%WxH%30!g_LrYb@&N_%hFo$?chVC9v z2*ZQY@7^i9S7JBv=s=g)GGkvHR@c8{=FoMKDCXb`%~*SMCA(Th;XO8^wpQQt+vz?z z=$ut)($aHBp(lJ&h&irg7?2p$lNn8uf(+yy3iImhFYw`w)Jz~DeP_*i&CxkuWck`nDVLG%kUUfVf)03X?&-lj{s$R)s&sE(UrZ$CW#~^P*31Sh)e^kf*db@E zOcY$@_465a(uuZ_?{|%wybtK?e2XgV0E>QW^NY2UdRKISH<2`yTaeGyhJSs?UacBRVxbbLoN38>5=iQ~APITruKJ1dUjTg%@3`96!PkMl*!Yr+Sh4wBtQd{baUH$)%DqH;t2L>R z)DII;sb1CSgW^awb~BygXoASEvKkR-NAE8cq0M)9A6l999~_TpmU#2jtnEA%6uIhF zGU5>vM>1|q6u>Rc8fC!4fxd&SI?(qF)a~EC)+8Oyu3ojNUt{V3Oa0f=6^S_j_t!zM zs!{rj-8UsRd}4SBHob1d+O9GpD_w-I*VuT~&eOknW6E@vjEZzJ@~-|v?22QSqm9F} zvl_HrkGPUNXN^ayxXC*|%B4sx*vXfFw%1)-ZvCoz>2?6!G9cYqnU#NM_8oVTm!q(! ztW`Bz_DXjqdPxfpBOC!+`#h`o`Zr&iYj6|`ow%eQJ@?$846!HXJ0zm&dFmqV5ssQd za_QSC!#Xk2w9Reyc1gg`Qma+{cR!{oef2tDwd4S>_Vy`Zc0k>mUw_SVXZjwHSRVC% zEKjJPWfkd&vP^L{dCQ&gUKw_0K=PG1hfDz!`yNJVAL{O8cX*Q~)aWqKuP|J?-~dxx z7FkTRtJ+{`cAF}8iK&oJog?l0dC9eIps%yv*gA=yN?@HRHRsf%QANy@ z8~JwW2uB0)6V9fQJ49O{pFAaF8l-?2U;Qlypp|_H6DSTtvKvSxma;CnuBPkH%(^ov zj$wK0{%~54OZJe5DvVtI_I9U^iw5NRUUrukl@8{OydmnqO1@z2*J21GW87PL`_}@e zo~uwDk7&=X4B96^vw2f}mo%%PU7yw79MMnTe-UF-k&k-OP#`e422CuN_8vAEJ&?1k z%2##u&bX(V%(JWyDDlE-w>9jj?J<5_eP`(ItzmvJ*^@E0mMO;wG*^r{#I)w`I88VW z@1&{+TaL%{L1v%K#$M7lNGN^Qf2%!?igA>Vnflq~2lN_s8jyQo`eAr;S*uA#r6!}K zDtrLu*JzGUe9P(a5R9L<}+yMBwlx!)4Uh{biU26Ct8vaI_N#y5zR{D~;ftFJ! zvZw@!EaD+#@wQS9*?r355H}8~oFu0G@hwweo zW6Lb-c`2C)7i(!7b>k*vg#{3mILhms6LeLB*fOPY-$JJ(#uR4DWZ?ImnGVoa1Dh5L zw>|qW`3Dn4YJVMwGeFZldZ~9qeTt;kUU{ z+EFxSMMu`*xfgKTq21%?P2oyaL!(O42Md1E1ZwjpGcLL6quv1X&Gzr$-7?CV>iPv~ zEajZPSQ-;!%E;^li`>eiHw(3CVncHT{oS&dvRdbI1+-XPbY zs2-e`*3ipG7OS#}hK~PKUMq3T+FD20h536q>je&l)c}{q=M#aw>O8fBjP_^j{>xPk zPdF$=-Dk0)V!2*Psm5-+6;0GpUB*r_luYh-xdF32OX(czaopaK$m^e_ne@NWJ={ke zcM8Q?G;7vLY^2hYAcCfyf)44@IL@BU@jRMtNArdfju1J6B7($*kj02jUXd=F>)K zJiaHN&m5-TBa@a;2lCqV8!tOiCXVF%SrgoRcKS`MItJs}5V0&BI@_VF*^sxkYYOFU6^|VS_V(d_I$QcvN$ev$V;A$~yv-s%WPB(rqgRIm z#Z!-ukc)he#w&q_E47898uj__8y2P1tF1)mWZ}59^qM!ehOa!*J6szfo2qLs2WnsSnl5>oKWu9WG^L1kNRBi%~c3ektK`iWN3)IrhYu!DJ07-%4J*NgD7x`(A*H z8(2ArZ6Ef8l~yO_zzaYbey`Njj=ZklUaRf_)~&y%SY>Z)I_Yy$Z4Fx=;-ZsEepT?q zDtsri{^c{`ZE22`7H~e5+$BMEH$QJ^ak~pzG0%DwE|y>>qkJ@+tZ?G0CX#DbMMLfj z7B+;Wp!fgk%6uyL)Pl>>+9uU?tFyjM-4rA{^?@w|BALy0+HD&)$Non;bV~HJXca6) z?z?*rm3Tnnbl<6GH*(S!SC!p5eTuJ|TklQvaXe3IpIFtbV_E#b`IwNE-ZWl*_N=m^ z>Iw@pN6nC={Y(98J)opn9&WyWXVr9S+$)ti1yNlHV|khLxLt+-_A$&Lfr~M4C(;@$ zQ{wzz+2}0gEePY2Co|PE6bJVh`#Ux~o;<4ZP5oEDR1|;dXFLneso8el-X6-l+%E&s zkhBE*sl2P2Zn&&4tp)~(XyHmI42WD= z^cq{jGXKKXseRN6kRV4zJm5skTXb}6sE#U|1m-w{fA>gy2WjI7Boom!=E|GfMCX$j zqc>QQmQB#;wIW9B!Rit@s)JT7!YQ-69KB&YRUngocTS>Y zId9Qqy_cNjiWFS}J&Y2gc}@RJMlurS{Tt`%$Gb$+ypL9WJ|-H(y~YQf&GwtNtNXn< zBgp*j4<$JU(etPDSI7|iJGX?0{t7(MBP*CzY{A9NZ%THB1DkMPa=YF)vAY4=OxPgp z{LFnL!>+p!AN5M#-5x{t30^93G}$|6z}flN+86AKu}W*vGIfqnOLs+Ty{<=wi%XRWu}Ara z0jtXy?sZ~SJaS6fYQ^%@4y}#)fVQ^T5B` z+1fXtUV_T-{iiS;X+>KTb{fY7YjA_Wxd!%ICyv86MGw=Y<&wlRG+VK@o^x5bjCPS^ z1lD4eF8BfLI?nL=vR~}>d!&3`m(p!u^L0PQEuc?7Jl7$yAai1y>AX!kh? z94Fx?Yy=44kl00DCwCOErXm&uai7 zYv-r_)BAxW*@9pGk;ZrAHImrHcwnyL*w7hMZ_~*?`!83$jg#+`*`YU;oGFwvPC^AZ zOaS?Ti9zK?NAbu6%Xch2>+@e@Q}$l|t<}=4;f!@Qd#Y;pXkkAHqrT_jZr^p?ljonM zSS)<|@@z|OKt1*mhNH~nki+`E`%_-fcxq&lGy627$I+Ki$n*?2ihkM)XVl5)MG?W8@pE?klKDjZMEl;Ow45c+|WV(w8)Ukq+j;00{%&Vg=>h~ zD7w$G{1nF&kHrCOJEAi=PnAWlyOBh^#e0irnSd2tj>@0awKA*IBYrq`8ZZe$v70z< zm}TN4GoUc2`d3@^g=|T_dTuRbls|*RpFfPcGbaZWS{eG0k)$Iod6+?}AO;4*>Fk>^ z2ZnI(p#=2YfxE^wN`L#du7Z4**O0y<1KW~P*Ax*X)=%WI?95Lr{kutQh3C*=MVmyR z)zD7W)mHIogH^{?J=qF<16lFO)K3#`GzFu5>JPHN5_?etJEdZV$L=IgUq)8}9 z9ODNP3;eqaLC2%P_j@2lmgFh5$IQ7hQ4a-A=h0B0DP85>aGAzNTkrTa@Tp#hmY3ZT z;6#Ffohtn}c9lOpNBci_)Id}^>W>WHJw2)gwZ>N7(36}JUC8qiwHsSRX`T5JI)Oif z=bU?;w%m-a*30`mHx`&bsm6p?g3S4j60rCq{e~m70JY4on5lD(TA6bVXmHmXHtKVY zNY{0Kfv&zQsKu701D{xSPLS<;5tz016$$7Pb(sQCCLP zN`&w8W$anL-#Qi|>m#or-aY+%*&e&Sw=AL@Z5=KIsfoLpUu=Q#;_bsapUF>82SrYq zmKPqjsPm-gvmD*oIS`TY(eXy+yUZC|x_MK7dXBeI$%lNc|AZvvzHjgcA-?)Jy0(%l z0VIf@i|Nc+m_)wqNJ0f3`Yn-GEDY#2_4Ue{z1rV307z~|-pSV3s}E{xOKMoRzrnQE z%0Z|LVeB(~stYTmlnybm++rD_XWx17kwGX#D z3Zjy|F?M>j&;Q^ntW^J7!h>mDDK(J!Z?lvi}j|=k#9JO0&E>!TG;hI;M_ScbP=| z@d9%dSj|0qn>4sw2PYXdKF+UQ!9Hm6-wRd#F^Cr-nrwlZMU%9#0E%Z?y9_ER3*&@0B5u#^Lc0-h2qu zqQwTPW$ zRkja%j+3)QO`f{my0yt$xh`tOLpjDRr$fz_U2?`3#=fpxZzh_cT_viA==$?ypmf#u z7^^8QKEfm6REoC=P$^SeKU9;F=()yGRvxm&Dv(z6+sv z^{Z{~y#S}Yj&EWq_l&McM%ys;8I1NJ5?Q|g#OMGv1G!0~SQz#RM*^JTAJ-SVp;mjPr!eplPUSrdtS}Pa3yLjMp z#nQ~J{8}#~7qgV7dXV2HtTfTmRv4L!cs#4uA9)eA6BqT`l1yh8;&qOSXPgFy>GMeJ zJ^hboI@g&LzFJ*eJxBckJVWx#yTO+fFBWSRupnuzM4AdPf9ITNy)?pZ(GBJq;(`D+ zjsJ>GZ!@V<{rx2{hY_s zfWKA3iSg&T*=Cd}IQtekgTnE1`Tpy|j=g(ePs=A%S2TE{#A07(IEq;!NB}T;Ra$)mK|0Zqh-Q(1P3z2Ok&NVIwEXlcG~#K}GL(dW!@D-`!GYdAQOX5S z3jRD=hbVcQj*wSA+~+a5#RO9QRyJc*g}x`G{{s``$bDpLp7-s2s}^_ ze52Gune)M!8u(fj_t|dAr3X%;J&L6#YxU$>WR|=6FQVs;qc>ly()ah$9Bkn;4h*+V zG-`KZYARWbdmjx&fg&UNGuB%fd*19n5yG~SdAvx9H>hY$ zS96l=AVeIHkVk4IGNgFEt-33S0zT@0;}AM9^COKzGqLSO9;b`Z=45D1M-xU*R6E+5 zrQIo*^eup`F6l0XwW!$Un|b85U6iSnUl>CCB@6OEBKbWhW%BLAGZXM6*NV^s&%iN>^Tj4(| z$T<0up6Vz+c1NeqJzOjBq%MEJ>nn3}weI9U8y0Vu#@Qd>3#E>!e5c}5^B#5RjV5L) z{l?`?FebV5bG<66u5a2adAii^?HW?oH1mnXH}fFgYUS%h>%x!DcaC*{$BINjMCt)w z4T3cO4eIRpOkW`biEicu&TBRpYrX&0o~#88l<6<0Q`Mhm_g|gIsBX=F^kZe(#WNmt ze6+l&n9*7lsKQIVk6FUmbQ%LRceZ~Q6yL@JAwCgxUxRtUpG#l&W=h2K$fHvKOPd;U z5?#uhWL+xW3!qojp@E`bCW}~AZJ>(x8Vid1_+=G}6UFMTFY=#h;Dl*ynT+BZ< znbreL=-V9+3zc>Sp=n)D8dvuw)Jb@RT<%b))kH~K!@ude#K^<%+ST(}$C7}E9KA1M zBvTA+lK<*VE5gB$!&T*^Had)gi6oxm0HDy2EuR5y%WHiy9GY2cwrOzd^KGp?MPu}+ zTb=qV3(bvOi06m~j-Rt6hWeK?*1YwoFV>R9CXYZi|W=6a|%Y`w3lsjEQ_sT)O3E=*+QHhG=POH$mT-mu?a ztP_Vo_ZRND>@rru3vnelzP7F(4QKe9@!nIv%%8IzwT<7U!3UBvOp)X$4pXtkLuy_T zk*K~6J`CY=1^Aw1hAzKzxt%wvA5o;vgN!h@fG zzT1@RLQkR9nr22plp9*`$E3!63J&qnK>m_hD>`x7+#>??wr&{ujk$tU_Hfd8q7m z#msN$rB6G-e?v69!$BhmwQE-sgOxq)XROnvXJ=-tQgDd)B|>pNo-5mNUlz zp}iv2A4#YpCI}0ZYkK`Z1&->&3_{T10x1UCU5X3iw(f;G2C|E3Ppt;rCx;Ea}-eBD`;v+mW^N1lhi1Gy?@&v#Mc1XeI@5lDR!rx{H_<7 zyy?;VONe%~adqGIYPGWpwN%e8dsymo8{Wz>YAH$XvQ$%$U>#cNkqLL2T^!9fvlH(> zHQJmL3{fp%LG>_WkF2RbVzf zgS<=`8CPC@NF5hB*0AYWvC7Y}gHQ#pba_7ez~qrCY+&$mR-a_=(EbG%yGJtd$+FoA zPM6xS)z~`Rx7w<$!wpHGxpaLun5J`QDSV1nLmj++Q>5vA9s_0?FK-;VCGc61@@WR% z3Ii~N6ZS4Pz+aP=HfQ;mo$Y^CLvfDZe8%Zw_KC>`Gp}FXQdF)TyL3S{6j#0cAA9c^ z)>OB(jRJ;_gra~FdR6HtH3C};f6KHe%=rEPvCP<4X_Z50_^UyjP70*M9+%KHZ9ZI8_#N;isJ%Te<^A9*$ zIMe%<9eZiE6ksat?R4oP+_^W#!NS}-dz^lXa@cg+5#a*bpcjeK1qXVWoZ<&&JWs&> ze!B@-5W1sVC=HC#t@)e|9CyL0IMn>=&=XUKLwi-D+fS#MWV#6D$ICHFPnrN+>qxdW03v~MX^If}nC3l38LjdMYuY%QoN0`1*=MoL-jGB* zGHW>zSh3p1OfH@z%0)(snLxr%u%+$Obb(9j^`@v$FP4g-qj|O;X&_sSPu}P9Ea0A5AvP%Xz{gXPblM+GpAH303Z6zwr?|H$GDEAt8wD`q$Gl0|^+xFvF z5n{FNp#@1pOLj20T$3F^Bf9;#xzj9Mr?JyBy1lPaFJ8Dbm%Y6H`8lu)f4Q^by1Ykn0O>zs2BCih94IxA*R zvO2A4tjOB-(my#XF975+&g^7NSZ?@R-q7bZzMoxJE<@G<@{4W0?Tf8q7^4QOq1*$% z@$Xroj|EjSJTCxby}D~%ZTS|}YTk0a z_POsn*mp6v2C*Mulh)b-$BCQ2&#^^tVxVkFc+yGUnGXvq$;G~vkEFTf?&jz=waF2? z_(}jg9!vL0t(kKCr*qFaki_hgZ<;rCA^~%0->UO-Z*Z?alxd(_F7(I(A{*iMlAZFK z#ZO82{2Cmib#8P|0;)GLasmhxCoLw|sTkbF4dsR9hHIgkg&&^u}OJB<5bZuO<#swabYU+Ge3FltwGLfAhF^+d_Qc5KT9InVg>s^qnBtR8P}U+Dy+$(n;{~$z<(9^1sdBxfPxl;m|n`nOBOBG!*(48}ks2 zZ;(r&ByU)@b>|*F{WUQzdZG`ECce%~>Xa3Aauuq5J2z$o)WG5`%13vppu|;%=%%6u zNl=+xtf6dF?DcfB(9POdEl9rt!E?b`QI`4NAiY2E>}=Mww6va%%a4EpJBaL91hU0Z zvW1(a?%N>Zf&+=?ag-B)4pT5(76!enAu&`I*hXLHgS#&wved~igu`U@v0l4d=Kv9! z*$_BBe^pR+a41ZYN>2P|Sm$-kD0c-?c*oW6qqka4qQ42VmS&Q+3NH9qExPzp2=k-k znk_w~RERfUjE+9J%3}U2N3x+ZFY5L3*R*FCie^YoNm9uK3|_6BVZ(}grtq+^Mc!4_2uVvMHds}>c9a@Fq==DekRjU_PaEPfvb>BgC~ zX*k9QHD62K`n^LJ_;NvkS@Y@DugdkR5o(5c{;cS}!8mfHmb9afoa)Lgzqf*>F=e7$ zuf0@3KEHf2@$UC52Dsn5NJ{`5L1~w9w=zAjxD>w~<<63@msJ&(f>m%AQ5RXPWB1@_ zcl-p(tp(_U7g=JvK`=^qRfDL5AK=&6Z4tar93(v_1T-0>@OC1h=8o@U9lbE$D{ALn zKIsyh)}M@GIe%3>Yu2HK9%n4LdWqOI_ss_{chgRB=O=rFcQN{YhbD3+d8Hf#g@wh# zx2Xck{l?W?$Ca3Wq5yTXtUz>${4@&+4cl?G!Y~Zb*VP_Q*rKj`?i;=vj(NDm+7!3- zFey01#hkRbUE5BR`ttTk`8E|X*Vi9m61;S`YlXHRYb`kq9dObK&dmUneTxl0!uTCr zXLAdbGW>f!Nx$GB@Z=R>4+w{b*@uw>mHb|<{Ll}BBH$BGpk-U%tN_8KpH|*YK8=*{ zw5I4L7g%W(a}Plwzq!o`R~0ckc%8p*?Ps6EBytofUF}^znR~9u^9Q+H>g?OghGi0l z7kA>F1SlFQ9;gP-c6QUw9C>L{579sBAdAX_s%m5oTjnS}$jUy9g-={d%)$d@*7SOR|&dEa+>~ z$g6dj%%319j=4muc{VB~&;h7D!>t**1#3mqhu83T&BJWeGb^Fy_vU7wv9eB+GXM5J z+w7u&@5D~^UiL{Z&1O!08~>1sYBAomBE%J+e=mlR^X|LWe0ax~SEG@3Od(D#)T#r& zJ7^wkDqeL_}<-|CP4FFfXO_QIyi)(XP#gUxjIGlS`o<%^e=h!j+v6HBlb?K%< z{Z8lXlhT|n`jL^kqDLe4&bn7K4j?O0kIKOmPn&TUZ5?CoEPCXhA=9W_=dk z^HznGAJg0r&d~)@!pZOH&%qbtzxU7y@Pn9${_QY*%A)cB&a;koO=j}9#&iOk-l#DJ zKc6K3v8@Gmx&QzHNJBHKzxR8XCyEIUyUykLP`8H2=RVKZp^U%K^d4P5G=!e8Z9H-F zT#&30ZejRXFpTamaGN_E@Mzq7;amY1x-*z*G97a>5>0TgivhLq?B;yj0)|-!aWHaJ zCm#saOD`BqWCq()v90zxMv^hV`NR;3py-qzBVi7Qg`+ZkIZesNa7|a29JGsU`gDRo z!GvL(WAs}&^h;@1Y|j{?x<=ev*JnJ*1sUn{<)}OY&#Q%6g&Kd?CS2NH{IRmzTIS>+iyWw>!@h_0dAh!@;P=PkSK2i~q&G7fo9{YiS_N%*)fUxv#8V=}Jp7LjYvhXuZg>pWj^=`E0*t zO3kvKfi@2O!AxL_lRAHVnOSW!8Yby*XI#e9+TZRC3afb<%E;V#ZUMw0Z#7NMvrjSu z>9kpu2vh08NPpUjKK-aC z8!svJ$7}3zf8Y}R&K5uj`a;a}O&5hHPTX&zF}v6!)^f9s+5DqI#8Z>ml~2P(!Q33J z)SHG@8PhGB^Z`5c^mjkF8I`X|F_FY-l~JqQE65o!RbE2-1I}3Qm8QRCdf1aobs zcbM$9p)Ll&5;%u@BrPev{kBHsmFpc1V?dwGL$MwOkhtc=%erZ@Ky$sSxvl5L-Xk+4 zXkR2O2DZE%DiDPiH4Xi(S;osFIB*@_v5+&r9U(tFUokyqxEtCf;stQmJrzYp9R6xSsXhA{HO-{Cp-1+CE%LbA%^7JHZt929#C9c&ZCqqAlsZ zmEO^3KAt)<<1iV#ppt^uxI`|~@;p+5WthKMNz#!1>5xJnCkZDvMn zWCsV$Vg4J#PtL-htT=+UcUJWq^N^pv`*W}cg+fdBMxPub`N91PFYdQrkqa!^mYRx! z%b9GuxJW1Vm7KKF#&f+y$wgi_Vdrwhs(D=ao|6S7us_UnsGbc6VrdDZi@sWc-F4)r z1Hs2H>gf%3{eXs32Ni~Xmz$Z~Y_igxYmlDkZ(7PM4r!?McK#CUX?u}}UE>;19&~eC zCjlLO(LMC2QQhS&ZJSx);m;X5F`a=4oS6!IcHG>P(bb z%eVH&Xyf#C7e-?e<~Qr@>59KfBRs0TZV#&fXG`xn1$0~Gn82qcdWZkZ1gH-?onNHibQ7q>`(bh)T~D1(!7R{PNY84&u1%~|b^))x$547b z#N|iBudT3V73%D38g~~y+^7{Ic)(A*dm!NiVV7l)CJ%>yB;VXh69vLoDOTH%qIN?tq`~WTVv!doC>=gNLh#N$!vrp(Pzgd}Gy* z|F!*?l^e)Z-o?R{I~=m zi(uF~)a@meDSe9a%1Xxc*}#0FzW+o}Fy&hlo2~lOA}R@2(DvOwP%r`fg0&G_K3f}u zqoej$9+@A|qyBBOF$3=whBcq=7rEVT+~f6-7E<}uODpXp)?#x0ne=3MDB(+GTy$QY zvj~yj$F1=M?aSM@N-@gN`&&uh1jyOSuHU_rznIYT&{t%&>T9&Yro)&EU6Zb}CK~K( zIVPxY(kvK5;HZ@o|9U#w1*zV^!c4HB#K*>?H4pMvu}qy~(~E<0&At$E#^kif@6noS z9ny)!$?idx9*H$PKHT;$!mY$oHOs5Km!zfu7oS{d>60=a;BJBW-GLgkKq4^z*Pb35 zpsp55POv>RIhTVmpwZi47l;$%c~7hp|I^mgRO|f!&O?G%MU+Q#_B%kp<@YP{t@#JJ zqCPccA@VFDVjvzq$bzlR^uw-6LvHBb1b+Qu5C9~U2gnFKm17ncULnB0+Vkn8V+>r0Uj48IS?;pv4O<1^kSoI8J6X8*4MIV7+M1^Dmx_Q#LWi% z%oR0&#+^1%@q#+3OLo64W;Cma<#!IkrRYpv&r5^?weO=lZH9HaPB*^XAV3A;5z9Hr z@MvI9LIu+2AHE>9LTwykbm-qtuJ82VLM2x`-V0ZnY30T|wA9R9Tv6SpPPf?%8uo1( zIXqw+nrD-Cxy7vd%$A3Rv(bFY`e{qM>}byZ`;4u<(9TobSDU*aO%{USl!iwl{`#Zu z8l**YxJ*@lDjx2w-~;`~c)F?ejPpfG9H}IFzlx0X*X!hFsHj#p8F+W3u%_Ba<3tNg zk;V4G??_wRmF!OUn6^B-gh-u9Ac>eufbYx;Q!2x4F=MJkszTXW{Qfw!dzMZ*euuor z{YGQvqf*I;e(r22f9`x!{RKk%L2jgLXV6;`d|(^*8K)I0I-XuQ7FBxm%P*P z%9gDi>oxfylv&%-!zpb|OOI)HxCP>H$(&NJaqTx92Nk66kmEv;9)0>vH*E^K+B)j- z%59c`c+*!f)0+gcF{?d&lfZ>4WzO`=wD-YmPS^V?Aa0E?jo;&ZQ6w88d018LL0#??40Pp4E5MVR$L z>sN9nxADVG(o9AjL}i|byz`eN?JT8H!S=w;JEsnP>h|Xx^t*mNSg(>ZF{Yk@R+gzB zCXa+|?tBxVfLBglhgWuO-7`XL(ooa_40thr4sVEr_uTh4qdCS4+?>1CK5q>&6Eqn1 zIOUJXXm{^N58^*vmg;Qx>y@t~{As(nqdoRznk-0OA<)qhskNS$8ufmTlUXe#m)NdGkdq4FtF=Bb zcPeQ}%1GH3(t5dYs?}?y;^l4>}z4hMJZ&;L==rmN{q?W<*PjmgHGf0$` zUrCk-Fk3dfM!CPTp;dvw2*Bn1K2mXCb?6DcZ~$w5pd!+)plIgUJ-qq8~ZqUY}zEWR+uwE?q9`8b`Ru|V?kn< zoRpSL)*jZs-jN4Qu?C{P!K(Sd$gd+A-vnHvn0)mOt-I`?P>r;Xk}q69_CHKZ{HWpW zgH<>9!*oZ{z|7(X##IR)-EUoWSq<=FX}ASR1-@c`L3i0~)k@M=cWbrvuTBP_@35C# zTD&B4%2dK9N}u}OoOl%mvc8paeIOMYfK}PsUKo?4D}or3b;!m$xQ=ad4V`{QhiJ

fTrFK=|X=n>vK`$Prd9yMTPN znaBO%(m8sM2&!>iqPJXmwm2VzX| zmkGatIsj6V7-tC!sdLXx9=fO~fZ2;(P3k-8J|$&(Ihn!1%|IvP{OtHJj2bh)i zRom=<^cJ-C=(kUdMj=BJG`|0M@luDWNo)l7+2ev_Mh!dYo;N5qpZf(JBjo`->5=Mu zka<+UeBJjuDSr{-$SWeoJuG$3(LrfD0%mh0=WVa=JEu1PT7%Gx;=6s?>3Qu~gF}gU zYm8WqRZg}+_tld-TfG|g8fBusfq z98C^S+qw~>!tuuK{gk1Tq44_i-CmT^Il%#)9u==e;Ld)E(v}i-C@8z870eI-bs8ZAy^xWCm^wp&) z#1lx}!x>(U|JcaT1#EqqVsjBEx0_3LR{ca!g55G{t&u)wr-WEFLzh$GOCC@br~A(g z3C53$+V1CbDd)JIn&U_Gs)uxZlkbK(VqaNtw)8XKEwrey1dEeS&A%gTCTb zVuM%_n1}^PRVKg}AoxqTrA;h*NNr*FH3!spB?lF~GkWFd*~S{a-F&mqt3SF6pL5u>7$$Aza4oJ7B5~NP}sj8pY5mBr~x!eZUHKaAk_|j zLnl3SZCyEq=6<{3si4c0-*4~rZ_71xyeW5txm=QqoE`0g?uIqLlE*6;Pv#yqD@$j7 z*UWno>4=%g!^Ih|l=^XZmS11@L?gE#Dj5>T07J$xfUU8Hb*QuV>^pPG+)F2SFSHi! zkd;cltk7N=StdAMe)6iHLtCdCkPxrzbqU7= zvPhqup=zP$+p)0Q-Wqa1!9#u4=RE&=phHr8AR4jJ^4c)oWl~77sxGe5zDr)rD7suK zJY(EBC2MUly-$C6o(<;J!FgLiSlH->(ls?#Eo^i#NO)3TtV( zlR5(TXES^qLQ`AmMh}TP`*tKZ!$~saVfV979?CYo-^+eBbR1JIIgky5F~1M!l81h; z9j?!MtJ7}1646U$)yb^$gSW+>g6X?id0Il0aNRz))XVCL5-JB<5|vUhzG}hU|+fJ=C60j%{-*? zY#}!i;&igidm?w=>MK38IS1(eFd>mxh=3*7(sXvFD?k$s_(=t}Kv8up$IZkRU`>bI4Tn}dd)cv}BVGXR10&*IOq2e-jH>;qwx z%!hVQp&xj5VmDL0l(kZQ^T#-&!i{JbNSWuD8p4mF!Ba21H%rWSsc_*_%N^4Ru`N}I z;#q~X`Uq)Xevylr5HK?Ul^*s%B;7_*8LWI-KwPG=2EkYHZVo+?+Q-9{=^>4Nx-AuP zR`eOFyqwjtK^!NzT04#H)cwj4`uvpS>ot5adYG~!ksxQ#_?@6e(ND3P_Y#&p2f}*~ z(~z<>_i+^lW15hC2Gv8sX(2n~v2%bo#yK;D-q7`ju(X)Vxg*-|Tmg|1bu@nh(fD=VXSvhR1#U(0`%F9_T( z{fM2L`DJ1#O6(oOJlmAH>^0`j8UT}RPIPmQU~Jc%2I|_-Cm$IVj1ntlm^WQ{0Bs+i zn^o~{8UW4>7@2)G7_C98yja5Yc1g0Q9BOBHJC=Vfoj00gp6!g5c<0XEkX!?hHe-O^ z$P*?inQ!eX&%1Rd;@Mp__0g|~xrA5byuTUGflk3DU|qUP_+%=@YSPB<$XDGDt`6Fc zx;WmhVbIv_mcHtDbic^MzRcp6IL3K2`Mg;9E)zuq;nb}e3Oi?p+*psQ{#?5);qsUU zVjM>y4Ztj-XNx$WKyFyQoemy6@jj@blHdp47v?5b9f~r`_C#oNL;FhUj>4tsDBsi! zA&^CX-hkp8{1hJytK`}n3MOh~afnz)Ln)?(Tyro{>%`zZfiWOG`1K7?*jV(&AVZhL zjv7DahFRZYHYeBa#8{(y3MA2!ok2zTu`@Rzi1wB*H(`0}OsLaO$k;udu2aM2`}d@} zo|*@9#KZ6K3z7{Xxw#AyW>a*#Bpt`@g@J;2wcph9-|#;3<0P&is)=Awlsog&x4H=M zrW(3}Wau%_&nU5kupg9;boP=k_LosoTp(Ja>_s_#h}Fz&h`J(71D?XH@~VHFqI z0TL581KF+@RL$VQ5*M*JDmq!^#JF>=6A?hYf2i$$M8eh|m$AN>J2ooyDV$zyb*D*% zyK6|bd^W<@sCA;wnh4a`s!coA#KWWvlk3dYMeNr=mpRo=5%($*gww2jMrr~`lwVRS z_teLBr+P62DWEO4hdwYfcbbPvx3!VvADPD*0Vsj}2W9?bfV%(`8zTeH_HSoo3}P#d zM44yJF`ychX`+YXep)EO(eE`u=!da4s9P<=C#h3%0NSI^TlI1H?R|}f?%9i#dZQ|9~BR!|&j8sYG%gs?nsr%K>J44!aTA#rYt1PBczcTJa?=C7-Y2x>)phsMSSJhs3}B&=u6=`1~s@xeHTH*ykQ> z9XHZx?8IP7TV(CYD9fslLhE-$q&?Dy!rgApCzGb>WPM=~;NH-mcH97RyQO;t+l>`+omD=_vruopSJg?=S+@S>j&Avn`Zadfl1SxU^F67;oQykhpt!Y`N{x!#+?! z-&k0K@)pKV)RQ+l`G8GeM!>II6EGdR2FQ-)9A+@=2b*U~@v<+ekXUv1;m37=^+(&M z-YURG_weHheVs4b9qkkJE~9UPA!^!N;XE>?K(N!SuVHVk4hR=J#)QOI1&5e3u|)0I zJtcS1r&J_x?Ue-{`bA8L`(im(>LQiCOFmwm7)VB-AvYY^_fkWA$A4F%N2E81Hp#9e zjK^LiEeD=kZUkGb_p0bvmD3Z823swcACpV>wC2vR)tHIDnJ-lV04dgkQR4;o45 zo&c~-5~)mhbcA@&jqf1|f~6rjDsdGZIzRq2Qdpa$KP{IgFNoSp2+3J9du@rR_||k` zT&M1|f5Y5oBmdHB`+Z^$rlWGA$8EZ89Hget>l&2kZdL#*x|D&D2%U2hE;k!jH1aSm zw60~LL+vVPHO)IH(%f36l?3V7Q@NtTK!8wfa(kT zz~fdAjgjVr4EGDdh-D(Xe5aymcU0g^UUNjxRG(JMO<4dOO$s%9qxLZsh<;+n_-C$R zY-hYc_nT?aEN7-K+oZ`@VKYW+vg~~!C;JF^?+w|TK}G|vtF0EX(&-4;DKWT*(*KcZ zOnhaZ{4V=LC1NsFOG?SX(1xjmiHI+SwBXG(8fbf+0ZNL{ZHdlXgMCWClfk|sA#`C3 z8s6d9u%#}hOl1?<1^H>;(MFd#eAE=?Gs{!z!5Qlr7a+2(U^)KU98OF$@30D$>OG!J zAoX8JVO}|bK5^V)DsC%!KZkoTW+3WRmHc%PzJjaas;*O$9*CB72ZFIi6-`}JM*xpl zTY|`3if3}k@&n6&cH}{$ABa=Yr?zJjE+*!aE2q!tII&o`sDw$HyDxFefLq4(v+W-F z&eVr%16CQ!QZ&8oHl6Wix%@?L2Q?jm+=IvI=Y0nBl-~XP+=DBFzmG+#qMyegTq;gw z)^a(L?{pRqWx;hN8%SQ;z86ZkpFuC%-d-AEOl!CQxzSx6lV@+&Vh;0hv5RNO;;fmW z%^2l=Jyg6gQ^R%q%-cl3%x$ufofV*xTcUJ+O2o8$29V#34!GYKTz$t37ERt6YHt`z z4@S|w+>A##(Hc7>NLOaPjXgkbVRGX4C(KaVrTMC-c5#x7S89!v%uAACGdUk_MhG0Y zig7vaGhpw!ki4YHUy(m8*WLcB)Ow$-F4d^t?d$ipQA?bm&(yREL0L9$l}yo(@~!!* z-*tz3TTaNnq!qyNVj?1tP|iFyke~_!&z~iQZ1j|n}1U#sV; zKY7p+q%+B~_OJ=3O1cf?`-jBzB#N1cxHsth=t5Gg-bIDsjSH1lQI0cwksn$JXLMk2 ze^7={gHtAV9rxl{lo$co>1(JZ&QCLkZ^eImX4X+)g}s>9RVEjCXN%Tdcs-tvj>v4T zbs0`+86KIF3~4MdI&$tVXWZ+l8F7syBmQnK$V5*%hFG4V#gPRMg%OB-*;R2)dp=z^ zqb=xVx{*omH<-he=4tHeaN*BD0utOF$|Z$TG@$vwjYQ0j-@n z+ABY8b;hn_4hQbND4T;%5{Ghe29qeV^YiJMS2WWkS;)SIfQ;tLF4$`f+{XoE#s&Rd z-(@{pByKwH0Hc|o<#Psg-@+$&G$31I&SySKBg^_QV!O$B{YwAzFjLgU5|=8yy8gLF z)MF_f2cIUrP`j|j$y1cgle~<$WFs~)u(_vOZa9UGvUQtQl5W%j<98Bu(wk?Vr#s8+ z)Gv-ai@jchjvI4oejwRL6z7&U@n#DfeNBRX5Pr7dx<%91DNHHFj4T$xzk{1XN-foR zNFJU;2oq{O&|k1gJ|2(ta*O<%)-xaKE?2t-F2gKxS&urN>l(fbuG_N>%C6wnOR{C* zVbz3pmgf7w93y>VH=4RjL4|YYEKAIsv7tV(x@qovaMvBLA^4GvXr{5-k!A74x&1Cl z`@vi2n?3qvBY~K*DCuG&!iHEp`+XSq&!e)cA!hus?mVOn+u-%548zq~)u@^**Ny9P zgZ4aect?b^gEe4MWyliT++Y^|Xu557Y8}o=_4befAf{}q?sV~s4Tf>!%l#W z0I+p0K|}OM6*Q-{>EnxPUFl0PLr^t1^}NgPbpvPB7}6e@v9tF4`&miyRQy?NL|qhW z5~Yr20RCTudY^bD$5_$Ezl0C6Wjb4$H?7`1TWEMPeAw&T)<|LFDq@H-`2%$h`+HXNtr(xafi6@K|1tOE~RR~ zHGLWQ^;;io>*@CqcuDZaPWFv*1N9s^f0nABUN5rK&xi(9{t3`c$n-)OeUa`CXSjR@Q#f zb5k|bOQXveC)0hkPa0JJY1)2$&CP#eR5%ig{Dy?@mHj#s({)lrPUlhpzMDdiaPy1T zNDgtErLOpZrA8Az&NCVWTb7UH4Kw3I`SIQ@9?0t&c@Fg3quMi4CBF=LHe3NF`KV}% zN5pQ({ZhRj%eXJ<+`)@Gl06f?hmDzo_l-x*!Q0uoIN5P|)!_9c!~U}tD;&>)s`=AS z35f5aXyGL4N;rSvh^+PF?iY6i77Kzk?Vuu=Tv`eI&i`(+ELv-~=^~G9i*e+&?Ryo zu$<)Y`cigr9)Oc2z$bCKv(=JsE7_?vp>WA{c6%_T*2;M5vTw)lV0>0+7<`0F)KI!`Z{XQyvM#49YTIYI3~8so4}nyg;{EBrB=#To7BRX zBh+ne5)hqqK~PnhdH>Ih(m%O=}(XZ#fQEv05V+e396GGTop8ixA zEwI(Lt9lD}6|1u6$7Y`3kQ46h?L5}Pk47`Fs^kqctNJt$Dfx%!GgyRRd5QQf30x@Dp3gD7R{GK8zwCG0 zUkRyHNXGMEnU-f?=m>v|=*?4*BKr+zt+&CjF(SpwC&rABwuwsH;RfXy9z6pmgPl2m z%v{0&XGl=-n8u46sdZhuFf!E}qHi8E_j^(DiO1G%efO3jZaH6WDt%pO!P$Eznmt-A&{{&`Nbps$B_mrnik)u|FeK&a$U8=$ ztgaW`u#QM%7BbL}c#{^O|8*(q@IRu(cD_(Hfgsp6#c!gQC7(GnEXSM}$$Zh#rmbji4VB$B%RC zY>6j3x?bj1+~czdZ@BDYm{8N{I|{p5MtD|VFBB1*z5JAW@pq`3|JD-1dbmMt3N#}m zOQM+0vj1CPPz%y~hg{8KuaCUV(pP3h`p7}wJ!BMi2kQcMHrg$<#_+|ZtBDjiMVrR> zw(q5`e_U^sR3z44lU$nEbc>b@pO2A&zqK;&>+Ec8xaG|@Cf(H!(`x1zqZmY?TSmF) z*WN_XVdCbPKQs)H@A*(Q2z6>L*k5kBfdQ3XM){QV?zMraUA6Y#36P0Z8kKp`+p8m( z42;uz4d~`-Z!H-(_fu1}mcw!SMUh5F9VLC90xmH_y6HX!kw`(dcfbXXr~1isVa@f{w21CWaR8CkAHct|rV8Gy zvN&J3-Mx6bs7kh2`{lw*3j_J0sC81GZr%YV1P_A6@9mk;82JuU;t`va(Q% zE|=IR|LZ2+ZI!AINybhKi+MV46@{I{&F#p})QA0EM|U>d|MUFEHVkT|{p`k_p0Be) z4&@pbZgu)8J*2!?fPW>wA1SIl6$NvOEJdBgO4Z)om$oZaJCGdW>Ga%p?)^#H9CyaO zz~<^(RB75NSj2UX?6ybpBMsfW_&cbRk64|Z(tKhYujges*&WYU9|Cit(b;NrWP;H< zoeFWfx7hR~>H+6>HdETcsE;@Hbr*Qm#DSyJyky@o@S+D|QPB3vH~k5=4{-9GnNJL~ z45m%*S%^HHq!p%PcR}(+c|phrZx%7)M zFbmpNEanRPQ|Z|1uG@BckZVVLekg49*)tD^rGq;D@7IhU4>VS8PmP^qrd^XAV-Mmc zfHt#Z7b*+|DRiJ4ccAGg;am`X;={GMj9Q}SiMg4w(_R>avTWc&1~I=qN41Fcj?L}+ zME3X4%M!21XX9ka^H_iqub<2zmRJ!12m7;S_YmroFdv%|4rYH_-1ATwi0`Bpk_8E- zE}S~>Q`2hjK*<=F*Yh0l5AKnirZ1HWb+$2(*JPPNH%>EeFJ81uGCXQYUtD*IMQ}7n z;?WjaWacrv%@0F#%OhEGF<|Ub;XU)8i?x}aNukj1(noCeF5u`j_`-OH-{8e-_3wVs zP^qu#s(uiq6~ea65ZNOADyHi})!^FNf~2#FY?!Ov?;3QCz^!4S52g%@w=ncoF{Z8uuhK?TFgy%%r_r zsy-N%lKxOX%U|vp5|dy^8r=$S9DD9~)hFrK_~ON7+*rEA-H_1^r8>hQ)I6SecI zFcoUXJ$s7NzE{UyCIrKfeaY>X+7J*YD?X|6Ljs!E=0`LsOWA`FAe(~!^IhOyKdavg zdPRAfF7nG=7q~7E#GTlDpV&{6_h}v1U$WZDtl*^dgX}^6g6(sR#XtV;AEJN#^S|>^ z$OAaxr`KuY@YVi2%)cY@KMxLEEA;k{bZ5C`7T3Rg`|o~CktQJW8<@It6!^EK6#;3@ zdA;(%|M;Oj4Ej=Zplp_l$9I^?-;vIb1EhyN zM>opHC=v&iMYRIRE}j14=eYjj_Q+W3J&6n}O(&-cH%SYKVye2#qm|plP19hhi`6eolqe@XbqJE;rdi~4D9_oRjw`R)Hux+(b28~sD3Qz3BN zo5law)FOWnnSqae_rC{1$cSJizronGk(sfT{iKfumxYp>I)^ zr;uqk%ih1U{FSpG0P9~r&sqNOEMQI{IH0-FF%A5G$KO{*Uwhi|Y7zKIRLM+RZ!dcl1vsR@T+| zvNp_K`W%4U)@{1jbi8-kzEc*E=8hfRa{t#8cZwT9_uN+fp5_6rZcC@>@}O9Jl{O2g zM*cGBhjSSb!`dbMt{b}N4-`b_qpeZ&-j6C|r7L&0_eN2Xuqm|`VoKpjpI-}5g-e^6KNi@B`(T%8{s~6IbIZ)mWU`I2y}IjIA%|&0B+3tC8NO4QgkW>?I~~q)=Sz=P8#2};ca zWWBg4GZCLaz$+)vq`via7+%wSw6snguGY44tMN?do^FxdGYX-=DbM-0BWaF}sp;h# z0YDA6&KI3;YwIQoKfE?dyFM3W`Aac5^1^LLzpjp37SHdA|ErgjHqNQt*#^+{kGeN}Mk^w^|%2Agx050Wcpyr3aVTi|?xIlss zC05Mjj>(y@NH+ZZ2RC=M;adoP!yGWQU;Z=)Ja0Oi+KM-F9+D+~uEgT!>MZa<@%Cl5 z67*ytL8bU%@oWHuri;nLpmN&(%xzBnmGwmNzxX&dB&<9Rm=d-gD@>+5TpEx`Xz+@S zfIS!~D2|l}Ix9NXt}eH(;{BwnTxeZY7(tD!S!Tl)?mUm*URDPH;I?Z5 z)Vmp<_?hwLZnAc^rnby+=6qXzjC~efy7g5GBKlL|weerrRCC?ccealV>P@#a^u^}L7WGH0zKb~Dvk~w`s*8NB#+8IE8)2-V?>rb7;P7-*SeKvuxz$(qmfnUV9; zHdZpl>OZLemk0mwm;WaU$4g@JS}C(LMYDPS>U{9Z?z^*|UG1td#l8}g+cWRkdhT`k zrr?2X0EnFn1>5}76w;FuyME7t36wfF(^(P&m(7pxACGe~FbKFfcr_AI7=!niJMq9J zK5y9ofbbu28H6Qs=XPA0$IBI4uTQ?d@J;@|=S+rZJTW6~GT7-#-W}^u6_uLn@7sy) z>3n67=;qqGF3a29*qslC?LR(^+N{$Bcwe4sWMefI;*V({8G3gea7~p zK-ZLJ^qsCzs=4JUJ>dzXS49?-5qtJeMOI+7;|B1?Vuq_j`$J6zjl^iVIZf7^1=`3N zRi~;G6HI(9HaH?E-B~wSg?m`11+LVkF@Y)SYCcga7Zav@=5jc4!TORseSEmDU4)p!4NlT+{s92*XF$lB|dGkxM&(-~Xl zw!QBVmueJyl${Hyoiv`|)I`7T;q@Aw&3gJR#3?;J%;Z+GH#C&UKD%jYVvOv#nMUYa zmIw=B->E57k;ZW|T&8X}{q3BKF{_s1B!^Nc%!V_@ zYqN<@9GR)1HPS&6Yy$vFoz218h#&UT@VzzHyfVYGm(BR7M~$$1^fS+dX39C2uKthN zHn|V#u22yo6I4qwXS4v?;cdf@| z7J7H<${K)OhNGv~T)g)hyhrE_)TY@=rSIyGiuY0Cb7zSJy$hAL@8S%W+XDIq%|%To zC1yiupIWXNJ!3#PG@dL~c$}in&TBxWP}sjKPrZNsx`re{htd9{p&pZ zGKvD0ai+dz`UEhQI+jmIr>g}87VZ-Q*Ceg64ub#O zLH{h-AU#&6v&||Vgg5hcgzuoUskw`Zq#AdCa1TX@k3%+3$Pxfup<9~BKwIS9^V?jS zJ@P58Ih^H;wP~1}61D4rW}BzpjQ#P+SlUj;x+O^Vs+;p&l@`%3urWN~m$e*P&VDv! z^nXrC6pDCWiGegarR)r}dZJY)lsfu*qnM)7j|sCGp#(L)k?N`<_~xcM#dG2tlbFd; z7gXP*TT^UE>c*qFP$&j9lT>Ltmd}?%INQgV`W&%Arb-S)i z1Mc2%dAfclHeqqBY5$|(L0|COtSX4D3Cf;`kpvC4dx%lo6~lz9XMoIqQr=T@x92hC zs|U8Ep9~qqA4KRfJ?A1mX(YQZtH0nZNdZDHkzU7H|2XED0G7PFi$)ye?{07RT^4$X zL5u)3jR}^ni_=>GhK9-zHEAkyBlAj4gimU_F+EWwoPW|DQxW=>1|in?{P zRb(;xQ>=>UAp(-;u<;OM-OZ=rbO-VO@b#X7aIW3=ure4WdW|+ZG1^F!5R4kVq##6z z(M}L0$mqS7QKCeT;3Py%2+_qDy(UIT5N)CaL-g|Aljof0_x!)SpW;iB+st)ed#|dTmY5(8Q9sAo#g%Sog?>uuM^<)%yA^aU11-W!SVtr zjh`1|d2P5oJBjlQvE92Pno_XKQr_hN^1;*sb^};y*PUJf(379 z2~Oub72O8)O}XFsHMt$SY@C(Yv5V#uPq~$?lQvw}#&ma5%j*2($3Nej3JKPrcGym! z%O=8|e3sHuWltuoj&NO8Re$)AuuMAASAT_5PEbj&rr7u^U!}&!}!97u*msG^YD3&(;QUzsM4Xv9eEC|(VH7ppUM&&$+z-m_3f?_ zDBdg1{VM;^_*5id{Lr%jQWdY;=`BSZokKUB%FR6l#oxG`OGN~8sI>r=3iGw1&=9Kn z&G$(HMhF<0ucghi#pqXK(t8p#_EV`OfD(+&x7_9FjPpkN{}pLerywIFPY8dr{<=|g zc~zDHzMIl0MnKp$ZC8|PBrL?}8BBJ8%eGnC%MB_boWb_ z2&^H^mzhJUX=%G*ROK^p+2noQQ2(`6TzQ}G&wVZ+=s9JoQ>I(5d|5XvP5XsR@rRGl z@fAOsdJ_kMc=b*M^xURNYF4HM0*n8l@8qrZO(F7Dly1|Lj&r}>Pf0<|zGo9h&ix~~ z6bag@~n>zdUHEjoPww8*wKD2?Ho8Lx9sQNkRke00}cCU*lC5 zl1-;qX#%{FLWfZ^4T!8oZus*6Mu#1PV@#ENeug>i{A)KMNSx9R5ojEMF9 zpGizop&E__g9ClDN3FiUt5}nHDr0o#iFp7SOt!@Y$1;^{Q#Tcontt@UqVR>VSy!>G z>hXCLlhN&59(=Je*KR8q?M9XUiu(h~>!oX+g3*5R9V0~{iCxQ^hy9=y@2(x&wamEl z+14eRW+2|I60ksfrByS*D;~ZZFX#0T_@|0?A{fZ;9kEsDMjLm*@_a zzK@oXd_>AG;LEe$@AacHdfTi|tloOwdr(4b(m7uWP?Ba+au}Gbeys&Q?_p3&e3onV zyzQTh#|}*iiCPrZkQPuwZHj|K6|Q)FS)398d-W!;=*GDTR7KE~TmoIIu0DmCyl+g} z6?jHjcR#bn0MZHD_*Pz(n^WGyK))H1&rh@TLDO=^DbL;P8;E}5Nt8o~^G{V8C+4rh zZ_mu0=e4Rv29+DX+Nn?-I^)MOX_!o(Bk}zr@)Dyw=Sv@QUBCERnTAd(t%c3lu&?2B zN0qkT-5Iz!00Rwvt0-?jY5PBFqSFqPGq#UcU)74(Rj*OIpl*8<%k6g?I4Czce$5U9 z<5&>X5Kf_#&tu|aywWG7=EXHpHsc8PZQtuSCWP=pL3^pWN$3=J02$izs}6d#K)qOS z<|O+s(e2M2HXT>4?mXz@(J_%Mntrd7l>i?Ay5PJK<7T?S+QY|{a}{U22T~bTi@2pAt*u|BmDXbKwOa6JkD7i!gVlwa zd%O?DP8L1_oz*;uow}#J>sfcAj0{@cjU>r3Njxdq-Ca~UF!xixIJ*1UMYMa&ZbGO3 zI*;%eU7(pFtG677Exr!5Chp$wNAMy7UHddSxFQ|cqZ2cDFcosUzM->^X`JuVg#K+rmVdx;qV#o z5$9JZKxK?-L6qHIKf^R28ftxM&3;>7@8`1X>WrF+L)UZ`L@u)82mDo9hpP+-o7UWK_h-T3>5(dW$}E%MDTk9)?~qzn8xE|b1P?`>56O>YoLI{ z%hP=2IW0a{!;Z`pKg$P}q*oZ_vcLNUZ`48lsn@elI)Y7wc`}n_M-&9e+GyV}Uzup1 z{h#r3QVNmwF)y$xPPeABxf5@SEJr>-i>5MhQS9i76ioev6j>)Q8|gN?01ho%sqmi3 z3P`KW9g8l_72@u{7Dd~0cusYbYEa|RE3{Y2h{-{oosO_Gci?6n%b@RK6k(6pq>?*i z&_RYpytP!4qdD|ATAlbL>*ZaI@=8zkOiCVzbCk7Lb)DyavK4*0S>V z(@K{?|9|-t;Z!1TF1)CmD~W0va=?xpI9;YFo&L=C-h2X%svNX|$4th`Y0Klc^Gya4 zF>;_%Na~|RZ62-0zXbLmsb*3$(*2)$y*{m*z;jzriv62fL}Yz_y)5Q}fhwnyzvj<| z=aVEbs3pNMhM6QGYf5cbpG~hI_jz3$JyQ*q!jkDYbgW<3efII!l)cp9f?{Km*DoQL zKU5dLUKNCh92^o5Vy79Uv?B2=xdUIjB|6dD0B-99{iK?3F%&f%Li_@h5+lXINd+(R zb>PdOXSb>=TFCA-o^?{Tg9ak)8NHICk1F@7(mVmo+nNZR&TW7eDM-mUuNyr34CE+I zfjGL~W!_Ps!^*9@iS^iU=06Fl$)`}zf?rtbDdmS&H_WmHwOc!zqTYY0vdpjzR-ClF%Nd?5M@OQ*cBQA~*C^bMzV$fu5-ZfY?T&-0Vt$+8{v>|SVq zhinsrFqK-|vA1%P+)LoVB0eI>O{9c~$jJJn^edzjE1#Zd2fi~L6VsOcs`f$f=z5e( ziYl9@L&r*x2+A*$tR!z~l%r&5{FhnOvHgjGpK+G$5O;$pi%3zB_wUa&A@6I#eRJf# z{ZHdzw-3Uk#OKE@F^RZkBqejjxh1rRwWeApIJ~Pip}5%%pn+^*+B$EwJ8eP6yw7A_ zCqs2{EDGNP`&)5j3OfWN3p2#zrus`;_8B`8AnFa zaAXx|_`nBG69C1f`C5++1{7q6Y%RF7066SUWBLc-oUxb&s?hp7QFSC|p;TomT+F17ya61K; zXJ_i zJ@6At-#nFg3|Eqlk;U*@(HL)O%Vqy{|`m^xc4ye*sWig)b|^$s+;}K;WnBB zbjO5EJ8XU_WES|@iz-)gc^t*J4*Lt;-YQ^j1FSCne116JAyyot&Y~%l(EkP&QAdHKH37yd{^%yC0(66%T!td4K-8J1)Hw?+X(RljgpYjWf*f5uIK z6mjLKc}{I)5g~ber7gnUr|qRK`6mWAL`3;Jd`gM z&%7lUdj0Y_L#UPHp)>8jtlv4t|AL#aoTwdL$7Um#0G`(4J%Bu?*LmDG{FMkFXbBnYu6fI1wlW3@3Funj=BvVDY{4R#_#6WMcGXU zSvy2P*PbULhHbVuYWEg78Z*GGuA1V@tdZF*tl`3>!Yt=RoE6>) zJ_n*fOwFh$-Zs3J==dMv-y(zrN3=Ls?z^7=^n|51fjN)-;A)CbxSKXmQn~fo1Oif- zP6-?l;tX9Evi`cFJisez4JH_#r$8zFB6iNWcGbP`UB!zwK0@eh<|6q- z$v!H;pm;jJl@@hrntI;)yUfBoZ>^dn`x6m;065vKYLYUj<{OhGad@Vq;;7Eh730Wf zL4v$a=MBF=SUwY8uC+BNe%S%q8xz@F^Qv+zXgE`iC-YX!a?m=^Q9(4@1K_50hf+!Y zgdFdIZQgaaTNAc(9qU)0>OXcAiomx(md=2UoAZCq58;u!K-mfJhlF+9;N^XGA>$;& z!>E~)YMB&`Rb;mU-`CNp5f}9CO6g?hbOd%Yz5P=CFmGLwsYRMHl9H(vM;%><>NI}p zcOXF>TwE~c7naKSkp{>=8CNYl#L9xzR+>j;4Aw2;B!shGRHUAbvPV3O+)bdY$Mzmc zQmuij4ftK^pZ7LKw)*fWOus8&Lu;V>&RQ^`PfiSq6pN15HNM+hjYx2)Sh_MQ4gt&~ zjW$rWyPh?_|5K2l^rEC9UQzs-)07$PS|+qd1(vG5zl(K$$F^tCfLF^nsEo*~LrME-|;trEa9XJlk=Q$mNyaM;i<^FEQz zT;gcB)1wtCyEC*BKr1_)ntH~SRz>eAsfxHp@`OsiE2&)}SFFCl)zC%~7kKxl@&B-v zOqxV%e@PYz1oHX}$-Jk1%0*&8u3;piRwG$T1XE(_0bivdRUr>e>cgD-q8+$?7>KG@ zbpGT$?^1*lT^qqGLr`*_*wqtanB<5rS0hf9Z||jf?Nt&ED9WKHjW;;2_)b~`IXFpE zq%;(>fH|ub)OX@BA)-sd1H)TgpsIU3Wcez5JFANCI7)3qpz`L9ic6PF_wjIQ_~Yb_ z2;IMY12&7@0jlXAo^w){=+r?H2$zgF%bHDC!hpXB#ziFoo{8op_E~z1?qxnkzNn$t zvu@2?0)f+?Rp;Kyq?XXeG?SxiWKZ2&J>U|VG~>gU+t0c_mD?lca20>Z(k#6&paB%38Wq{!hAm{wbLj6J%G`$?EHB;JG=6Ac zy)eV(z<0oohLh-Spe=%0j{JTO0uHG5pUD0XaUE$x>Bm0Va<$wGHA(8B?4|wex@v!^ zurj_I_1`awXElJPwD(?Q?2ObsQcAdxVP$}DG;*NRaPr<@@nKIDl&nHyHI=&^o5S4T zir|QVUuKIBw#MB}Uy$)y%?;tNkjm?Z-rUNd~}nHgvw3R*p-k2$6l3J)`8 z&S~j3Q`wlVF(&6DQ5{^Wo^{S@ALO3#vIZYTGCREjBF69OqJT|A9fkSd+z)l)#i$cN z1C|A*@8`b!8J_*aEZ%v`NFdhtXksxH$~VQGf74OF3?`#NG?lzq`_xpaHMWmM$jc}U{!r53LM>0PeS z8Xktugm&#Q5j=9TdHtTwK6nt2j}+slNflnMLI++L86KB^E_I_93KEH zrcN6YHsAVIZLAN;e(T*fkaUQ5h_sG~1D>Mu1K4gJQhkvz=v#( zw+QSMRJeBKN9einf4Ja~awoIlbcH2}MBlkjS#v$+z_xaCpDNiIk1S#>FEIbXfSGM_m z72}z{HuB^EuH9SmwFc6cWkj7%Q+D>iNwL_bevkgha$N|Shrrl%b8x@{-LtnF@{7ic z^tj)cpDOfIvj07~60kMBA-|t$<ZioV>@8U({ONzM z=XLxuk`1&bt4lPcR!s1RR<&?o3@+ z8BGs@HAN`nJ@%-MVi3qj7HZ&)Wl~ATIQ6R?VuTFXCZ(9Lqt?Q85so zf8;$25@w{aB`0EhT4ekyn$kUU)Sg76-|ySwq@JdJ!T7d=ua#?lgI#Ua$#8dB?C9+W znzb&jbd{AI!~RLR2+rP%=YtSz8)hKB)%2^y*Qwd-tm(t=B_Zz&3kan5-RHR~ZB+jO zTy`X&(>k|DlAE@HTkreT!o3pQ6x^c@U2-gK%OV*{3a2j5^~7)u!R-jO&D8KJw^B(U zJ$!VY!A`NT#cm?#qvfe}1*Di>X5%jc9Vz8B#IPLC7+nJF7|pK$7I^!=2xc3SiIZWv z8mlndBqr+2e$y79<_xtJnR_d1+HWcj(kbtB4ktOfulsWTFI0fk0>UEUK6Q`3bw<@z zH9S5Yf0fFCsWR?|P2}o#inOSyMm{aDF=gBD*KDd!ZV4N2wcX8i0N641;wphgj%g(z zgz<$z(ITRAY_vJf9CLs7(H1I9{Y0 z8uFXTIw?fA5v4F{?IRCtKkALr2CkY7o>#mc;(t0?;I&?6zyd^&`2D{JBCq6N0U(zY zo1`giP(ze{_BV=mkBBlvgGb1&jOJwUa7p$wx|OAaFEH!3skukCv(<9098&>GD@|9i zus+4bhjqTrC)+KJPO$?4)%b!~kv@V<05n;ZaPcb);-di-=i|F z=G$Dy5i`^d-ato*xGg|Sl?}eUvs5d<{J3E~johzH4I==9@NY&=iNvF+6 zG}Iqyw(X4{z1Oq>L`-h={#8H+Sx_T~SDV~5m1p9t9bT?ys4@a%5}97U*j=hs12w*! zoTXxE;dI~Uz<~~~xu=nLvU$ju<4=IBrz{J*1lou_f@nmIhL2Ou4U2$MR?K7=tK`z~ zoQ+g10s@iJ8q&9tIHsisRt^tbi>d4tRNvv1^4r0q=Sl8J?!ANNl;L=h!W^eAdyqKy z8psFVjg98p=(yPEV3nZLWre^<_@<^mKg4UrBbXC}OkuUQd+H;7+R4E;jlo@?MCymS+KN(0?nLOO z>F6e)oCkD57nPPjmtiIv0=_E)lR-NqSelq*mU@Z0-G<8H5dtN6D%`PjXQeqV&PdLF5h)(_Ugh&eQJA$h6D9S8^Wh5h|Hjz)Sn{c;nM1j@U~7XnH$ z#QAlojVi8;SohI$L1`(%FdTfGnIq%^`uKJ1z4 zQ&5arF!8X;kuh9YJM6eT*CKwEs(SJ41!eGJG@D1`l*Vk!H>3X>3fr}U#yCcRd>uq5 zx;CG}4Q3g~Hp4qE;t7N3d{J~<5?zl_=^v7H1B6Wj21S^mb1t5q#*X~c#VYO4$Jf{w zDMwa{hHQogyVM-A$**wREcgfhgb&$8_KVse=Yfz2JYm3mw#%?LZ zdZtpE)oLCt1h}@a>V3yux8|IRbmLxozL5fT#%Y6cuBB!zpV}_ew?WL( znPu!{78^XzARW`!EzB$KVDD;-JX(y6RiP<--~2W#VL;8Kuy?;A=nLI9T#lch1=ENj zSswEe`4VGz;qkI#&|Z_2;N3@BOg51zA_PsSFR%-xqN=1Bs=){m*4=5p+s?)5OJAIj z#NxmQFP33tY*liZ&wF;<23ig8%-Zur@s$379#4(KR!+emlvZ;b{wi<)&nbW|PdhF@ zwrti9UURzMM^o1-4}RJ_DRJqJ9rcCBze{7wVl;6Xig&xC>Lx4bWtRK<*G+TeZ81qD21bM2d)D&M zQQb?QzWdD`m;dGGKZ3w^@LviSM@x6c9*WJTS#G51@Vo|6+P2FC!HD#l45c)2tQ;N2 z`r3^wC5Z;#alh~6a&u-*5C8hu&={Ig@G*Ha0jl>X(Yx82pIAar@_rD4379dvo_0YEGqfQI&G3ZGm3ncAso6D_e37rnywQ4T`6 zxcr7Zhm(asl(5Led3wC=rOnk5J$U{lB|Eff@Wqp}IJ*vyZ!NlVr#TyCueU?~o&5sz zb*(Z8wU}~q{qIaeo~7$up*UKo&~wh$K-`JE?8)>`mRP_%>NNXa5tcN= zK5@$Ne$BAPZl=Fs*QZoFo9-0XXFU0vcL$9D(@Ne;=f@^o0Gbo zBls}dshd8){QU`QtGP%jqcldd1WxTO%5>l2&3p5%Rh7Tn-kS$E9C5-NU6q{pEr0dl zeCIF~Wl2W>nM21l^-0n3p7^rq{rUtDHKr6%q{GuIHDOEvXp}<=_EF&i)SAU}2v0GI=uS{2WW#8(4(04ak z0#90*YZuGs;KV$_;wKwazVa#!TOPE=HUD(duzz|K%SyI-G&_souQSf`9}ydDoqY0x zf*+IV4}0!?#d~BUXv%%vpObQkli)4iN(yWx)X$s3YWIf$ZI%?JvnVlW1yEe-_b;*; zU4J3SzCR#dQ{~B+PNEV)jGPCwVUt4zCPXISi)G^rgy<5vo`&L`tOHA)XlRBhz4h!> z-$zBrz*t~!n8E=Z=9?8Gy)K&k*z`$n|qd&O49LM>jwT^#*e^f7Jj=d|K0XpE1;lcJ$NMO`4rDeBRkn!8`xh3_!?GLA6_G^}-T@etM#ak~NvACkRUOuxL96+w8qE5dINB zF^nz=bg;sbeF+lPSfm`?TpcAg++Zb%18d&h2yfaciQ$^4$#WY-wgPl{MzTm_tW;&0 zT~XJ?q;JW*K}7L6K>#%QENM9P!Y)s7A&gWm&vV>YwaS>JK@Fh&EEN)Y2-y(fl#tGe z*w)+qBTy-)H2|NGCQj7Ke#DLI@EXpp#4ytQ#jc=ksm3n!%!hr=jftw>h+jdQ6MV^z z!xK%F2e17OB$3cxiSuEnIrZvVu<%HHY4T;JrtpY6Qk;q$UMT8mn!^&jH!N>X2&guZ zB`f9mUu=vvCi)h}Hh3%TcgAKBj^()CK8+)Gq1C#k(`D$=eqU#`58ip$Gf*tVz;)c5 zQ6T5(XNQvH#;>$D;8k~3?}SkoY&!^3~u37x69v1I%-8o;pI?$sOOexl^%>C`|gGFibX=4 zBlaM{-wqiR=jcXX;^tke4*&WN{zLv$?}yRKdS+K(A^fQm&MV^~f`oI)&O*E?+M1Iy zC>Y01O*sI}`t3Aec1%O@wA^I#lJMf)q-22HfHf_CI_PQ5h)!P%z%oCAzIqE*<1YX1 z`ZacmhPLa)#h&7bY0XksmhZ;fru&4qiQiUbX_itXALSC;g}dboR68!gXN8jA~~HOx-2% z^^L3@>T^Z1$yML|z#5;gx6bz`-+r}w&1U1+ar$0ytAmN&s<4C2=9zuQ%P-qUImhO( zN@~F^bK|oVyU`)3tPlOCTAZT72uk&mhJR=gLOC^1CEllk>01Mq60t$e<|?KPwB8I| ztfniC)9Xs_k7Z-<8qwKK2zSc4jc>!(PADLsn^$1av+NiKNhKX4q#yNH2l2CYH9wxN zeoN>~nH6O8DCF1Z`nrMR+valoEM*PwS5*;~#u}!S+N@2^|HIKT=D3qURR`*~32Y#T z+MuIrn(Q8g+J%13lAH||-#+IdwhZ3ExR*{v56h9*27KnmyJY(v+H4sPJTbT?o^8Fa8zKf3;bv7Xh9ZRYzXOm({>f z9!GHTe-Apl_os!A5e(}|b)45P?FE!TZMj4P1_cwZ{Q)HQ-aEf>%jO{`LSSo+2+a_$ z0V{HxY8-Z*r!9-;Kpr=TYvhGlkI2fN>6OJ7k51^SfJLe+3DQSb#nD!tkx7C&#c6&t zZ`_+c22x^hyDz?MHu33n@4X@`0}AXi-ey?7hq!@$&t_ldYeTp@qr@D0({no_Oe z^N_)=1t3i%*Jan@p|1?+-Iy@5_d=`wm68x{d$f2~i%_v_w$41D8W~>ZBN5a?0zNGu zq#pkGwa$ZSG$8u_EPoqStR+0$ak5^SH{+K5wL*8k$gK*K+gu%{Jg8X@%V|(%-KT|l zuHwaKi`g>ahAZuD7i%pyf%9KUN~i2;dJ9m#4Pn20q(QBJX8X^5pGq_ey*~t5Xye*` z_m|?oSpd}fs^UwI$3X~|?3S$B6A;0TG_xQ|(oKc}mPV7*^z*zn@1nQS;7D~On4V~K_cLV^d?G_t5(tXe{xP7tEgJ)Qq_bq)8; zf4-nPBZw2wW>i&2;vdbtBB(|liItu`c%;jd^~qj0zV#b?!$0tUXs&T4H5-9R(p=uW z6+(;Nv5=Lj z48}b4>>|rER7iB!{@o7zdS9Gs)QUa;{l~=bneDG!_vWkq6LzA3( zXHFC3ayc#Pv&xek)FS<<79qc*NJ!ES@jW4E2|0BA% zZKps2z5snmBTiS-OEPf$!u^-FA|%7<*Vs~ERPUSu+8-bX{U0#$#5#Z*!1LxdNJlfG z6onsgTI3uD@!bdD5QU`N&zc<;7fR;B*ep4~rrL2`L+)$wlWf}%RB? zx=*SeW<jE3|hgu%Lc>VYq)F3xVIiS{-t;uu#f6xeEy(~Z(kJU z|G?7rsJT9s_N#x>zY}{qR}g9qHI5Qj=W*2iYBr)q`-tCE(ekwxN?GoG6J?NijX)^Tt;cg8JYqAl+tE1 zMlK`Ew{Xd_nj}#8k(Uj>8hLK2fiuX4pJ@#yJzEmnDRD?wJ4{e5@SoNydi73y2W-!7RT+98!~=m zh3-j;yJv{jmaWFwiM{Lv_4Jm^C5hjr@}u4C)VpH!t;$PQHZGCv6WUnDXKT5`pTKD2 z1HhuO9#L}nDcX!$kyle8PT^9vEAZ>i5TTty42zz5KoxLqjQ z^<3g)&)Bh)J8bY<5#+jCz%eq??MYqG$=fHhZ+|*6fo}2Ln5bH}&HtWjIlk?XZf4T=|#D~gJO#&JEk+A$OBSb_>1 zzoM?K=^0@=W8B*l$$B; ze`6Fz7CnklN8HgE5mX8Wl}0HiaOdmF@uf13sq;sg(Ux2h{^U@T4ETfWB9y};TN|Wc zH!>d7G6_6+e(>NRH!}j73@sF4h-)9eJX1L4(Z&YrBk7y33)}Zz?4~3oRwZ8Ytw-L9 z7X`qkR76PHx7|o-ulNnYeX8}Bg}~+Gmr=yO&U?U(++WTxHk#&uzizG?zDx@uR>TsV z(8Z5QlERO$>>D|dP76#!dYA^9q0ki2kKB1?^CKg<``IViI*vnne&{tB*YaF9vm*ayyF&RyUJEFjAIQOYA@-kh9?3AvrQE z(ltBoX5a6TqoEIR`m?O1@UteSqfzCGY6;Jt24m?>P!anTP*j z3TZSUY!HS*B;Rd*Px~tM9ZfZ@XLL)t*XajHW;4lp!yvb!cCB-I=>2|{WedYo{M3sz z;^!2VJ{U<(vh1Tg zqEgH&9E!yY0@h#0>u^L-Gf}~54dLXV`?O{_Sz2ViNm)iHDgf`^%sK+F=TEN`_o0vB zMM9+H@K(^?mxoV~w?YV#Vc=J|ZPx;Zlt$-khw;`Bx~vT40cGkL9_?CHn%eOQt~NF{nXy~vSE&oij|0x2KphU0HB0%jG%NZzg5w|QbLXi zK5{z?lH*;@M>4$DC> zZs}bSVV>u+JE}9!Q5jyKrB`%{Ifo%F`ctt2Y^4DY?SC)@tF@OU2i0?W`xgA_u1k-O zC?1$x_xrC9` zXyn{i+9$tXbTs1M-XN+#+NTs=be7VC`AL_iV&2E2Rn3PZhkcDa&#zr#OP1knHT}M(ni#PGDE~Uy z)YR7Ofzmz~iC#00qN-QW5!9lZ9hv!w^UnwPkP))O7)Z8csq7DiE`PquPF+W_kmvBj zJ!1Eru=x|0pd8S#p1uyj9^EAvXG`@aL3H@wJ1p)~V#Cqj<_B!M_=;ZmZBP zpGvMdN0?I{>DXXSdvdVb)}G&Y5@p-%^WfF4*LG5U=H403ZVTLQ+WPRV(FOq}QRSdk z)ET!u+cSu;Xht~}JRSi)vmkYV0x-5XA{;#+Qjk7wrV7#b32auLI$Ei=(|6SC#!K^% zUHho|^@65=G;>LEZkqz+wJLLX`MIu z@KiKLI>MAek^#2S=)OMvcSkH6?NZqRrBLxSa#21jrvzo0$>D7{hb0$aUlZW!WPkO7 zrr6DR(f?qOTDCXfw|JxD!Byf~sD7B2Bo*y%Yk0zT@V}idVQ{WWysSktfyu^vWk1VX zet=L%1TPN&rguq0=rzZ_v$z}=KcJfuLGrXktPILzmYM_kt_^cN|g1e)@6L> zXSv_YzW+0m5DGJh9L}&~b4vu`{t?#^M4LYVmVgF(D^2DF)xrG;7^RIGkI<fAU@Rn%+BxWB2})o2AyMVU^GEiq63%+nk*t7aQt+m>GN*kJdw zzlw}FO3f4X5t}b-G8%i@pqBE!YOO8YzBhp)|P$P>ly(2U;p3;CP5J zAe|}{;=C2!c2=WIB?j2&$5@CXRCX(27xXlv7TA4*<>Q{HsOQuN^l0^ZHyHXlOJP4k z1}*Hp-F8^ssa@#q4p7!@SSObXa~L0bhA?UP9l3V1>(cjpZD!a4bB(9!Ci3$=bD_(1 z!6W)b%rY?A&L;&}$eD7VktErCrDeu5EY@jZ>TyzMg+Z~Kipq*F?Fk1*nd{I>AbPA- zVg#O7hjOrofh!GH`*Un;zzJ9K?$s=D?JEEPdD6(v>uK*h-sJN?=OjFhNzJ4uC(1rC zS7+#EuN@2R-H^5Dg}8LJavd7IHYc`~0&SDtw8RNDKf@$Rg*p1=XM9KeqWoO=gfL@J zAXh5*?RnI#MxmXIDrIo|RBC!Szgoe59|-6cC{)h7Z+{--%WLWjQRp%~QsDU-8Oli(_Bdwd zcWZ1~%K3-YBf`u0FYhUh1ff(3KJsOS-kAhA+nPH2<#7RX|UD~1IgJoUV zWB%+Ppgxzv4Hle~TV!io28qKBZ|DeE`0b({P9!mXCZf}L*_Epfl*P{(tE}JQWri|k zY9m1+)Ay-PVrfz$!#cGQLBh`$trwJw1HGU-_Et}o^W|`Q?2EI7)ceOq4B<>=XD~fPCD@Q$K<9Jh)G+Wtbdag}wh!_m%=RJ8(_t7T2ev|G_I=I+Y z82~x1jzr7zz~E-j%gCZ-BLxICWF4VcVni)+=jD_Rtq}@2rMm{h!ax^+X-DI`L*$N+Hy=#ky)N2*}hocGug4831*!%4MKm9wp zZ}ezVr%f*lwoG^SB$*bUVyWirgt-I&nG@ zw3-13in#txdrxMc#FioxC9fj&evF*9TlxVX_N5&JESo{O=!NZ;O;XcHx+zLDHHeMn zPvbPbSh2kjB5RI;Bt`51*<-SlGd+#BhEGcn#hy{#F%x7TJ+qojr3aQ`CDy|IM_f5I zENm!Fy|}%3N>$G@yMjVhfqs5cKP+rJ=Xbm*xb)5iP0@FOhBZP_S&b#a3ytD)-tz{K z-6X3i!<@m>x%;-4zil&oa@|F7+=R5F z{QVd9v%kDmxdf!CD&*^Lz*PFWD--v|978wO2Y!>BBo@NrtUE6}!F`}p|B~r1zdvvP zX=aO+NU&D~Ci#{EX6>SW1=tx*8WCACoRZ+V@I*_h-qw<-wETCbc8jlUzP7HxaPp-- zqPT*5ij@N#$=Mf75P%Iy=0RT1cDy*Bzq}>!Uf)*q?85kR-&l z0#43JY5WthRBuW0y6lYcjE%_lx*|NT$~O8NH*X8DL!*Vp@{*l;@3iA%R+a00U?ZX3 zteruBpZX?Al9)16EO$i&hx^W4a-H-UGAY`+Nm(}5JyHf0@KbfvWK&s&b#-(Ph&#Ew zJ)?vCV)nuOVwd-_?m@HkYkx+)qh25e>y}}sNMD3|%+l4Yac^#0535DRPZn}R7MQv| z?9hAI&%QS6AomJ+VnjfpBlbqxTAWD#RzuriVZJRP8UCQ7dB^&+DVk_E8A+GT6Q41? z`kx?iAQ2rrKQwmLvE)kD=;CKz$~DL7-y;FRqRgW3)|PAFQ+Cr>iOtj>uKy0m2;m_r zB$#Z37(4KHBA`g$hy3it8v)rLRg!jE%bTJ1r0hcT#s1q!t|k)I^YMM zy|0XmYVY?|P(o1{P#8j5K*^DoQc{rI=+MH@NQpEO(gO@25)Lh(fKt*SB@9C|ARSUe zNyi`!ca1w9&pzjV&hzXy_s!+iykO0&^%{No6n-e)Q3s$ zo$i?30A=&DP4Y9%wa**%gE`W>A7aYQT7DV*ex>OgnG+jV03q`j&`w~`dwY3%Y3k(L z)O!<}fG5B{_w##|SvSXQiyq^vz`KBuSCk#kF)l~O=w#RU%+(cuPwA-kl;cMbobd81 z?$g}FABgVFGg6ZaFBVhXKjKx`%vF6)R*Z8*0EH6KKxrk#(eOW+ZTKGn1M+5%IneP!4 zHJo~tbR(W$r$)=J$^ej-jQiBSe8!!;_dOcU8&mF8?V;- zxLL6a_=>smBKYyf0JoyTkk?-i*yuuGa%&VIV2Kpy2TO*3lepthGIzIR8K}pLmIP+- zu1&Bt+y4GudIRnug6-(c0P=@9&+){fv`$Q<^CN(4BvLW)yi-dURtFqDDHOn?sH>NS z-ugMwAF@6Bs?7P=_;{8kkK}Ly|E=OydURU|!5*-UB4dI(h7b^)5?bf|FN+Tm*a*lp z>8w>@qi!w5nqaPTLm#yr#>I2%K=B9&y!K?29VujN3~0%@@*uEt#m=Lqmx zg$nmNf#O8Ec@t1e5IoTs2DD6ymgBV5l}DwO;%U?7zs2~o^|}ze8oBs$)guK7@G5yp zFB{p33HXa+UrmQLRaF64r)9uJ5u9irQcNLJYBrN%b3&L}_7dZxs_R39?1Mu5$eoZuZBngXAIL zQI2#@v$)ef8T14^TE>}|^F#6X-AI6rfQP1AcmE49)4OvOlCc}s`eo4H_bLEZ-DtHB zti7AxDAHM|HgmK@yQ0m$EFN?R&zYaO0ewn`uCQc&v7DKIWc{{EW4=m#bdycG}XFYPu#(k>+b zteW{3Qv!H#gInpmf9zYH>=f7;$)ykg<&C{Qxdm} z&|J(c1kjp8&8$Z`{@MtDoo@35!38h6)$fCUd%-5Yq_2>yp(N{<`S)*s z>=MLd_YPKHtp3}V{4%DhSAeT?79Ndi{RgqnUo(9n5fH`N@%|y@_D`kNbiVT4?XyOM z507rjOUr`9(HEH2lkXp4 zmYDp(6I{-V0y)3o%Fq93hS3uwNiSx9Ap1v8TdsPqe^KATXXSSV*8gmpHBM0U;>1PO zEZ+WS*Zsb@@-2WHAAY`K@Rw}QfAX#uLSANLs%>r){t~DCYYO}?_hZ6K)r+f6Fqo?O zNBfEB1tFgvHEr5|G`DRNfg3NMI2`)3R`Sc_`(Ho$SF^nR?(C4%aqpi@72QtY#tY|% zD%by)@%bM&!>%jm)zY2&9})d7WbmJTMWa3kaO1p{^{)Tu^l%`tK1@l@zN7h1_M)LX zz>V8h?-u-{-NacASRoTa@&5nh;QEw6YJF(fS>X8XpS&;R|6khw$(Hc*fB*l|{wG%d?O9uS3e{{@ZTm z|DIzL%XC&%4d{I(12Fkw_>kj^@$n=eF%-h9GM09WjPtl|E&RfG`887UPvm$xvjdpB z*#Ict>IFbM(q?iyp;Cx5&BGLrLKG?TJ6W;1>lXO4D&T*B-`0J{Iv)J@ZGu~68W3N! z1=7%(1;G1ZP?iC}BlG9^+}7u3-ulJ4@m~?^t46w;n~{#*S$Ju2I=9ZSGC&7h^z&*- zL^kH!?C+bPs%qoOFo&Ww(SLvNL5*p=bbk_%&R5PzeFcOaH7jzSzu05`4m(_|YpzE)0|~FfO<3jJ3=*Z7gO@sgX} z=Q~`*!4U4TtueRp_R}|BkMsgfr;kKyXRdV90}NGIDA_T-^kh3?xiH*t>M9;mJGm9* zuIic%vCwP)N<-BL^Zb5#lY!sx<`)&^_cy=th23+J5!&^z8>UmSwmO z^{k}@?w9%iYeTsFMH_PuSiLW_Do0b=&v|NYGC&=5nb-D~km!0pzSyNQ8rK6gHVxaTsb%x_UNM+@%EiPI z5;XF>iIh`l18_XxpiUR?XxeO*Id$5RIopwGF%18L!+Tvk)MMepSBNYgCX;(l6LtfI zO%bQ+n>)g|bU#+YYdG@2iYNnGVrIxoU2&SWv)+E=Jq1}3N;3m)9odMkVbr#00 z)T0A{O<=Sv8MS!==#h#fi#qb{;V8;IHLCzQXHh`lvbFK9+E<6PU+bnov}YntBJf0B zM$VCrGvnJ0KySNc=}|gH_cl5!knBA|3Bn6Ov+{FDXS$lCEK5LJF6^Fc_%$FYASW5g zTe5;?T?Y%I4b^iPF>tCAU~0jSeT2@>-XZ}A9yhwXMW-fe_h%z+1_CZCAH>g&cai~> zTt`zWvs$>XJJi4sZ|st+;XO;mO*hQ$x;+7C7})OhP8<-WhPutC6<-T`L9Bn>hoNre zU@@J4-Wh{BZm)}c@nE94im0mL?dJ^6of9$rYpyeCKLF0br?^fDi>$ifM5&I2$&eo{ zb%G5Z>H^(ClSF>&MXDp&PvW}Y?c$L?&ybd$3wXy^llS=0=sWc%?|-cj|3sFT51arT z0gW7cb|N2X-?iJ(=f7}6- znr%U_lDBHBBvL*XSGXeXLYz;5TB@ZpPze@+^T5@cDY<=3k%vt*^_oFDu-3!z4bt8O zc(csOcMTe-;ye3v@9lP0lY)V+dH7`*?;-Vx$$RXfl(!r85*xyg3BV*PV82v23&^bj zJeY2OVn*FELG(2D`@OOjQSvFslMqo$A?a9wZ>G5SYuWZr& zOF0Of?~1a<7MJiSm2935ky}^jSG9hMvp>ADcdcvdSV|?DlL*I24Vz|c?x;i!w;n4q zp{+ZlBCW2VUVe{5D&FkMUJ3)lJhPkX3W}(QphS0>Fp>_{^9c&a;t8aS`0{(iOxc=Y z!%t#m5k3L4?WC>zQSU#`+K7y4xAV*iskg{d-ZUjk zXf0a0osVmO@dkwyfG(Jm{YcxZ%~5?MdA!+75AC5BSuizp1CG*!b}xkM%9e@2!2s=* z=q)kD79gf^NwS_y&>43h2=<1lntEe8GV^R6_w{~Q-jc*F^4a&I?dhhvpBnb(j_YFr zUtG#FvWz5Bix_;db7J+II*d<2Cpy*m5NG2yKO5|TVxP;3y8BUl7S49JKvCseZ?DJI zmmiYi!;j*+PhQbTYyrxp=jWxvzn5(UEhn*HF2M8bA6wVw7zw1PH=F3cS)0NKCD3pn zPmX7Gto|iJ5mYjHCFZZkLD|Y&Q$g#4><-U>yZjinA~({r*VOYtFwjFlRG80@yR54_Uy3eY)oa2MH+Q2$SjCnUJd1P!Qny%g%Q>#WqZY0 zH}aL+y=ng|O-FchDf5&tK&XrgeQB3E2<3++ z+-tPDzBq`RcA&1J|F2=(r{@~bjptEW%l_k5j15yY(g9z6eD5Jp%4zr;SdKZ>*Wb?YhTC!Nv0}1jQPos=OZP_p z$lLFwkx^5>;DmvYct`P^6oS|PvqL=M3vqWr#5(GNfm=CTITVJH#{z;tG#x|j-dE)~ zI6mUZ$4gJ}=017qp^&FQh@&7wHiYV--m-H%280WWz*Mxu8N#NjUO>@5qT)LCS%*0% znF|vdYqU&hamCB(;^Q)L7d6U5+N8y&D$iuoTW~}Mr;0LierscX2?0t5hSs%!r5Pq1 zU4G5g=%Ht(RB{Nf;n zir~|bJ;JtZHJ6Dlw>qDzaE?CGF63TtS=XA#sg6dMURzJkFKaKo))D!FYthtXz6t}F zk6Cmcz~+_ob|styzl*P1?QR@*`^3j53j0*_Z2?j*CQpFSAZ0i~(BYTRz=Gwa=MLZ@ z@J$7IpX)xE#jZpUwREVi|MUxhxwAwwV})TY{V0Cb)g4p)k%R36;N z!Q!P+-zePB!C?OEYTeX5rFP{4I$y(gJGL9|TGypj{v-et=gn6=_xeskgcA=P-Cc$nMy#H;D1mCg z1VEPy&$u$As;ZQuZxd2Vrz852aZvFbKmtp^Ce4Ej7i|*B`h2U>KgsgX&!H|Q+Qs++ z8gs&U{rAq3;)3X%iP=$JgI=l?fF${N-FC?V)m3Mso2lBhm`|4E%GHBSYazHe@WwX3 zp8R>bc?=O$av_*h$l0_>6r-rSMb&IQ6|@j4t$4x;BD4rvr@5@~DG4%`^%Da9lzI9L zR2Tv+yZX!S->H5M3zEICpr{&&=dC9~=>5HcB!b(oXdvGy=$dLexSX{(6~qDU(ua$8 zb)>osN(4Dr{;-U(1WsN@b(s^K0{1&bvV$9tOZVG^7|#dgNmbZ=7PPg2Vv~f`cPPpC zUgL>seRTmZ7+KeUBYl^DsWF@oxx)4hPYwksF+dW&Q<-rZpcapASenZlGa`fc=`FhZ zB~1OO%ML0@W0+>7p=a+V@u&kmy-ezFiysQne=C-n+$1rvZYm4biYXmM_u8^BQ2aobZsi0s9wv3S;PJz zz31`~nAMms=BHH2z;O6*C*p^U zY7rX;RAr&%;&Hj1$N@c^9kx^b|lhRfz-1WyCqP)mZjr zIEWjwsvve3ix5<#ljgkT{vM+cWE$ZGwJ~{5tmUwZy%|qbFfXa~YxG*=3-Fas($lCA_Zxs3%X_G2v9P z0mh(m(q{<%>^9gS7KOzg_hPaONF$pgMy##KESDaYer0@kgXSPhPSfhpHT?%e=uT~F z^E=bB97)I37fTZ(V3ZQ4+LLuF?NxO)WD24%=Y34metP%O+|I2^iD_TfhZo+M~PG-D8z|J7g%=vvK9puUXIql%kx!3&ry>W6BnD`q?{GV zvo^IfqvqwFg9N%G-%g13v@*W1&}t{0(oP>ZQ;tim4u#wwN zFIg>natu*?6$LC;5iTSS_6PD^TTvtE3Srp#d*3cbM>FH$u=s1BhFm5mfzl)$lv16* z9YJk|tyj{IWUS$0%X<}7BeVHxF6@Q5gvC4-hT`glm&YhGURS)Gd=_+r;e+Vbf-h*X zUj*k&)??q$>SVt`@=9`Pn&slIMS1)E6OMEZZzLmj24w$bS9OXueyI~DLl%vCn1~zZ z^Fh=yc})8^alL5{XR}{Rj82>rw7V*W>s2V-f}uN_@U;Ykqy4wrRWx>EWj1{sTxDDL ziEj+xjW+|;lOr~4#%pgA+{b@MFlg`flC8IfpVh^uhi>Ci0cNVdyn~+~1(METZ=7|I(@f|+LfhY=fjoielLGWzZVu^wlg&NNii=7`>9HH z7^}NHezS0b?~~Pg{VorfNw(x6^MRI?(8mgIn4E#L>i9-=%Aw#ewf+6{w>&~Bhdo^O zl&R$ED3t*6F~>LHUN!5fsu0L!w_Adyt*RtB`O2jauUcrZWx#xZnv#Am5gUf^s@Plc z`8e&0JEG`~tVtVs7vm?*l$NT67(QuhzlpU6^>QibZ4^AX5uq9WD*ob~?F?^}Bf=ZmN6|@v2n{m;>xIQ0 zK4(}2tVhp$JflZu>k6sXP-G4l9&L}-dOCxyf*RRGOl7B0>>j|ueqE5(bn6q6t4PDp ziN}=ZfO6PMdMA-JjXdIV@tb(aAxgonj|0h%w}WIoWZ@e+xyU9utZHBcg5V*#wQ#_( z<<(g(4)584BkaH$7IrrrcQ!EGI!Co~UB^(1kxP?^T9-V^{_%cn9?)k{thI6rhq zfOU^DlVtR|toY!GHD4gw3b6>`oQE)}_R6AfM4P!BcIec96*&JUs3N`_>K*z+vsHdI ze=anw(L?epfC_pdM7)g!b2w%y+_~DYJ7f@%_L7u;InozjvK6K=e0IMtiIoYRw`76b z?ZtR3I?hHoV|CG6*4p`e@X;$bu0&do?59cgDI(d`Q&P@a_1hC+_41VC zJ(SZoOA1L68Ink~hS#s7Dd6*yS;K68SW%({KZT&Lnn`d`kHy=9yI=nZmv3l(N1MH{ z?>iz_8>;KvNFI53rtoaoPr(bd;pe1!-glCyT zY?h(vA@5gStM?TA3Ab6=Oe?xfMHcRL~;nEud*!wCvFzPHeADSq3F*bNK+XMGVAMq&c6vO(%1OLk>{zn3? zb2AtA(!DqC4@d(#B%OqzTEiXB2fp0-TLja^eV)un?_nn65^PuZ@($=TA*3Q|i~cUj zA$bi8x5$v!Ijjzjlg!RjRY^M5?t0`x0Q&kuj>wNd-{zq95y@m}@dPI?2gkc`ny*__N;Ds{08YZ}*aw0Nxs8y_NTvxj4V zD~q=fuQOhbX%XK#1;nXE0W(`Oi1*e0R+DyH%}h++#?ey&>)NvOmU5IQ;nMI0$OHFm zsXi$#!9x1&dlE2$)lXn+i7})|*npBumT^|bymT}q(YB&qL;lof^81_Jp)DLQVhG7c z4Ee5&{$zG5eMMp&#_+Ii#GZ(1w~Dl1gpO&V^hetXrP?#yWQ9bW%oyLEHu9Nc@MX;H zpJmH2lQLHtXSX;kzTf7rcT>PSyggsh-tgCb3CTi0a*>1yHpeO?&!zgX8C6I7QFcR< z!nHIeoGe@TMYW><)=uM^g^M`NOxSf{fNvV3k^IvAVEzq_lTZ*^jJlLATO8#B_0Gj8 zSl4cMa7XCzqq@c1^*7?ae2fL;(Z&NPR3>r5719wNT^~tymkayDIk#y!`G8m$M$O39 zaNHx@@sT87^Ob_`;W+}fJQtU0%b7ssQeS~ropiWFTlUMvBuNu8A?1Rr|gQN(|~ z({|AP)rxWTS>2qUyKXt#ScrY~@}R1xZcAWtjYijRb%?*A9~&hp?ya)nS$_X!k2+UbnAY$Fk^5HV9$+kSg4t4DNYtk~S0{Z!}im5ck5pJGjiFao1PSPvIvM40N;8zLadH&c6puBL;^s%E<% z-`lUf+*$>|#Jog(g<}d+NHOWzO!D2Zh_IQ8rz_$j3+ z-0(DlF!|`j0^AuBn4^p*ds7%K9%O)04APkd1qo!c1hEmZK4e43KwF(Zh<58H{c-}3K zh^@zWPjBbjY*>pHDjh116QK)O_AqSPmv}U^M z$S=JbZP8A*Totga4JjiRIjS^?#%S|!!M-8cA&MX>TIa5mQ0PS&Rj*G;mLFFx_=#mk zYf>N5+N0<8d(X{;Ra5d(UMViz5SmRE;v0L%%U4Fj0ndt1NY>OCi;1U!U@}58``^;v ziJ|4?UG6Eirm}O#l)oGTLS%$#qxm&2elu13lRu+^ z9Yr82s4Yis{Q|t=;XbeG2u75A3R?<(75E_B|1yjV#Ty`}+lUi?iry)Kns)?S$eEc( z=HjK86CRU7^jSz)DuTDD&J&k1^P{x7+~6K0k+CH5B+>Yu5{)Vh&jAK;IVXFsJExDU zT$5jtBU#oKDK?{PHV@aLEG8l3Da~_c@+WPg4*;_Y%d!wq`4E=s^TpPqmV@6IK5q}$ z>^VS^LxL)3RStagNu45z$!blMWev@23jnL&a(E@&ntfTt&$z+&sf zS;Euci?abVt(8XZlDa1Jg%tT8fOvTIYl`cTUJLjQCc`10F}&?Yh|*X@D0i7elVh@g zdRi7`i1L_VvKYkmc`CCakfD0Alr0hqY)QG^H(|nQlAcc^CWj@=Ng}s5-#t7oa~5HB zmgZb*BaEm0X=Z+0VrxQNU?TSlRa}_wP>!9d-#{2dN}sNVOrIr5HouW(%ZbNQ zb24!x(ig3M>o~Pjp2lR4+AY&x3-KXwwXI?|5EsK`_0@elM}6rL5DQNI&7+oHI&vG$ z{>!c}c!7d_bW9@R5fVp{NFgbk8~G9h!fPI~rtMyLqqt4M|CvN&?wchuLM-9%283V! zHJ9zqLbt+_9CN-O7JMO@;_E(r;1Zslb3t3xClS)Hr>y5PU^^;D}`87fFY5bcUCg_!39iq@g|uhf!uZT2*+ z%uk-z&D+MrD~?A+I?tv$FUPZsM<2S2%WNm2B}&(esuz*~+bv71Srz($^wQYdHDpWr zXt612=E+-Ubl4j|Hon@;rb$_8AJId{splX3>bx#;Zw~EAGm340idwlT_RP=n;-$i^ z!~J?EkJ`U2fOzJ{st_m8-R|5V-w;xGdrfTAMY$XpCrBs-Yy~33`XF8gqV=qmQZO5; z=+4;tv^v`#sdnYG4+nCjAFICV2S=MtBIJ3>-NJ5_ULJ1CTDxI(gZoZqI?S65BCSAg z`%pE-O5p*zNcAd)&R9BaB=J>5#%mFhEEdB)J+PTZpcbE?v>=R5?bSBy8mSz24c+18 zB*qPP)28VNWfx5!)@bF)&=yy~p^IA4)etG2I50s?L$%*m7bZiw)(<6LzDni7{lr># z$tu4Vt_TUcdfBY(eisrY7r;AwZBZ{fCjd%;4q<1CVIxDdGrpRuauJ9)xkzAFwhTwNARV7?% zPhzSnO2oqTD9(HJHlIWI!~Jo69`IP57`S(m&kWkNLT3ijC#RV&M{UW05#{b=S>heh=?45q`H z*6*#p3y!I>9I$MXkDzmF$m;~-M13wVh(nxY*1sGa9RsR3915*_b1gol^^CdXd&W4O zWy-LfYIoa1GFE&{xbpxAR3s~sAit0ax_JEQ9jY?y{3ENvxQ1Fyf+*m+1Hv#_xuyKz zn8HPk=Z4e|mc1!-Df%efL2UxLA@`yV_=+?`?*;aOpn(+&g*(P!4Uk90B?OjV-mAa^ z%C*3KUaw-DzKZ2)B=-p{Mrej?28%H`G3&Q3dassAyGKSOPLB>K83tbXGA8+u={>J4 zNO1VPzrdNgUf?!nG4c>$-R=5C7{s7APk6tS`X(rB5y5&cOO z2k6<3Q=q8=E?;MLK0_`2X!LDu>wz?5n)y~*s2w+s%sk=B#^XN`mbOOBGg)3BSvnm(AY9q+$E{sr=2wQ z5clJ>sXpp;pptiyio$FcNx?V{U)%yOmSnYVwK7fF1a|>GlEnzMQM45O)^oBgTB(;t zk^<3??@o;9Qq>S8DhrFj_bT8kwx#rU&z+dwqRdM$6=E3j)8&2T`;M!8S+5q_y10;i)Kdy*#-C0mx;NDewQ!|1(dC2Z*9y`Y&cZajO=nK~ z+|;Kw;4vbkL{6u*V%hoMX~^@AE;WFE0-B><#m+eWMQj2_ew60a!CSH5YmC0)&5H zeoeATnJJkkEI@=H&af>K}Dua}bdR%ewF%{@J9%@{ob~oTLFU}>>=2o=XL-yr6Ya>e! zfw=a?&_RFe)1Lm1oQmW6sJxx*Gd6uQp$IYSPQF=#RGs05BUB4ifn}LW2c`Qa_}QD$ zuYTCt!;$^ZmeB{oQ^d1X^&2GphV+s`;o#|rwg3618NZ$^SpHDclA0UM#&~z>Pc1cwg%AmEDA!`@*fHtl; zv{m3E!+NNmHKVKdagUVH`8H=O5tXh#;Jc>Z-E}?&sU+im|9(HCw$;f~a>nm#4c>qXZo(p94 zVZ~kR3d=Y`IrV#Q(CHuvKcf`RI!=(zz3-==2u9Zo9O{jCL^|1}yW<|XT;`C_CL+yZ zN}?Gy8oDpwVA;>P(6HgN@uH;-tj|Ja(pOO;LVXbYM3&ZCQ}y+YN5)YsP6b2UYMI3@ z-{?-=s}ZOCPKfaLyfj@^dfavr*{4^pe&Ie-OwTo&9`SDIXx%%QfO6Q!9l;EISD(d| zo6O1`y;~cjw49WxSaQeZ05wWSm&b@BoG4(V1kmEx0faiEK{xUjgnB~cIfMBWKx;T6 zh#kcjB=H0;NAjsSQjLTSTs2a|(99+Kh!;`z_?#2P7A!9JOZ8kRbDvJF4YzyB#+a%3 z&3jNA^C6cLi6)?c9_qOQ;I$3QGS3T^gN9(e!GmHDjptX#w+CF8W0_d_v$-w1%tmA% z&9*G^IBa3^Wt4{~S_P2s)qDGeDIgz6f#n z_I)bu2p^@w&n84EXd!A!k~s2p-h=aB0P7FEAZ6&e$PJJ<0A|D~EYx|~?s1iNHnViS zj!W*&mel=78)b0j)FUUf$TEzVFE;UbXRo_P1(Bxbn{jKs;m#vzo*{|I$bUNi{?rHC zjCz4bqfIQ^Y5L|r^B%Xa$HkYYeZ&(vk~Wb`TXSuPCOeelpAV##6p6*zyJ?;~Cd-MuDd48_UuEHYhxRblA ztkTUMcvNrHaVQq8x+OeVt=8aM`GczoRrS zyy`B*KNd8SdFCwXI^)%zq}vl3hRt8TyQB}>)mazUJL_Cr#MpDFZeOv#!!*9J zJ#qpt7^F-@`Vr3C#ep_=0W|OV#;y`=nFAW)Efm*e2R>zjJY@E&F;{rF(<9(*WREYT z$3Gaz;Y%(^(EIWX$<^$bUllq-8fyTTdxpZbU&?6iF%GLy|ILyln&d6`yY8UBUK{*j zZ3(IE>*opEhCvk*LwY8L9nY2KBjLlse2hd+SJS4Ye)(%*wh5Ze^}r5uMP4 zUTvF|B!`uF$lR`nH3;qi8XNq^@K7@lt%mf$T<}WzkE7d7vP)pz(y*dqNU+6AVobg#ihlxlofJ8 zMnj&Ky5LY+$$JcCuN#>ek-pAnH}KQ20*zqUQ}fL_dri?!d~?0ib=;YMwjM=@KCHZd zu>Y4m=^vrB9&@AED$uy#tr^Mn02ANg=)s<9EzV8RRzfVBTv|q`%s{bcBTb3zm^rDZ zcA+z|VKWFqIpSG{vT%GqcF7#v0-n0V;Rj%oBb0@N$Yxa1L4>;sm_-j)8Quyg>RyL=&Frg z{^hCV=O+gE>L3p#&^3-IF#WY>?<{w2COGi~Zi+m>2RsUDwUvG9Qm{5>K+=U+v0PJYtQ{YZP z>6~LeswHq`NfRgb2l!2fj3yDcyD4Qv33xmWkUUkeGEZqmMnZg%9RktYQhA6>?fe83 zn29$eEKQZ7h%>Y2p_GyhANS*EX{QWcNOjzGW;wsN8&5pvq*uTQpMqlnXM2kIAyEflkn@x9Zy^W@IHK z>JE#@ay!KOI7K3y)-!|qwG#XGyY)EnwIJH-MtnGdZKWf#037qsfd3BB&2@>aW!lBx za`c%NCgaLm#VR&{fNIuo%mjJ0o&8w{CQMq{4{>C3%IjCCma$ZVT(;GAmTtx1*P|t# zH>~Gy!|V?o8GPgd5Jcfj>8H4y&Y{nGDdt0hrn2Dbg=|?i0)^GW;0zx5Xt6ptcK6`f zC6*Th2c@SHnKYJVfRR=eNi1xY%|V`9UK6}M%^acYd9hWm2rJnVI$haH-M@$zKxcmN zb=c!&6JfB2ZeTB5H}#0c9wjmYzWQ0R_j^;Yx9*ytqfD?^J0ZgbdqjBocR|FPe6f3H z?ZcO41-aKKxjmneceqZoPkmhYd9*dzrl6#j%lg&113Y|W7&&Dyzer8tqhEoI z3&?lve3FRnoEShirrvbFi|Re?a5@l^v@j2eK&?bc$qEq8ddd*`PNZGWh?uHLKiui& z&ON_-dDVC>1`B`kSg*Q(U&i+!Uuv`AvHZ%{uDWU#{fkJk$R`w*4{$eKIB^^tYW>vK zR<$L**J2$Y($BkuD1FZDaMTbJJ*9LirsKKGd(83Toxp%yS;YWq}U%x^c zxKL`K8e2{T-zti*i5+BFiRTE5y<_V$?qEcA~tjp|Ly@u9gqlw*UtY^9?!o^ z05|O_RPZKgzP^B~ zUy5LB@80{wYIWW?^&z?h}6PG!VbukHkiusliwe_!kv= ztqmI#4$m=?nJS_dhi;%PM- z7&4Nxj<4u+$pGQc&Q!<{z8UprE~rStyjj*Q+ae#Y_CSzwT^`2#W1ticST}CCi?9cS z-vr)kyu2miClj46>Nr{3wyAtWU@aA3FSW?)UbZ+^XX^8~81|jBVy`_fm-T@urTl>0 z^pzd<5EYcv6NVi?=Bp(CZJ&j0PQ8}dv4w<42RR#4rZD+43jQ6HwESuR%?IA>P}2A6 zAW`l()(Y0`*GkmV5buh_vK5ao!oIuPbM8TA6%|SGzRD*QeUMtT)CITvAMMS|P|5b} z$BcJfjONQ0AP@bV9jk!a67^u@{$b(5IeMrp+%MW=@jScbn4S$*Z`Oh5ujl>!(K%Qe z_akn++y4Vq zYJNuFYw|<3M?u8>R@U6AwF2-f68RkI1E1i!Q;o#g>WV7cT5s-hT?eRY|LEa4LZcSj@&KBymmYBOdjMj8%~r0BdYfaNd)*ig{LP;U|ZEAAIh!Qk0}HM zLXbV-mykI-w;9V=(4EKkNZ2@wS%pFcRBk}jhx6{U>bfO5MFUE|KMran0gEve@zQ~4 zz~s=lqVjS-XE94x$t9A!)VYzD6r-!CZ~XEQT`NZm}cFMT_ol`;%?st z(1ro6Kz#V(t&LlIcO5UhfErIrQ@SeT*DZg{qfVrSeYs6#&rRsUYYUCS`s$sH%YNQw z&hI1)%O&ojm;-&sK9OU}Y;aO*?E0nK_504>&(fRCKakfMd7HF=OQUDbL)E?+_g>2r zy)mcN3J;O>>~E9lpv9?+gZE0p$O0L$r!X_msA$OA+uDvt(dRX20>? z!U>L9A%BcF;AX4@Z0;6~wD+kZ(cXz_>G^jPIn}a^OP#MqkZa0NW@|*XFANDJEI4<5 z8T4Cn2j2IR-4(*Qg(P$2@dcfqah%PTxBGuonll~T~YyW_I}W|;dVcpLJ* zEeEN2K!wL4rp#*biYo-L0_TnM#mTad9zDKunKo1J}kqqqxbzt|T&Q8^)PBM{0R0;_L@ zgs){fyq{j~2$v~22D}8`9#wI*NB|!&!X~@%;dNDfgFEZ5zR8vUf@W*wYn-jQYj{js z-@zKbwvb8R%Lf6!n-MQ>tp}K`q9|1(WU?_Gth)Scxr&qxwa7-?5*7t${|=4=p- znT1vIUPkSruFqCo(eH%%3{(`F1vOYU#6^%y#@9C3sGe>hFMUQdZPE?W`@?(WwoGks zvqlEob22eudM9VVtqqq@;J#d^FG2yxJBzW77g%(LtB>PdP7m5V?El*B)pNYcK~}mC zY-kEo^CXZky^pi+pm1U?F$O84HS)RpEOL>@SZ=9Hj^Dm^mc>jIY(Ka#-!N>81tcWS zCcK1)-)E3v|MIS~fEou%IqP!2Un<;W$=|DivuC6?wd2s34*;1FkU%4%tps}Y_ay;@ zfkc@kUkADui;a2{`ZU;)^aB@1MEobfGHU!XRdQsM4i|!;9QyiVQp*&eso0?OSq7*| zwO5J`cZQ?FeLT9v2L)xw^p>_~kQU|mZr%?SE@wj(BT)>brmFlHO{(k)BXP~)G_3&g z-QloSZ)Na0 zprzjG!ZDsADp@62;;mKa$zFa)o)_~c&%I&{@4(2$+baxMYAh|L#pt9Qa#h^+eaVTA zzyY_1vF4!^66I>;<$36U8utQ2vsp8L>B{N45kZmy5e75`4=WHi!cv#9?#OSERFUrH8)$YN)m!Efv_l{kn8zH zoy%yGgqCN>^)Wk^CyMhprxYi`!xX_reD8}IWN2Waby*AF?Q9S|viQ)#OR?9bw9-{8%l7 zRp&_SY=nE5)IEo{QNa%_ICNj}X}SjoB7zXRh(9)1D(BRB)od^hE#4i6k^y~UNt*3j zW0MS-7FJj1KM=jJ)aIBH=kL>{eMA=vpL!kr;=Jzshr^5WOv81pmgiBRm)k1<(PD*SH!VV@Ag2l zYb(b742Ddve$(}xvya3 z_lRrdB@bc)!aqvA5RJ~KkIjv?y09R=WDi1nw4b%5zaVo0=gT`i3h)j@l0ETDB`v%i z;&I*0n<2&z?3q!8XrJjw54L+59PpT$7nwOZcP0ry;1S}YR0)@J;n$5k%=#INpK$0& zsGAsQ&h4ng$+ovT#AV1k9N8lD_AU+1U8H(7080x{H%5gS`%B}F!914dBx4E=sm1Gx zgcfc21Cd`h%OK(~DszAOBNR>T0B1v1oU2lI2lg3|>M=IBPxXPje&N*@*-C@genxzl z0)q6fi3>8+J+wCrcH)B|i6P0&cJ-Agp-~Y@?x+?KJ<{oRK)Ly5*h9T0{WP&fHZj&Y z(OPMe%kA}m1_#r$Qe7Bvu6)>27$(aKW{HBTkq)V*(^U}gCfvN-ex;YW3Ya*B)zX^R zHeRk$6t4;H8S7O%Uw%2KUCR}|X8m;+kYyNk9xBd~#$j_z)MhI$*7!v@#L`9;gZ$5+ z+io2-b@G~4qO_LDb`0=2*~WEiJ=5;`l8kE3l*;5ho@HYndoN=4po(|t5hp<^ zd0x84`8bdBPkQEM^z&eq-bk4h#gGWEv9hu^-JuD1sCNt7MZc~&P!NQtcQbl76nA&f+))~aG&Sr-}h)Ln(&5k$-Z5ZM# zCYt<=&N>)xu$UL#ijIQ;sxc4EGIR60ENxVEyKXwgMSlWb-iebR+tQ**qgxI6yoWx> zuvW;)&aK68I-SKHm}aUcyGxGliGI2r zJV7X8katkXg*~#a&m3;W*;=PL>|g>cB?pB?68ILZiMpb7-G#w}&Q)q0EcQ-Yh@_!j z8^?abpk!}g%yYeDnxCRNECLk&O7kC!4Fr zh}1%lc%5K9UECD^>l7&6btkSsJZ>|+JGRtdnk_!{(OEh_PubVb?9KGnVbmZ={LXiisRm0HS7*fUE!&TaK*9^wmBI!#k6d10l5D20-;y5oNTx+fECJ1ogb?xKrf~+l&FvG>}o2AB|jQB zhRn>R2M--Ty>6M_>kdJSg4r)nRLZwC)G`G5J}2%!fRHs`EEk*j$bjwM_~?wyVFOGz zN388~T}ES2<2*Qge4&TtXj#H_33Cq;98=9=<_&IRe|mgj*GA+j(#h(&^(oWN;2jyG zqUYnEh7ieYmX=M&t3r#8jhuO@lTH(uLu|FC?)0oH(WNcNYX3FIo|0AU$CG7tW~Bbm zA1>l?zE{}%1^BDyek8_K1dy8x@T(ldGPz;HWcY#XTT~)rCuQnn~ z`dn$J!XLREZ#V@seLLnA&!dA?~w z?lSAepCUp~SUnYDSJl82j!a#Dc~oSuEhZ{jrX=L7vh-*Z=cTo3p?jBF1bwXW0C1?z za_ux&_+0~abXiY{SIlPh3iEpa<#%w!bZ0p&5Di_2`xTAm@5{!uo%8|pa0Ew`*_2iQ zOs;S^bVNe+s;VZ{LqwygY@w^7!tvKEbSteaZh|H^^j?mfG>j zEHp7y$U*qL=ZxoglExz~i~P}7-8^gTo91W@mO=Pn0m_}qWM>s(eH9J5A>2@=bHfD{ z8*z7|x_+xx*!FJBU$#_$Bs4ih-pa2H#NYa=A`x<>K8*odmnI3Va70EzG`|(kFkf3y z7yH9o56c*}HQBE8$z5e59Re!oc^ITp ziqir?g0F8*LvAorA!{q=Y}38F?&pK7lES#w3d_3B4f2f0)=1h--b5F!PuHm2QAI{@ zY$;PE(w!fauf4CwgkzmI7)g{*q(*NX!!f3OGNc$P4^@PavEyAAACOcY;RO?W&w=&R0gJ_@AOuA`WB5G^)_sgE}a-K2T3P%o9N&ES&d0~pBl_zfQudEndKB>eTVxqQL zN<1c6ZpXI5U6#kls(Ua;k6IV$U`?IIXr@8!=vKzhv(2{##*i~I_hb*u8(;rceX%qk zhXh!eVtgV?1fYz!qZ~nRx!tH@<0aLzi=gfDF(|)?{bs2>z)t&|!1_Jc@YbL{1dJIv z;3Hzk!w#0baxL@?Ol;r&oxHYbA?S^b;dBTY)J!VJkYE~N@{u7X%Q4mH49yIyjM`+_ z&gV|C9>Gl;>vK3FQdMhmd<+iStE`|Hs^)9M#8_U5We6RKmFlDMNW+G^#Qb~n{agagxqTcP|Ud}gUT}Po@q|HiqiSbGDu@p(( zS>ut;9r za{TIRNT{^k^)&yuu99_W64@~2cIJ2C{_TsNpQT{hWvanJBu zC3@o)0{0ix782SN_L>z=61y9Xx|^ag{zTmvO;beH7*ox|U>I`$a$PyK%Y|pIn6DfM zs~laMgBA^(g&y5ZfX3(Al+7RV3RlLfuS|b1NVb9&UeX-Rm>IcHKBJ$sxUV&bn!0kr_7d9DA9}@_{Q3S zBMKh=e7iydVWdS-$bMu~DyF3GX^F|;NP;9Z2g0u-wrnMGl|6;ChS`p%H6X^Z4*{`R ztxucruvRmiGH;KDiO#4l;!9XDsGfzQ^iNH<6z#H2BIl$&@ z!ws;`jPB+`Htg0m2rSSnDzNrT&N(G|?m8%CHLEsc$JSr~hmj<4Kn8U)gfHaltEmvm zlx7-*x^mA+8Ns>LF_YfRzIF$#+q@Hz$-69r7q(D-LOKHMOSX0Vsr$z*jB>8&gIiZj zg&RbSIcE3(o3^>yG%5Db1;D`Nqld+;X_QuQ3I9b4u*53iA)z3aSlP;g-1ae&76CsNS%Ea>OHjJewV?y$@KbHm$s z=?H?6oV`GZtIpDHR-7`@2bGB_a8iCaE&d!6^dR`m3%z{ZVJ$CbHlAry^aU;0I{{-+ zfwWU6%ic&mb#|tcBIah?eyb5BK4`qqX62OzAYPmnaA}Tv zT~(=?`OOA=Y3kDcYmKV+gc}Q-Qof=di3Au5`?s4vgF0RDZyPQ<$wVnP_eii$?<1wz z1@4VUH&rO{h5Pk(@3)pUi-v_|wApA#TLDH}A&7ZKQv1#4XW2J*8Hk8zNi|iKuB+~L z-OPRkky^dE6ed5Yk(1+A2-ID)2dV+L!}h3>&8TN6KQw$kS#;PppRwSRC!4V3wT?rj zJc5XtUf54xwx5N;`9wttaea!;IPyvqUY()`q)zvW4#dOdGgwg8>bAboU1?VGe8OJ2H%mx0Q zY4|I<=Ni8mEMu|yK}l&kF4;cAxYDx$(7{cyWCz$)lL^@bDLx)ph)}y)yTxi zUO+WhE15T~3+-S_P`8bC4exHP9tQ1vpb7MOR?GjKgmI%Rg8lB(0kL*_0NID0S$fZ~ zvt5rO5_lLe&4A_?cq$KU*t;a_S7ZNpIdnIDpX?3+PDxDRtOG3Z-)rxEw}JV-Ap*L% z1{A?S7cLLJ_LeWz&>a3`0C9s1AcmFB%TDfGDH}BR6(`6Khkl^@kitBRh%CHP3}2rQ z7rIyaAt8AI7z%cbGPZD_@iN*tW)j$ir2yPhFU`s#i!)|#Z~eCYI_wz--_RtA{Y>aJ zLHx6s%?LG5{S6KIwfK30jB4{&EY^N6JfQVBibi42eH|$K3*VeefDY@}O~l3*dgBJJ z>589k*jrOh&7scD$~u#hs~hy?L!Y@wP%0*C6FPS|_2o{8J)?pNnn0lOY-s1(^lOuy z=t~2Hp1LT{*IhOG40yq)m3tRFR(jRA{QId7t1pXZ8W~3jGztNDDYRQR5ZA9Jiz)u% z>x`dJ4}J#%gn$<88t`A82z=jai@5}uYzlU-0lh9Vpdln0pC(g!e9i=rp_U*yq|*SW zH1Y--h)HbK$9}1c9R>;Dpte2KCT~DuT$!d^Ww(3Qpw{Nb5d$H_BMMXrhXB(`9IE51 z0eWwiFS=27S-_9fa}f^otXj~&_hrB15(;5QIB>lYDTHkWe4|ez6%fMCh6DDjsRSA( zC2klHG1Vt;Sxj=U&q7xr83!`anz}!-fFyn@OC)P9U0JExFL%8uDi{i#L zx)qCwIjD=~@24$pBB9MOuKsZiKad*V#{Df91POI2?5`r-@S&4Go>7PxiMga$JP7go z^YY_>KU2n!f&Y5uMnzzz$g)?-ezt=C|HuC&6Y@f237Pnc2`KVk{W^%ih*7+-|3CT3 zKaaJG05Zj}BPGvoV6Q)q_`3`Kc>zZ$oMl&-0`Du59eLq9{^_*&xBT<#Zt$pr5GONqK)F{Qu98)2RRow*fDWYYiBC4Da(EC~plQ@^OV zKbGVxR9@=!@aM1o-Ff~D(qF^Qa*061+Xa{Z{uKWGwg2{M;V~dI2bBd(H2zI&&SwD8 zQSGKI@DIb`FIUxglM|pZ{O7WNy_tW1i+}m_KX?1T&mI455dJ5~|3@+QE!_UM&vx;b z;>-I0x%bw@cB|wGSp(mmm8P^wHm=8Z# zw5}BOca@%hCe+#!`m%09pX2jkN8z^={7+7-rC}KzxK3uGjZShiE?;l+Iy*XGv9d)F z!|ibk+#TI2$IsvSmy@BTVhPN_=-z9ZfR!gt-j198-E#Mz&+We)#s7`38HthXD~L#w Vv@DiK(-Q%Inrc^6id1PhQra1B9%yK8WFcXxM}5Fog_2X}Xe;4UBT?(X`Uvvbb9H~SB~ zsxByKYIV=KWR5ZB_z0Ai6oQAvgarcwgBKCzmjwfZ+yw)J=!Av|!p=;?C_Zv6R8!5{9XzCtjlvmyw4L;iJ9NO=1fQxzS{?|( zzbX*L>{19}xituN3yLQSiaVkTcdy_h$IE{(3ax z(7&F}(-o`tuNZ{eCxDSu@2{j;|D6-eQtUT>$G`&-N6ga&nkWkxi2PTk#;mu{|CL#! zat3hB(l}2}`LMq;DD3S|-utgnBD`~8#;j{D4>E24{d8a|-u*in|38=t9w}&jzAzsj za3z+R=y}7kC{K{vHMB}zj;DvxQ(>LM@?)_{RCxUvqamW9VM+HgLrnNp-g3*HhZm@m zc3SHscs|Oz8ybjv7A}*Ga7+nu;7*@#)q(Y}7Cd?Q`0|!F(7d3`mB--0hbJcj*5$Cg zz`T{PSBbm1qIIGoMNUi4GH=OW->IJCS(-zw_)-Bczc>d*9(K>UZ)ZGDG=2t8(mF)t zK2?S_7+9UA<7j*}!tM8vK?U9mRnI=D55Y`%Mm4=h`I|SEZ3lvN)M-0bTnBBfp5x^y zQjm+s3w3ztduX8rMtC3dH<7t!nx%DFihhmq7P^Gt;=`D|93HeY|<=D(|}oPew}c-C|mkW1RW{la(^#EEJ|(cPoK(zW~Hq zi;%%w5~UQXm>;%mro5}|#l8WZ?W&J<$cmoF6&q!NIV}e4F$)J<^?n-XTk%*{Mg_@` z3hC;VFf!{nVDVFB#1EHV)A|H$)za9V-9CR*sbxxg-*CiO$&|+(`JOU<+_ztO z-WK!b>>wQlRKncD4_m-rd$?Rqo-X!osaB<d zbcDXDk>+-3q9H`g`_yvq$70#1$f|tm#{x_qR}$P_Pn<9BDjUEW8hHO6dNV;Vcp@$JPnGHcGh)<>kt@J^j%Y#ezu2<=$8+m}6e zseciSWVPS7FNNyNTS-u_rJga7^A>V!MRRP6qJ~bqc}Vmq&>Rcu{UhDGPdR_}@Zm?9 zK5)v4+<{JEvdOp1hXPD2+bvn%uJof1`lsxxF~}eZo4tIK-(alwSkrGkS~$@TVSNld zY~@VkA5(!E+AH3cPkJ<(0ap!*Xy+v=cg}Q7c_Q{9MfytoChe*zd|>s|)C-K;a=_30 z_d$QaQuKjuY$2WK+}G{4N)q%SNQ+orrr5=nr&4p`_yMA)dk8OIV%0TRgqycnf!`-L zH1zDIJ(W?AxK1>zEwC^(x6a&fYGD@_Dx2y`S5ij~2XZdiNK8-P{vxu%M~9y!?>48a zu-j>Qxajt9L9c}$HpNH?dh#iOGH|w_i$1N9u8bK)6L_bQ)?-1l3E#h#b?5%g-9cKl zy6=z;Vkx8|QkDz%ClTDSH#buIc!4@wgI#aS1ARk)?Vf+SsJ~+_XU`Pxdrt?2n99gz zcJvU)!(T}&6%RgYcHIFN1w6qWzOtzyv^m9tc(yjqLts9##Jg8rUN+WqWLoV<<*~tY z`76GzoA8`aGO;%lgx6i2RSANw+2nrbeJF zM>j?1-Yd!^Pn8;&_|d|V_?{Urm*$Epmw}9M!fA;@W<_+Y2sfZ_yPhLTQlv>S1wOn_BBxzv^+nhFJ}=OSFI#S5t$ltd(K4L5D4%qZjto1jHWCUYE6R? zb388hdgh~SprKWK@i$${tapc;4nU-_j+&E+0^Pt{qUdb_=6AHvzB z0^bPJ5`HpQj?Iao%30tyLfTR<;gN$-y7*|mWAK$Vkf&Eq+|9|^V{qn(_Ob5iymVij z!KKtpzc@5vZEdUQc+Tjt^}soVvq46}z}J`M`+c^~fSXP5ePiU-&F+_sc)i2T!H3$C zlCZJ<7kGy2a8{tQJ3#*62TC~`iS^@}S|PD+%bd$&rl}`#mxHrjsvnz-z=2^nw>qP` z!Pf8M0+@97vwluDN6zN5k$nEtedeb^%7C7OVvsfiUSYVl6uAJ&dyt4=HxRSpuU`hy zL+81srrjaA=RamM3oW==;=pQ07A z@;hz|LK{YUm2vPeODW%p-^`)ogSP7K!EJ3v*_LaX7${Uq6VANZ6Ot;uAjk(|)fk*qBZA8Vi zLK@b}D`0(wm$L=ZO0SN!bIF$sQim?-figktna#AQD^_oCzdMa6ba4jFyWuiv;HHgk z_LK@I&+ogjMUp6AruXyEwg+Xm>UK$mC_q2dG zuecN~napaWFlhXdCdtj(L~`N-?GZL!E&kXMI+p!IOY{?lg=M&z^+RN&d+FC_rserb zT)i$_DGg-D01xRfc^WnWcrm?w(~z3gu0s>#uiK!tmTz@4a?IAJ7L4_JP8&1I;k^m) zLZ-cK^QWAOg?`Xutv%W54ik*Ir9H}@F-}+9GAiBq*h0UQYEaFF?q{hdj)g6BEc5*KV{bMn+E5*aX~e98 zunN%%>r;GPK74qps)9xv|9Q|kzn@F`T?{1ZR{)p7U3OP3$Pq}B#)^L3<;l)zJXclR zpymoF`dlpTz#B7e`%!^)cnaO#xC4uL5PwmCAx_u`5!b@Jq_y^2Q%iUDthW4mwk|sN zjQ=!8Oc>}XrKC*3V>+yraY`1YM5UPVp+?1IyM}n|3cYKr=taZtvwv~cys^d7r!;Y| zkh!C)51L#D$#k9qY1x1SOhoJX6iIg-Y!lOG;ZBJ`Y-Qc#Oji5A|A8kAK!}R@e&J>tIc$fVEUByDm&~-NFp#Q z=yECw^qJnqB@)i-i=v{K7i`e_e^$IHVSKLej7lCsx9f$~qeTgWiC@oNAD^9Wa?G3P z;aj?byiJ4@&hP<)pLs_(gzpmvX&QNL2L72iSiv`8DjwUMpwLQqdi3X&ALD_wtfdP_ z_Q2`B!_93A8KLo&3l*+&M!%;OUuX%PHOaxfi~)4GK61Z1{=_wj+Ew1B%wcE-@|7X1 zVkBsEow$ujjdAo520|mef|#$^knL9?ahVa_Vb8=;8102t404GBXAcXRVGxf4T0!mo zlqHLNI&&COY+jnUNyyr{eA$hUFB#p`yc+};N_e9x#`?u#aJ^G#K^HAk6vA7fq7}V6 zehw^g`R8F>PTRyT-mg!JcnmjlN;34lT?gJr9K$Y`Su~&w9_MAOzhYQs` z$|dm0U4dfZ~!@f~E=w~h9SnUk%B%6>vH=kwdZ$jYi!z>3mQqo}{) zVfb)8rLdwrvS*7huqImZQ*%b5p^rkB91$tBJ7F@8XGX8MV8O{@rs>YAg9%gFD5ORv zCOl6Ztj;4}p_cdn6BL`@GqpggC@Q!PCG@eD!{DcHDdw<2SEz+kG(Fi!4WhtOxz~*q zqJJ}Mbo+F4k$|(ZmUXYrnxYwV{jh!$j^3~$&k*X_(0mU)Tv)}s(4%^BOXuF}FAd_Q zNSDmC3s|BwzL3H^Km5sseEguB&j-lMTJH;U$B5$9bz3Oe7nz{RkAw?;s?m|8+m>LO zNyt`YjHF5?MX$EQAR}`nX_QsOUCecU<<)rC<)8-R$Gc$0p1f<=hp8Fg31YquDoVIm zb^~j>b9dk-JqTV{(u_akOc>oJ%WNcNj#v2h9Ph$0a?h)H`#2#+9U3y_*hAfPSa~tZ zy#lL>VZZAd5?fRR6@8s8nzQe0t(ZE|B!-Hqw?u0BdGCu*SPF{`uiS2lmB&E_s?8!V zEL#dlu1^7NNQ4hkuV6}iG%c(m=o+3jBv^~MTarhW5sUmQ;2AXETCfp++q_V_OW zs(3Rh1)T*#ZT#||48rYfkhmU~<5J;ydFh_>Y(aZT^-EO0-(V|T&l4U6C018XV@?V) zX^^N67{Eo?H7;qWq13$HP0~tTRwlxd!X8siFILE6E3qwqDx$zkW{(P5ql*2)?%Tm% z5o^u3a&)l<5fmv-$9Pv~$Qmua5wcj$EsfSTP+dFcR&O>#05Xd|{1Rxz$l z%>FzkID$vEt>W zUHU@`R}Y}m)Ue`8OC+5*hJu~=?)8Hvq=Aj|;NlYtH%2#pWr4-j1HG1i1AfjjnknNi^Z zx~x6|3kmPU3WV)>S^&*8xITs|wD2WlJl%BZKKBe+qrqA@$p+Oh&U>kuVIqJ7*h z+Wb|);CZTZqz!x0E~__Wdt#XORsZ6h`)wzB|LOwZBpJnTc=^NH{>9FUx>FUXXv?ZgPQFCXw z#GE*L2+`p3g4pUsKYx_ag|UZzuGXMF3-aqp7{-EK?@_D?Lh2WH7E}mgiNNvgeOxWn z9veR6g&-WG1*#xU)-QY*Z|^VY!Rujdk#BBXH7v`xKb*R0NJ?dY`uR51S-q{1dl$j~ zP~_)5M5grDU8g1hDh#fkhX(e^ZF9WuZbIaw_cQ6X@hdytp^c=iO6G=f+g~SfgiTp6$g9E&YqO4J}6tvE0=Sy^12&1k0TXBairxIQivAt2zd)^Lr5csW0w zxz|z3wTfm8(lTwr-NAmVn>;|&E&M&SJy-3;A6F9C4Y9OPLI9jVOi~OeTJ#(&l`lTY z)J`n43W_@egH}tP+WOYFsPE+@;~uU2Asg{|x@zE*KM{d{u=WF3+KEcj0IA>Nu|2Vt zwFY)_E%+ZojcP5FeQPHdkPE~$ z7b7_v#AlCAd)LjBsJf^G+4N>tQrF?C9@y@Np~#ox`$$oXGNn9Ue0aMn_Txekx6TW^ z?Sd$)lw*0E^oEY5w&`RLd-%y3zR2aXD3vW#mJMSVAde|x>eVx?XQWjJ%L#_8X+zk|!{F2= zJ27mH*|QAMneI2#<5Htd7PagJ$Lig$Np{+r$Rt&Dn8wiChnD;rJ<8eavXrE7ElS(w z#Vr`?0ZMX6#v9oh?h z`t~GO#|#Co?bhpMU%rFunCMo7-=uVEkjM2KVf02L88t*=afh=NdbLRg?1W?fU{2}_ zVC#DSQf7S`h0X&51}_2f2sWO0zKh0gUN!*WQ4AqdOcVXS){ zd=SAL)BhHCizNYh9JKP`>Th}6=PhvO00Ix}kn3+9Tsalc28-H0@Bd4_hD!l(=vywZ zS&zR%E@uN5M5(>UQkA~fup4Id_4QSaFRNbXTergyT_}VWKdJV zhiuk7Sy;YiDyib*++S^p@fUw}#U zX|Z;U|9iOmP#{=OH@&HU-2om1aF#>As;0juam)&s`0E9%tk%Epl(z%2vW7Nw$`a9E zEe0P?;1z0CmeH?L#qrmpe0=t~1N5(i3<9BkGvSy1{MJaQsU|VI9es`+MG-toT-%Oq z+MfIWM4yokjD3_>C!mpxgsx8XfY6_r;#?RWY;x8}n1rmBTFn>b!g^U;tTA1UY4}$L zlw*L&4sJy+1ib}Mu2)~TRip%chb)iAh;dg<@x5|j`MixP)FPlp-81?pB${v-{qGwi zqHmD;mDp9IoP`$?L@acjf~YC(YTbW;J@HFfvBGgF!chU9r97utI{9D&4hrW3g!UNQG zI|f>+NZ&-z2xVe2_*NZLLPQBDU1BN%Cn-CA^1CC$##AB&3bRq8B-eW&1xqf&Q;GeW z*9GquK)LZF#QgLi@78Wh4C#0@fAx4VY6ela;`tD(q-jDhBt%H_7J}XJ@U{cdGXrI@ zT&JrWO^d@%RQU&_I5fxYs&8*N-VEgP;pp>Wiron9N6R~7>DtYy29y9fl@jpqLyE!q z2>wOc5it&`P^x;t<4b44>g-=CN)j#1{M@IQfYRIx=s2gSk^ z6xZH_8tVm^@iuMd1O4*}{7bszVr0ZO_c<|<^>wQ6lG{D|SqbBi!uf(!V4!m7*fMoH zshzeA_{7(#B7-)gku<0UBJHht{k^V&+Qq0_vcoIVUmkZ_3gfOKyd^V~DR6;9^Fg^l zUf3VfRdSVm6Gg_v&L&HMRpH5pjKFnr$Y@_)1B#6)C^n3!U*b zD2+7aJ+yxT zdIs}tCt*o9X*{3FzYut?@DEFhFFSOkyd!~uu9cmP3PZ2ePhJ20L#^|;K$LYTYw-6; zV*cb-8KrFzMH35-EkjoB>3Kag0-E1@Hu0EEtv_3B%nZV`mW~Zo;h=YsL3Q_X7GBXt z$ZkD6h?>gp*JjlBjq(I>yN$Td>e16AzcJ$M1?fc26l2U}S=g766*MblMA$*=R3SsNOq-K=Qa*B zLg_BO3TrRXU@*$@U>bfU50?3Jhsa-bT%4t`1&#DlL>&eYHY#c3euY9PFLx8zh)bk= zEgc@u%F54;lTQunhf>5@M^CwB1Z%4M6yatS=LThvBXl(P?#+7I;_t&O?cjf6g!=)O z%i0k*{oPr8gOiGM0Z~pBrYfoO;{s71whVg?i-GUvsTdu>u^UmEjHIC83}Fc!<;o5Dp#M}3?d6ZB{veq6ue!{e z+3In+lBpK`4hzFu>ysg!rr%iJbU}9i$d_dY0w6{FF1j1)sQjBw8O=xnd6=*%AM%xN<7b>a!{pl$VJIP@ z_cO%i(d&&kk9Oj!9o-^W#n|CeF`z2~#I$d5U#;#h|wh-7gX2!T-SXdL?PPld#- zCPC_=oNT zG;a9n3yOfBl(Nu>HkCj>9Fh#SFHO-PsNX#nn5&U+m+U9n+(uas9apHg`yNB+(?HTr zUuyS)kpB%4X7!p?J-!f$iB;&Wp$?tO({6-!*%K=c_8<(ufsj1!Pt)B(Tuq6eMh%A` z^D)C9S9iKaltml}_t1o45lU3Ny!SduecnEk??&y@!N4` zJe|d+D)b<_V~Vx3o}8%H`TNFOYQkB(+4@!L6#N`TvJHI2CO)6t5r(;`(_2{Pb~L9o z3i(l8wuyC<3^fr&M+hs&#S?}1k@uPhgyurq=k8Vy0sVq=kL?fiN4fABidf3zh zTs^}ySeW4RvzvfFq>VpbCxnI4vFw$?IsO03H`=QK@S2Lsg_>puQNi1r$A=SjMEK5E zgBQj``7ZS$O!mKUd6&pXR}+@WM*c*RQKTL%57LO}hnx1RjmU?U=yd}JrH2C5>W>|# z{SX;FR$^u#2<7{g)-`}XAHagtgI_?uOosJus|>ym0>Gzg_@qDm zO`5=w60iWldB>k!?QawhbTk3~&&bsisI4YnTNx#o|i0I$2ib_fxTS-M)9;?2DYybQ{iowsqjGS~w!6 z>((6)4AzpEG>F%5Tz1(>3_AV%zgUZr`gY#F{2EuWhzSMi#SkmNPwbvy^Lb_1aaM|awMS=5J83L7 zzZCv=000R?hL|fhMP)DT-%P0<6$Dn#wm*=khP4sJ|HEOlr=bs5+FlOkdc~92J?`xb z>@O$8y!!cbFKzcE_^jA&7VRYTm!E6RESrv)kbU;JJHyB)!f=off)_ZS?v>xbMDTch z05BNdNucWg;WzCuFxUXVf8c&RjuMlqDO|f8MNM_FI78g>Q-tU%d%Qf1HVUWUx)CQ& zobhs>{)L9RL`-j7Pe3=>CtUI*k*vScs@O;aP-^W~fIhOVEPs9A`#*-RP)O8tW^LO6 z!B(f~M7`i)NbP!DF1Iu46xv8u6=U<-Ue7m_CkXpppZQ~QC<*=T8 z`fnkqTP`R1>Z37A+1XwmFG?~{91^4;t+nEDXO3RA_SMx@_kElP-M0`cuPh_A(B+Zp zA9=j;yJA(YFKVKQ6@nuLFN00g5#h6T}aK76w$XV8j!FnIg^7>sUs1 zz8n|2g0nhlz8FciY%I937*Y(SQU7=a{3@I;n8Zk8OPjCvf~v6L6~o3FG!eTSo>iBC6@I&f4UXFoDc zahV=&I2zG<-qK2A&61>}R;knvXnoIJ$5!Eh)GiA9`Tw!od0)Yxf1%%-$5aX`S$S&; zXdyHP5O(Dou9v&xBIS{iK<>?QR%N%l(Xf`3&Ka)(=)@Gmrae zkGb_M6z^$QNcma4*_qa~>3+Yctm(Kht^PP>aLdZ`A{pO*U=8gOuI&^KP1AV|kUDCM zN*hg0=PRd*KWZ2MRN));|3|(+h4$$=QY_g}t1B$BIlO-R-Dio==dGx=;CrRnoD)iGn%>>cn=X$Dbb)aYQhtrgQfT^Za zA2e{_Ai>r8DL!k2q|5@T`dpcg0C5!#>0tlug8s0;!?%OvtB%{7S}62M9xbnuTv-q@+tZX1+4e)K3fsI>$=ru^w_3P zzbk6HFXFz>;->pJjoa~S8WD^ER!<1+s{LF%Z?e2c2c3DKdSI@}=>!jWOxL^>k@q9s z^J&0Bt;NGb+l!m>JR-*_n4_!2kDZk)M2}rQYHYeD%{ei>Wp$jcq~%TD^sU3xO~#z)hK(*W-GRj2V@ckyZ> zy1hv3(LWW`NW81IXFvIs*)F}KcrSSvL!gNlGNI@|fTwuybhDyG;@xg`x*98E zv8(@X#`9>V+-k8wC-5sA-d&&2zN3l^y`!Z}zm(@W@>E$}aO!f~U2@yE%{s5gUaubu zRe_xshmoy~I%l5n@Dl1#2n(@P;;3F)^5OpptbH9)^{C#|RBsaHqkkf>n77G&FknU? ztH%1^95-;B7Qxw(>yq8~<>79=6_ z6NCI)>Z^>QrT{my#a4vkIrws4_TqZJDS|C1Fmnp!DB-o^SZke z55bSO5ftR1Di5Kgk3PH0HKcus#_$Xr1D(!NLus6D+U8EPkllvS-n7~!ck1lPsuh&~ z0d78iBHEX;vp|Kby>~iqsPf4H$x(WV_mBmQGDUVwg70+>78jw7Gtfm>y!+9{I2UzP zND2E&o+H{r4(q2~dX}A5JnGqPwgzHZmS87k^1Po!_yz7mS8n0W#rE@*D_rt%4*A`` z^l90T3p8?E3s>O5-QBdkJT=0daX(FR6CnjwjVi9xI+#4&U;S*)HtJ;aYdZhn*zq2w zj%8U^Vs`(>cs}>tY%kZlyB_3hW3=!PMs43uc}5kag+{tv7@`Dg>+mVbPopo7Qs!&Z zx=Xi%!`X$T9Q*n(=t>(j zEbE_UE1_X7-}+$Cs)s6Ke{FB=Tqb4uoa~9!3F`-6>vm%qiY6{WIGgXFSR{$ui(u7q zHSP7h<27wtZgAfTN6nzw^D}hd<-zNvOcOTJx$PcQhUsK>pou8{4V%2XL!J_kJx8t9 zJdhdtLa2rXwkY23bBn{lbmQ~BR3-^o2kn$1iCZX}Hoc3!KL6l1zn4?Q7wI}g^G|}- z#EVX@K=s&v=%4lPzAkR4QJmR&eu6G?=nhg+EM}Q6gbU~Pyqlf<1uANr5q;It{C3is zHu>;eOJl*#ffuQhs*0V=y9hv>d9M_pmtkUOp~h-&JeSEg8oX%ui|oIM%$pS)o7B{- zz7~bXN;fy%QMX>V7v;scY2>m=`-zZ-1k&)gwZ$O}f*(_jDr5i4^~!UyWxXkD1hYzo zZ?k1RFRofrs6TnFPN*sV_sBp{p-89w;|h?FgXC(h`-ynz!sBKtrXQi8PIZR1>vv7{ zh+MbYnPY5VmNjhk;CctjWf(g?62*!Vhf!iwKAk%JUS-7Q`<_n+T5Wr|$(N$rH+bc$ zR!#8R>KB)taA3n2(wZNxH*CnyMhkyEB%yD02bNeQ>9iVB!!;O>x--Y7Y>~$M(nrNO_j$`Wn&gDY6!{Izs>o-gDeNCB% zL>Y4YZ_m{?c*b*&w{P6h9-ch;$)@Q(j zXu+Yp@(3S4VeOaaCu>gnJ(qaXlHr|+J>9RoXmXP?R{r zRs>S}Ce@g^Tf~-(T%mgEj@pha}nww_9?{Qq3IC4;!|4dbuLR>P-Gqm*EoIgHU?8&jQ8CAx;H=1ycGl zpoD7))WemBmIa%`oPsxzimzumH%A)RUB8f;0^OEkImY8@*HN8XjpGZjw1&0rbMjR8 z+TDRfa1m+Onok!|wLBeJF_#sN1l}llZYecJS#f*|I+g6fENy#sy@0*Sn^Ct5l{rr| zN~87sAys*%7eMv(gP9vn*!cG0{*OQz9Tvhb&(;B)IkSf!+JnlLrP+?sQw8T>|1h5+ zdWgH`rVCsE2O9(+t}Hfi5WUjV@9pM?^#l`9tifzeBkoNo3ZVXjkAYW;`y(*`0A5rF zf7(M)QU~ABeBw=NG;UvcSkBhwj@R;ZAp!BK39lohDR#0re$VFaZngZx9O)4EAwd-F z$GkN911J?GXUhCT{q1@nTD98^Wz{(0!haP77oiODE|eA_YyRa7t`k9U_UbVhU|NI7 zjT$=_OWgIgRL#LSSDc}wvdU?pn8(&}F*&)SakrJCJH}PAtkrzg(U_RKZLW$c=68zv z)khVj4nA2D^*pM4DYxH?T+>l^$Zqzt=|M>@y+lo8yfy#%M|AwqZj)SkZ+m~~@>45O z+)?3YCorFJ#57p`2ZwL+t5zB-e|)6YWL7_*P*{xAI}8?)Z)v1TCYLrcl!8)f!zw*> zpYCGOyK!UtS}cbew?AEi#pn^$j}I_3560J$dlg$e5?{ToXjHqb1vKr-aKDuwZ=XKz zL)X+$fG?hvroS(2dT6g?lhUd{a8eKO7hF7UUedf@zCnfZZR31$L^Fi1=h3v|Hg}t~ z&^gyB+X-raWKcf{bDcM@9l68w({ejv)az&rPvtk92ugl2(khT=QyNvTq?L?qXa=RN zv)C^nk1I|#lyI$Ct23*wQL49?Wz5!odTaO-?7dPpMqvsl3JiL+(#96w2ZP9)CyHLR z?Jarl6e}IwRQ~KQ0juUSK}z+(ryH;JEIbeC;N@_&Gp?t-O>1 z2Hj!$C~0w?jAaI^jXNLJ5dAbYCa}KDx7A|)xLb}ADL@=+T#~|ZHxF=!-*v55mL((B zEVV8d>6nbnCva)C`@|Q2II#BT5ABXcwXv#85|ntXbh=e7wH6DG8xyB;Qcq+jDQeKZ zJ|60QklT4{s!kU*4NJ;5_o1B*nwk!d#pAyj@%S9LdK@|(N~Z2el4qJ}(gDUX(Tmpx z@uKRW)kJ*!{9t8Dl(Cfzr;X#3R3Z=hP<@`uyf*MFX|2)iX1=0=WQ1J`u@{t!lJEL- z6&=Sk#NM6v2d&eLLa)Zauk-JwcIL1vv-AkvGYnUu4If7UEDlK5vpboEs!lJFNF|;xV@X0@p3M z|HDHKjk&(`({+H%1)|q2ALYx!%CjR0h)N%${`(E!J&FwPyT7W5uXU^A#IAK~fxE-0 zJozURRT8UxkryXzcy6)7lTx0xRJaMy#@)DHkG9eDbV6aXQ19M}lkK^X$E%;ChsBvS ze*X1a#mECWG7<1m9d4)UQ@s`}OSAxjGGT(3JdcT(!s_Sqyg(23%E-yjm zwH`HJu2ih7L%P9)9^TU?rgKqMcJdqUO++nu=JZ7=(xGYCuV2k*p_#bk zxR0u-RPY{NN7^=^Htqp+{echfe!gL60G?TWR#apg0I!#%uao@h1lvh-E5(JpjTFKi zN8qgW5Yc*=p9!Hmah+)j!F#2qhgW+6+B%U0TyhJ-moMkkFB7#(H$d%qc^rKy4(0^P z#JF*_1t~N(r|T!IS__BW>_V<6fNb0)Pv%j(KV1c@?ln7~RTPzZuC+dtS`yKa+|j68 zHCIgAa#1^XrM7Ic%EX2K109CQA!yVoYtk0@>Q4hEJ(AAvVw)UG?AhFuIg0>Yr5x5r z^}S6|{^S=<>#qIz8ndf`SJIHBGF$jDg@38aDE#@8nxg}I>8a=4EBe*|gVJ?6Szan+ zqQrwzY&(xFOBh_;d*#bmWovg@F9%)@&J70^bxZzVD(e<)2d|G7ye_lmEBb4Phu`?{ zu6gWTlBhm)O;*qjULm!1V42-CkS!fINmZ&0l8X5KS-Kf*(Cv99&d$6BRIoQQ2U)d* zW5E7t*a&(K;LQxcbrpsJYW5kL*>rJ1aH)L29?*0i4Pjn<1gNjg7oo+zX21=|`7)XQ z%n%|N4L=(0aj2L2o5BAKM$D}@3`cbs_yQNu)vsSE1FkpX3qZM0EB807vtI>yPdra< zJV%?M(?=-5hjRC-cYSiKB`^_07f2w8MoJvbQ~%TvBp&E5#Wu-KR(lh0FV|MwW>k8^ zc?p_V5=xH=okEt4suvP6Ve)_O#IRvqR97CH(9aUcU5t zJqmFmFO&d|9-}WRi+zXIESHbK^08cMbdWgd24sdjulu}L2#ViqK3CaOxNm#h_Tsrg z$wj04vMKeP4N|B$b$=c*;;I@i_k!{ZO&+pH&T3Ej;2jFvr69Fcc^Y)W>hjSXXefC@ zqGEy<2c~nltl5ovHIHCeegyL^8-BE~c&DzBL!twCyKi3lqZg z_>{-a^)NTOuM=?!eO%-9a^n?FUjcA|aUvTp7|F!WmHLlFBlVq~4BID*4R!!u1B3{V z=er#(k1GZFpJU%Dln;*p{I|n(!z#RH)GzA19_paA z)ZrcQzrw*`ljqK8IN@9eTouMqs6g=D00vc4`2<$QFRPBhI>n?`YczZ}YDYvec&|(| zAsmt{HaN}exmNK-)4c*UxlXFd(q67Bmt0qb=2hIK2O?5HKY^($e?{ZH1@cdr3d*sw zIsD&Ef{r9ozH~DtJ6L@}s7s=p-)Mf-s65drpd;@r!ws<0q*2lzM= zaDUtZCJwN}3ohAKI|W*0?XvxytU5~+pll(bHo`fY|~#`A2xO%yqwg zY^VFhwP!wCzP27rttq`^WMt$6HeC~rCv>Y7TOPDzpXz}+Rj!H*O<3VtbGV{42{-<5 zb5xZS=8-kJJ)B%aiTv0X!cX9l#f*}A0=Zqa7UUElW}pq-u?KA4mL4S-Hx!*{Os!+! za0`#Fn5D3A&)%U>t=7KdUx~ywrIzc{y85kqjkT_BM9I zo=vC8^Tch1KBz;}!c`F!=Lg+Im#Ki4Sk6QyUqE#j;4eO50tcOhm(zrhPCK(Bm@7T% zC7HZRuu)t+p3fKQqxN>iUf@VyQG@Fz#1HE4(4HZFbWz>>0E-)3yD3u0Q9k&{=}GZn!#r$-8?liQN~1kAe4ZHg{S2#dnnV;29K^+(g7U$++VXjvIB}KLO6w~d zFZoWdLywGAwP%m1*}AP*q)75i-@`>#=mESk9D8l#ksqKk0z%y5j@L6uO2paauV7*- z%Z`(T9t%x9cxVEJub;R8$kMjMdE(=;sLX1c(f+XV;<@Whyd4Ufug<$*L=Mo7jqpUd`5YKnyk|P6M0e6D2MXDr2M6bPg&w}wRP`0G= zkt2~8ut%JT!gE_dkjzVf{H!JvhTWkvmHvF0Hw*3O7uc2aa}#n$2N^z@>$_jz4~0=c zphZ&i%SU!3Zteugf#+MUQbAwT&tZ{CiW3aTSMK#mfKr)`!(t6KbN6`L%iVMQ;r-#O z96M_ufLs!C+6b$o!EEt8(M}TVp{QKrYE`^kKT1}(`SZjDIqXs15Ump#qDR~Rwf zNdZ$&_>eE6h13&9eyw5BG;-$#IGX?SfpQN~u)0IR9#13;U3+77oU%yk|qIYFr0FTdK-Ysa}iP zp;f1GX5*~v!c6^;w!+aN4-<`$iqVvAR*BcyfBbmWR;ki2^5Ait0&DnNMMkB7J3N2d?jM+f)x zETf|*7?SVIn|s}OJxh_qwmS;o^NSa$Qt)FBj+@U&4#)y5`agESA(%zS1q&rDkD7*% zvuiFpkppf}Y0r50nsKT4v{EuAfFKfpMv*B*KsbPu6c6A5X?-6}%RFZVt+eGjR3P}f z6!((oYI{yRmn%pC&+D)P=oGzNN-)=07?_X$awt(_t+d}mD{C6b4r~;fwecG*_oRRt z>AFq!Nn(Q%7Wro$ks&PLm7*)3ZlRM)8`pwxlvohE{1N~;IKvD0O#RV{Q(XpQ+wSkc zA}yC3l<#&@>A36%6v#fhj0~-T-ij9tY9={pB3#?@;-x)25-g*Gfzv?lv?QA*gf<~NWdp)!pPdnj8xgHJ`2@G!GU`o?vqRl6#M99DHUfhiUZH!z- z`?SCfrh{n-@q$_VE%$j=PSGgux38b^Ulq#~g-hbm$RJ(}^G#Lu=Tjfk`=0vR!uK&j zzMZaXt=UO`?q_A2FU-ZGR=pfpT@8G>4F?j#Z9l@=M#JWMhGkV9MVjVujisljl)|+n zn$e{=!g;fpY}MMJK$N-zD8#+i)$}ag1k4)jBtjxc9=#9zh6xZ@|DEd1rwv|mWA;q^qKy;CFE=Jaq>^m5Q<>Gpi5 zNRaOO^00Px87PEjK{TdA4j4#{75J3yX6?H*SZ2*YNX!~dzl3y<$}xQe6y<*2180Kv zFA$XzNZL3+BU#r+D(+ZmQQkZ8&w?+HfN8hJ!7K!C&n+s@X^mj5I3XN^dR>3@Xgsk7_FINu>@c+Q&Pn{%Jqki-LKIC&*q^JDBCYB?U~h z+KdqPD7EugDr5JLo{W^nkp3eZ`Unt=7$;HzJ5x?=MRXO0Lkx@%cqO?7u?D0_yFc1V zLd_&oKV`g!u@})MwzF?OYd>UVG;kwZpk1&6nSllvO1S5qE?lR$E1peQUI+~ zngifXM%ZkbSS%pWTJMI#gE>6AQMMVEye1_kKPodY`4i{P4Y2z9ZnI5s`mxK^fg?86 zg*8q5N8@;%F&aGRcG>PLTh-27a`>6MQ`xzIw}P*>X1BGAUrp&AryVZi;l!z@t_n0d zPdyF9NboAk0*3!3BWJy6-eA-R9&79EdN zkJ*&J{Iftf39L82!yRi?QrQgqJ%(8L9F!PNrQoeyXC0M68~8Knr3T3yMjturG0+ugiz z-=CbSJ9BM*pOH2lvRFi0+uVF8DG9*r%6*X9P0OxkLSVJ#Yu{4Ccx3))M?T2y9=7pX zntI;wvm3o=EQo-8{5}M%tZT0dP0#@i8oU_RYH?OZ%cZC0i}x)HI${G$AAC*cg+db_{9@>89$^Kgqqn zF#G@LddsM|vS@1*NFhaVw?J@rhv4q+?(VL^o!}0^-Q5Wm65KUda1Rii0Pm!`)3@)t zHDmvCz~41Qc$2 z2EvhT8i1(1VU*#xilA6BPX!8})Bc_NNjgRM-;98~*~DaY%tu89gv||WQ2-qA#$qD7 zZ}vA2ptrbHmdp?lxNFTrK6zMVQ$Y~MgzL^S(H8Tzcs1s(cYoj>2mtzLbB}ZwLi|IRWwFlLwx`8K!>rtB#E{3dhf23jfFJ$-)_OYbvy1 zg#UIi2EVc3X`Etw*gS!bnlbA@vbTNGIR#n&j|!7bK|_+lj!`Hx7*ridZW+T7uyjL8h4fi+a0W0k!EyNU*c>+$xB692PQOS#^* zZSNgP8*YALApQQ0jmPpKRt*Qqf(<*Kb|@*yVj63$xcSNc-WViW8KolK!u~mc0}3$v z?SD=x^Nq=-vIW^_Vut4k5Pu_#etYx0S5QM%$?N@ZAPV8YnnAxu>U6=-2tnzgZW=At zI4Y(8v-=oOOg)>vdIDY2AO|cz~7ONB6(dp0kDdPWI!&Lrt-NxycKf)`I;$AV4Hg1 z3z)!~5fHyZ4ry%jZ-vzU4nqEZHID(-f-5o_)k(SO$faJTnV+K5;c9h@SqB0)5uc~F zc8L+0?B~As$G76A2s{IG*XAiY8Cy_g|a)=C7EMR!%lH zuE4CAft3LJ6G(Qx5Xq#{5d(5W`i*H~KDHE*`|m0w4aLEm59&`a8aVZ?2AN7LRoNhK zF+f9WB^wC;=-MB_02Nf?-m-qypp=w<^!1Nr0xtAb6tExt1XI5Mbxwbs7OOB|)nW|I zivRCh-6R0FL4!;S`HxNb_Z$DC{QtV1|FIi@4r59!{0DCH&sQ^%{&pE~>gv+}5ysyl z1A;M1P0XZ`qAg%w`wtBbp)dsyK4Ie-sea_l{8;fh*_fOYAKj8lA*c6e> zknDUMDA>UY+y$ooT)*vE{O%Z!3m@B|v)N#Nhl*64NqCbIBgGwPq*KKkfz+x#_0;5(0*E8UV0)W-8oE0Na@YV~@t zOxld&Dc=^f0V%>FNS{xfw=sqS*!}VpvL#|M3etJSt?MA7@)qWGyRK9{KxVD z+7MX&8~3H=O!t2nKx9m`zXbqq&cVV8ViCf{DSL|Ev9U8n-*Smwqu|H_=o?Z}#SdhA zgxD=I=H^$2bqj40FxiR6iX9o63?%O1gRjRwj>~?`nDBB!jG*MG4x1%*S~2UTAo3Pm z1riM#2RrSQ2B!aLORq0+@22d}T{(s&e}s|L>^s{nM;g#!0LVS+Cst(WXZ-|OWx_VS zOS9n+vSh6HVhu*^HarxC&|GQCTXE5h2+>y3vBY0k1bPRMn2$8&WMy+IB zeC0VQ|5cVNKET}FJkgrMX4^Zp&tcJWFc0v4u9tVV0z;FdB$|pZAdarLt*kfkrlb$7 z@Apar0JJ-D=G=nWv|_TYp@LwARV1e{vb}^;;$dr_UBBm{vcJexno)w>*npIc0oq~Bx;VoHU6 zj;(de>LvV8RE(lwa8d8uJd}MIc}-4^jztMF6jWDXCKak>iDcgBeRmUT{qyP0B=RnW zsLEOIM!LZ06DK~l!+fjOOS__FwH*Tk?_yaz7woawSX#cFoE7}{AU63lN0w#oA^ z6O@>CuITn-J#`1&RD;P$EoYD)yvg?z%Et^iuPWrTJ1^GAF>AF?{a8pZ@^+Y+??EcLh%Y{pNT&(A-L_psM-`G} zZ?(dUw!*-zL>La`CSu%$Z+eHz^%4=Jn5ff3UJ3mYH6{t{AqKR)$4gDMi5{}*4UAEV zy!?GZgp6*>!>^Lu?@GRpNLtN8?F8GM1RRcE8482bJ1V!hIA3SclB14UNHtT$s!l}Q znwM9lYD#N0l6!WGf7VI%70>oWkoxpaG1-k7++M63?DqcN zO+k-oSF0ARn$wMhMV6YNDEQ0%x?U6G7B8}t>;Ro{tU6-wi6ybReWSNZjv!f0sk!6u zqKrq$YxgVYr>(en8bFKkuYIUSiIKt!*XX7fM&Bj(5nb)6kg6&!Kh(wgp)EzBS~$fW zW436-9j_dHENSPqfY15tR8&?M^2$1a5K~-ZG$FZ2M$U}VZ0iepOIw01l2hmWLR0EF ze9_yCEixnjzwK2zD%5<67GIwSjMU>s%Q*EH+)`6`v>WPu((!Jpk6Ee{W8)Um>#{=O zOHj&V(!DB(Uo@|;sEw)BZ=akh7*vPe7_3UfY zDsGe{V>P?AMIK9RA3=EPi##xA31bQkbhDj~%^We|r=liU?c&Sd@EA-n94v-l!b zw5inc$_Dw;uVomO96H36*@i|0N5f%T1L#CKRGP__x>38@C8Lbr^Mt~|TGW_xRWavc z-kF&bfgz^KFeZfjWAeK_mE}|F7Qy1^?~(2YIqh9Y7WY+98isH?tm+ps?BftVNZaGF zH`Qb*{C#7Gz)R!=L7jElE~>0bV&l5!G} zbvyrDCNqn37F$nCh|p2a8YIH6RlYSJsdWEd7E|QKYPPwUsVs&@+;=a_NH-rT%#w+n z?a~*cPF1Etxm=Cb3xfYr9E0&1&29KY{0e)udMQ?#(WlU@L@(tTt|P6~&Mx}kpV+gB zxTx)GG78YfusUw@T1KdpqiIkE`?7wZtNh|d;Uks69R?2UYgYzP@m6h4yXPGHm zeaQLv*J#7bt4CJlEau%?$;h(b{Lrs_3Gb{maAt_qnC~6Bqj*26k8|8BwYHkn;H?;D zpJ0)jvdZNv`U;na1RP*a0#GafQ}Y+aLI+Co^Wc5#o2_L!tL+4hS?>|EX0~I+#&J<) z2IXZbZa4$mI-Plf+z+$@w0C3!V2R>s@IBhtqL9);IdOU1*IEj$5qkz1*CgE@sW6R- zQ)vROi~MPGa2V|ohc%a2&Ufh840hVkL{-IMztjJ#To_JZS9cJ{8zPDiTKIZ3yMwvToo9ZLA1F`orxGC5V=a`!NtVCv}Ghw^D9 z#&Z-4H&FFe-Bl93QrdlRearX=zg4b|_CeCN*3{7-)^3N)M^&$>r0Uo@NF@&c#^O}v z>BVf8g@=rPQj}kQA%-qVwW39|GCO6pVP12Bq7ZiQsj9@5`vVbKq{dPY3KE1@#C2KW!OA6hzjn$=z?(21J*=S$= zbH?U;J%^?+HtJ4M7G-ZGcRO?<1`oNfnZ^0*nn^LNAOTJxVeW^i9m>MtiN zYfKoXXv}hVjVM~o3S`mDsWy0I9H@`ioayCM{f+v}`yfc1FTBMK)nJqR?!M?56WWjGGX z89gorK)>E76I3aW97xtXoobN^w${*6u19Lfss?`gkS}kV0*G@C|gA~NdYZSy-e&fnY^f~<(FaXHJ)ja#~^y~a;*!3``zGnQC z#e((elHQ;K=+{6@6ogn-K_a}gA|p#pptBU-n~lYJc=uY?U$x2-rq>+1UhD}7?pEPE zi$>+&@Sw%F_n6FYr6?@DW{9&pDz4%bD+bWjtE~cG1S{W zKW&$|AOv%9jb>bWiO26sHm!a8GLxLoMnmmZf$kYxLA=_kZ{{#;C@1#Gn9_iRpC|2V z*?kgh_~K$CwFUQa=d8mXL>@LE%WqOw4u6W|uR6CvZ!9c3mJ^W~|Iu>gG==yM&w|o< zL~FNBRVHE=M>*|GmMp7R3$A?6&DJwW0JRGeE6;iNje^RomYP{z6e?Y_~j9!S%5mTKi=EG2Ky2y;-nKt>tdNY~{L7E!A7uFOwg_>mmYh)35$6VauNtVGajdSb5I19UY6D#zt7w%2ApJ0R3FmYm?f!=WgUqww6xFeYpN&cBDZ4}im zs;)9V7`dnjr+i2(#4Q{TjYgYqH4GmeLBkaLk}HSxxy~o&l*^oz$_m8OQn^7M+7zXS zs!5A{qFU;atuN$C#hQaZU$KWk+67x+95O#HqG|HsByAV}v%%Awoif7w;X$%jy6$ef z3ZVRam~Xl@S(D|_5%1JyTf+|*bR7I%9ye;$JeSSVuItRz#h~y44VgT*2EnI4%tq<% zSW_4)et>@s=l&WlS1E0>*9BT0e1{58?J*S>3u?-p$rBf7n9V~`qu&VXB3Ih)*{q6l zI8~BJa!i+1nT&qDHfgEWH^UaPE%g)`t{8@Y`&N-v43*ha(QZI~XOmMnACV;Fj0jhq z6WY_JSyIbGwT9u4q?4DjwxY!PVuibgl`AbJn2l?3W@k~I`WRXJIbXlqskDRU4(JQ+ zo0yo~nM{+|J2i`bfo)U(a`qEei)ebVt}XyNYV`>)M7CyhX|3AOW9+yXMgr&usnpcr zDRzf((X*uaA)YJ^oEipM7p&^aN@TdAT!K!-+ON(&2SR75rGRLyy)eIhYN;zKXlNuE z3oR3#DALC*BfHUfX$&S)e_5EfJe zOwB?n;=!!%oR==R-mHKr@3@aREzp^S>Ae_QBtc4?jCbw5{F$NwduMCZ)aTZ&}#Q2tHb*iHx2dDCO48g9a zZjCqlB4MQ@mL2*BMnn!S?pn=2a8@LUP68O=v8xZp%w+kC{HVg9M;~D)lCBxX(@<{R z!U1k0t6d4&w)Lav-%zLmQun0?DVYwdW=I_wcT6e)O#`9mNe(cKDQ~~brZv~1 zDB{}D$)Fa+%MQi6M0)w1r7^dfS)>c%FZHH6S0H}vj7a{<%eE*(62Noe()nK%Ayo#q zoprR_CH!5PYX#||y^ea{$ujJCST;Z*7J?BKZPzS%Y!08~4qKcZFDNEl=jLQeK_U@O zrrlh-(s|FO{%s*kDi;!elw-6n@Y@N}a4H#^YHNit5gKz6YoWEYT_#F^2L|Vk8U<2c zSY3&4ZAU*J9u}-UdW6(EvBQH0w!<26Lb_;I6nG15`|U+Vmox83KPnh9{K@%P z9&?=}D5A$qYN#m2e^+N-W1J`%o`gQm<%7h|*LYj}R+IXvI=@oUBpMP78dK!n;205Nd0#OLC^!_+t$5D?1Ys}?N+EDrZ2d3uRDaI-fHK^5xG?- zrLLFeZKAm5(Se8`JYij5krjvCvJiVjvOpHZTaaE#+FWNX>8OAxwQ}r5lUhF_g z7d-%-$`M1m_*Z(}ijI|jnOV)7)~+)poPMG9&cwc zvTm)DHg2re1>#gSL8kka%(R-_t(zQrn_`qC{BE?t@fuVq@rv6EQ|Q)GEJq@JWx|!W zT)P-;?RfLtB#{9fy)5j&E{VZogn^Lbp?)`brZQwxyigf=tJ#V4+-^Vdq0rr&n#13l8 zee^q=Z#tIheNwqPpKt4mjG}Ms`rr9zDCzlE5)Gbyt5Ljw{!J);y~RGcFaKer#0J`e z*;uoiO%D7Ct<$D9BOW%b&H>d&UfBQQJ065?O)&Scv^s6m7d0QvgHZaMGWT{ySB_x* zd)zIZ2U4ioA8oBag_Hj&ZwIcUI#LN(oP<&gz47?zICU+v;`o(3>th03Q1`LOW2Pew zWsAXrsAY4Dg%~&*1#q5X`k?v$l-_VD^G@3yh!V>r^!E2`n~Te+gF3%NA4QbBnAbaK zG4a+M_=u4ZN-fQWy2BlYEPXx8DsGT2Ky@O1$KRFH8inZJzSQ&af|G+4(a>t0RI2n6 z>@hC;x+(=2>;deVPhtfApZYpbVh85VyVq#w>j(!q|G@%Ca`9S!q+wc8{~DFGTi3q& z%g0)!wZTW`NR;+Q%cj)SDVa0x1Ehpg)`Yp+54}&?)ykcS#3@DdJTF6MzGtW&y8A!B znQi>95;X9S0O;oChvRpiktb09Dee7L3m&uh-QDX9wL*!>; zfJm#QP-$UC_V%B45RCET%_+97BV>NOpab}yHjq#bP+(uKH<56lwE$2KX1}|xz(|gN zyBYtAE*%lrh($2VA4K@SRdt|O%L3G&Rm8sp{oew4qtNe;prSec|9wjZnDOs6a-bRi zPov|14%LtYj;Z4O_ouqa11zI{&w=#Hp;Y+zZa>>a)>K?V0u~ll4m7gcux8JR+FR&v zRcvSBGn3X*4S|i9x#wf`4L(zlVDaY!@8qwwzbVmwD(^(Uu_h9X(G?4DtGDHk{+HXi z>676#%E zhvhE;Utt5Af4c4|t0sY{bQ>-%g=JLXjA56n-#0?%^4YFUAOr{qW5D>jGSw(Usxv*h#Zzj3~KeePb>r;2P{5X z&5s1iF#26pR03UQ?3&J+(=cCJ=zmG^zXf6__q*+NuV1Ey4`q?V+p;5k62YPHN@fVg zv{!WV8nv0oh9qy^XIhN1rspAL&n_a{=j^b)zEKtwCAMp2O9CJgmhRZgiw4vUn=@5| zdb*w;+H1R<-6|(mPQ~WaG_+@?;QOuF6S$9KIVX7iRz&kOug5u14~j6^rYo@j8lw;h zEwc%rySQ8r;-zjCo?cFJkabnnR_@}6$3WB~`65T7L^6GutOfP9?S`ivPj-f${FcW|VD=!&18AGtk#NIT$C41Hh4VXt_+ za}x0u_RT3Rq$6KOcj}S zk7ItEYDeF>e6|Cn!aA?XP?&PCB(-rg_HAQw?$C!P8`q|Us7-P*!`Gk~FXejZCq>x* zobKPNhCp+4gtBbo@r3O?BK_~-|Mj+Q4g_9J%ertG(ccpJ^BG{5~-M}GThQ*d-5xX#G zNJpmQ!^2@!vmHr-uZYp&&Gpg&B72=pBGdpF6ki2#-pLZ`uFK_MueEtZVzzIgN!Tw8 zJ+SnUDcd^4t)3laY1?oxT3 zq032srJ>AT&14bBBa>NRby89~8F+;CK}R_q;s{5khO)wb$We!|^xw3$2I@b4Ub9N#qF zwr@Ra;k844-m_2y5x+P&qzsolT_yFU_)koHy17|@IxK|{5$@=&63GOKQqj~oBAz&e=ty@{$#h`miei{*|R1wQ-J6LuLj%kYsM}Dti3k5820~l z4uVjbAn~i48xmBq0FZ5EGik`dGiMlXN*^9MyE(O+eDKYxcwIOKbZD)5_R`V72>g)Q zPH%koG)d45DIHHUdHh$oZrE_5|1m3h$RH9@Qn@{AD=Wy`XYs_f9zT3vGyDs-Z`0F| z>y~S4Ylz^(qoar2G=r03jJh2?Jw5u0(QghCJ)+YpF}wiQW{JiHI`5UUfFz7xSdg$wEuutC>I2T`(@0`Awbfi=71&^$#ZDCLi#Gb9e& zG<1Wh(L<m*hf79te7`) zyea~agT0YewLK%=q2y?HbBnZF?YhA$nH!-zJpLVCKPz`kjeW;fm!*0hqQrGh4 zNcyM5#D3lXEGiii#Ty@0;Fkj2ZNhUjBb=um2P-g*!0U)Q*lO}4-y<-5`{ki>aXc&O zdoK(&Hg>QSJHuFp2%+~^(SA}1xYa~+nDe(!t1c8X>uo}Lm03ia1csi?4)1zNzZSUJ zNJ)WsiV`67JT~}A7u^{9tX@;Z?bJ2B?n8$maKcAE*F07ykT-fq>^AR-ZG1WKK6|!w zSxl;U#@@_`kdTm6Lixqw{Vnh1`M}AcT0NEwrR}gN3-DG)2)G)Z1|5)FYECKYn}8caMek<4w8vH{7h3&8HEyASe}}#tJ`D8xIZkV5WC^ONrz@Ja}jROd#V zz`gkF2!ECnACQCQ=+Ub3S;>yf@E8QZz^<``p9o?Fg@X#`agP7kZo@m2y4usKfpFf1 zK}fim0B!@=G79x+emH>7hrQ8DXvu@KylqXC!mJ~+L;{zkl!1aL9U|Q5f+yeh{^i-i z$OHuj@IHTsmx6H+beJ&jhYsT{Yebz0=F4c$(_uEh=wBwwJ9%xDf{pfN03jE?cdmF$^nYPLO1vQI?E*A(*hjy z+dv~>04HTM6R(NLOAbLx*z4Csvk9ZYYbT+K|Hqq(^kz5ynsWcV9t3&-qLxK|!2vqS zUTe@MLBzLK%$cq(xVF(Re^w0vUSfoNLU+e$^t8lVWVT8RpW@R~% z@|wf^vnsSHO2Y|dBQ=IPy$vr31DGKopy>rRqk%<*6AFR}6!hTqdbD%ncIXmnTNMI$ zE`J$IQZ9S)~>SIBg~=;B!e4Ops_eC{WzoMeSO|xFtFo)p+sBid6p| zp5Zpg5L(^6CH6Sth&V8x63B$7S|P1B13;!Mmy4e&j=POs$+z+tGWOE^9jNx6lcC^R z33V$gM`*@;3#EG-CiNz@S2DbsTb%%mKpgmnH{xi=g+I4Smpi1xaQa_e>qS9RwNNeG}v?Zws5u|}Nx%nax18b6*juLj(Ti0EjOIRcx=e-x${5-z<-EI6o4 zOI?%LmZkXBR6h*~qzf2OiuNL=wYMm62nailT(Uywl=1{2XyHW>FZV>w$EgLT>Gpny z#l+K4uf_g#=`bMyQZ7v$naQb%*aiggGmnf{0PX!Skn|A_mm7^f^m$tptWFxRHf>F% zr1%1sEfsm$U1|t%=poMaEl@{Ufcx%RN#BZn6!vFv%1~95TkmcO?iW9O{TvN%WMu`? z*Jso;EI;X!e7TxcKo)1#gh8u|hhV^LLBXMrfO>j6wIH5Kr?d|nch9x9f}EZHNA>~q zXj9y%;ZC2fK4b69QE#1AE8ewkd@T^$InmOYBXSdlYWXeDZ?@QT)}CmR@)XMD1=ck+ zJ47#UZ#U_=-euw0g#NRv049n`YerhxdO`tt)t~ixdSnz~bBm-@fNGAB1QeP#G8vJO zO}&TOl)77DZchL7l5=-|Z9pUaKgwhqEvTAYGh8wABR~A>bm0-}`KH?qxr=eizScNv+ZXFcKsq6~cqY@YJhIXPRao}rIT&sbcJ_kJ_9 zmX6(&>glh%H!+xrGte_=ORl><;ui*uL=Jb$OGLSl+NHBmQay!)EIcVj#YK8=nQepU zx@N0qMD8xyjUPVn=VDq9tc`JTefwlRXmVbqwX-SYhS+O;eOs!IY69%FmkZcL?9aYR z$zFJnA^jBVWc-0l@g@>I|lN^620I&pZ0N$g=!DndAnH zPvfl{1e;aDe|nZCAsq&C0MpOF&s#Wwfk5tyoln($9;)Y)oAY5}M^kMN2_a#yIu{l& z2pR2MK|Xp{soYe}3pc|{tq!#_a3y9uQdZ`5A-!5T6D4V*BH1e)F?(eKiUVFquV}@$ z=a(c!VpPpaGr#V#yP6;vTvJcxVvMqyp~YP3m}5N&_*}hQExVM2?gZky1=n43Hx%azh`= z--Vho3lI4Dq^hzoHw|tMeVNW6HK5CiNx{Tg19@C=aMInK z#-Jpi(=dSZRM4Q4?)gUWOz6S})3@w6CnsQER7hQ#D!MZ*uZl+-=9D~xsQo6spnlA? z;RLYomCGlhPHrly$fTjlbC`6As}Y}v9WXE&B0Gi2tx{sbtF2MDHKn5dK|9N3n{$N ze0lbM+{wt*m3YdP+>WK7HYliJVzxeWDBGIL$KOuTa>2{HOd47a7!J_b-G6TQ z%UjAj{+Sz%U#i`W#z;CpS=UKO+L9=4fIAlxw^6UUAZ$)wN({>`i&lgPQ9~{v>J#pO z+Dj13+gr(i33V-+v?61lt`Fq9wU)3KHIlnpG)=y&2pays7~CP60wy|E$;L^wlD~X+ zO)WYnZCv_x?mZ08M>@jiY6sR~EO?!-DRImy!IW-+>ZSqB44_2qbs;p6H#O43!(6Q z47x!cSD;lIeQs~jH~6P#e)BP$y@i(MTz2-3?Qo!MEGKiEQg@RFIqUy(Z+y&=j+WM@ z7hb4Ovey`@E`abAS{){o&+VCB6@ua5N|ZL%(e(5*K2Digv;C@NeuO`z2i-sldLa`; zFIV+a3L(+akL_(Dyxfm6u~EE}L^P`fUy| zwZSq*&K6n}WK#(8Jk|wmOWIVpQczu#YxA257F`7lf}o%pjUC}fu`rlW>%BaR&E^re zI7XD&hF%o@

4-0SsqhCAjM&5goPSX2UY8*MS3!I~rl+Z(*$FX0MnCysFSm*i2NX$$sp zm>nYRm`syWd~Xjr;qZj4q}@M$l@9+{aG>wVOkrggcv)aCpp;`c`VM#O@IEBBhzYTr zC9mQt81ey9PufB+bgMds9bu72#yg^*S5zW1Wzz=T!muH)mD zU%!}tjtYb=)d&*7tubVvsZOIil~VlhK57W#b>(-OcK`0555#L)c*4oY+CF_aQ#lqY za?{dHSUVNf7*2s_wuL@p459Pzc4k8uqzG_oVYEAWdwxw4$!7`DfjN|O<{`zbHAjK8 zB8nb6V+t0wC+4^%Frc-jo(QLLC1Z_R-$VZoQja|RPd+F#;8p;jArJT-ei#`YI{wIpniz2F*_ahQi-;C|u>BEGo=<#c= zel~JamI6ao*$phlkKSbOT3WWTR0jyy#X_7py@81{s0yk^xjW??39s98$GC&nQD;IR zgY>s$ahbD}nNA(ErY~)~eE8Kg#$x>{O<&O)f6L1W`grE@*zLph=6R7?Py>sRnwF7# z#s_q~`;6=^8tu1?Tr`^G8R)45$1?9LM~2^3xipAAPP=7X$v5yrbHz~Tbeb1EneJgh z>VfSwor7wM=S!98zG5UNV2PbZrGQyy-47Z_V;@;?!Wmb~=GdM>Q*{|XQn8*o%h5&$GU~V*T`Ee8P#0*vLqW=>Z2VaqRsuhG=k-mWPeHnT2mvGixhmM^FKn zI$?#=u267U!S_{{YV(oIGAdH5FcGg{2WU)t2kxX@k8*2*%46l`Ji3x}E zN_H``LD8Ei@$ri|GlhYg5vq(@Uznmcs830WM@3dKTQ*(eYi6uoN-7LB;s+=qcbz|2 z0PK{^D#n_Ds=k$as}F zh@?mwyz1UDXxP69{6 zyPXJ>cw#DT)KPU9Rx6UD9Nh3&YJ48ZuaL}p>=2$;V8iQXH$B0?aF)meo^R^e|hn+qNkQu}S`sZamF6=Eqt_ zyAEXaO;Gs#jG2^<>ANMk5j0QbP_a#Xg1pYebT(DBLp<=1sssH9>J+I@ZAe!|N1|Sx z5qm@}WwM-baK!vzB1;&P(ntN#R5f1t8f(YTfvp`{OWq0QlXJ%m$q>cOd9-6+Ju)KZ ztwGo6Xl#yQC-;}9tjbB?TU$CRFqul8QtVlt=D$k zG-*ff->F~kK|iwKHAPfl?3LKJ(-hFH$uE%1d}2&l-OM(Al0M}0&h_qbsN1@+EuY~h z*UInKC9|EXmfZN1wIRYb8qZFpzt`J1Sfx|>1KxOoA0@!VdDFF<-KI$#!_ICe6kAdg zr@Z_ySaK`~bdmyrRC(IGtLX5J*zuaiO`YqPffBSQrnWZxw(+-Vr#tI{Oq0Y01io_X z0YUt*_OW;kqF@Au<1Z}C1i4@5VwyndOyKkDtqg1V-L4o}E9}E4HTkQQNbF|onCEED6=*S|n_h=WoO8-yWX)$Xa!3p`y4bs7o{;rc zI-4S<%1UZlT3Sj<`qR_iB5G;WbgCjj*IVDO3S*jf%9t!27%iQ;kHbhbeG9FcOauv9gi{e*)_naL|>7Hgw$5!u;A)49TRf5A#Qda3bwT|LqG*Q;KZPzz%4R{ zIBO|zKeP?mH>`6xB1(@S#=!WNj@VA`qu?A=P1yD|fhcaO*#W%USd3(PD?^I4d=c7j06v$tL|crkXJG`3En90~KYk>FHDEA$PvcCCj1TejO~2mWKpGj&?_{Y_1~q3_0#BPd`&1!Ow$Fr znTq-|)Dr#gb_eOquXGrsEr<;r2hGl*qEW;!DA^jBD7gdh9;GYc7s*2c4|%rMeSTaU zLRD$8zdJ6{uP!Iu*vYPj^fTNX?eU(>PUwQnc#2A;fn7o5R$4`+=-(#NX`ow*&_A7j zT=#swphmv*?K56WR8gfcKEbYWuoYL*N_eiX5;r|GxKx-Nl_})I9~KqUNi2kF!doS+ zK>m=dN)ggWI3516ytcg@CpZ+R%TT)D;`oet*$elo#f2|U_9zAqri9pv<~o_tUUl!y z;<;{{73s91llkY{@_l^Pl0)VE5^wbuts_GcSn~5#m*l|AkMh~sbD&<|r%M?YHyW*u zWZj;VJkz?B55GI^5|(CF+W{zCzUeEA-p8rhT)%j^y_0(r2@!%N1e*)uM;oY8l!Gf* zK{h?|*=0?tL)%GIyu`K_8%Vj+PEP|IGesKI%DZ07@{VrN+n&@M+ei9L*E#xdV`yAQ zpNFFmt?4403O&QZe^LCbK>wONHeVA!_jPTf=yHfPT)=;Ox0EP%>pNz4_BdC|MbuiN z)e)Db33PZ+l8(23RX!8<$sy-WPSxV&_2A)X$NTaIHP4>9g%t8$7VGY1%T!ap=!OTs z{098k8)AQ7d3kM!VrLnpun!Uxu~AfMpF}n|0wCrI zAOqM7GZKK1Z;VE_0z!ndp=8mR6|>%(rDiK1+5!B)Wntr%qL{}FRVa~kVtctAN5-A8 zTF4y#oZv!g7y30^XV32 z#4ik^>2;t8t1OGR`-bb!43qra(@X@>vRp}>E@?s^sr3W;JL?h#UwMHkj9v(`)FmhV zfA*s_$yd~u>E0TKN*urlAw?4zbKOdAi(OwcxU!Uk$7%jiore%bEZ2 zdN&$ZWDmRegMzfad;{Y(wx_A|sD;~;j9yA*7Zo%J-;y2E`Mi9P&H=*06j9ZD3)65@ zz9537X#Cn&c9d1%s(hc5Dy`M0o{J{9VXfLlBh%n@YH~%I;fT7qRIpNu1cW-$0Lk~) z*kAiYwfC_D?mH}y&3gD>-l0a@DLBjlBcg)=*ZdR^uJtCRccpob+PVS)cHQ>|ce!m^ z$qB)(dsj^uwHi{v%|5I|_}32Gid2l4P;*O?8_quACkA#+1AP%KOx9kch$!QWgzvfW zW3pRE`)ru#@hGFt2r)iYTdtB?DI=_v*6y$+8GhG;9bZa@8Wh%#$k_>}si;iSVabZb z8sz<*gf(nAHy&&2XL?1{XQGsK-H#9fKU>emC3G0d4b~0y!ND1>Us9EG(`I=R0jLraP1b(<4i<_JFux9VZ8uw_8tyVWO{8 zho=@u&!t(q+bg4?Z3A7 z9MEZ~Jg}_Yd1o|ygL#pVFO;8fL2ct=C1%(k(IdU9@{8kGBUtgP+*exOdK)p{vQNkx z;T78U839fKr0BbLO5dtQ8L0k0vd#gzl4X7S;l#EkwyjArnTa_Q+qSJ8I}_WsZQHiZ ziJfoHz4zSvKj&L3E7`rWy1RDOTh-N7zxR3Qa#d}5Ss=+(spDV;vzg-#6a$Ji1f5WT z>qm-J(LxcfIHyOxP2D)YG%Xc^bG8*f=2q6w6opOaVx+5d7WQwQ4bXjj1qUbZNqvj}+!(<%zuPxv zVYNFl{Jr&Ax~Pr#NZsDDqKYoHAvM2eEWGXgo~L*2W$I?%xJFU7$f7A~Hueb0@4Di) z1eI#TMfF~TsVui&`Ei5-LlH*0&9+K~!w6de%!UoIO zba_!5_99pghZzX51aqqggnYj103{H^;-D?~Gp`H%1dJ zMMdeQJDyKy@A15n2A;_1b>ZU`8vU!k(yLAbcH9H&HpC_Owk#px3u(2x)qU?Bxb2pNe+;4ClB`x|q>4 zQG$tZ|61X4ZuVT{Up|FpRHrXTIbXFP4tRDGRtgUkLxZ%{vrFcfIYH*94uY@4A5Y56 z5mxM;6k_z1XdbjMw&p3}_Fc#*G`DFd(wc`+TnVvfSJF+T3XaV=qUAv_dGgdPBk--k zcI1_mbHHPw3TsYZdhQTOb7~~Lh*qUk$(C2;Ikl5lk>U2mxmx4#Yrl97p<9Opp>e?d z6<@zs#gB)s%>5fO+|F_JdwsRa$_;U&Yz%05vARlefREo&&7M=~1>zsnbmuDSsL$w5~ZKAA%1c{GrDj?UR7Mkrs9UnhFMUzay#Vke~ zuLQ#NRL`&X-J?!cKK%NGpjsD39d>)|hmj%^c{QE2<~N(G9Z#lO<;?Vi1j@X`{T>Y6 z_#WPvw)fqqZe3IEtYDJVGGQ{h5t#^Iiz@th)hM$)E~JQ+Z!~Utn21r9T+epcX?i8) z5k!>qG%oLh^DXK#Cjv6iGAg3!E+unq!U{@Fe6xE%56U0zNWwk~qJmCoTpBfqN2Cd!XGP4lP z|Lw-4jCYfX7&bmc>sP-}=OMwPBZ;7o-FQ%#e-JmlSX9x)){bMh8uiW%pW>HKDrjHC z)AyAiNN@cm2iHtcF&VDa{w^UXwfh@m$cc-VAjVn~ywkG?EztZA7TSxyQ>)Rwwk~ePrq1U#IQnjjrx+-ArhxTvSmS{4=H!H`$@x@r|? zQmm^02EHOaJf9epm(=FgB95FQ@t`Ecf1MLeW-+z3g3*U8ymF^s9>ml z0Y_?IAvUK4(=E2Wm>e|`C{q0 z>(kwLAo^;oG49r-O9Dj(!hoPuNl{(BC-Cueabqw@LE+b;mi^byPU?mk*Z9g;UAY1P z{l@uk{n1}Lp>Ov)ij*uC_CySi+;C138IoL&pdZPMr#7+;deY8VAARs!0( zGTAs|&sT?gkxmotW@YCDVLbSs1U`Z^+KSvUI!QrONq`H=((F()`m_+ZxMC($%MB4g zUkC|95bc}#dV>LY^`EGBI?zqM)3nWyyhB0ru=k`X*{6CAi2qs5aPO;I`Uz zm_`RbM+3Skm5tgI6X1BI?3av$xuZ`PrbYSK(Txsn*ISbX)7mrsYLI z%y%P7pn*~bv0g!CUpHSlt&}(EU~n%qIS|df#i%AEh>->W@hPSYWF4!jMq$6!gv!YP zk&GoMmLSiDg_2ra*Y%il%$93TPIqB@^zJlnaTo?|X%Q(3f0@_Y>oQS$2USnW1)JLd zLgyRIAJ7vP%y{BfCOPK?IrB@iGFc6)5Tap|@8|@E1XfqN-hl}wS2$@5&zR7$xr$s5 z2OjSjl#vP_6eZKw2S|YsW=-BnWA255q3wnASB#2Tg4^TZKw8!^K;oUSa62<|JE_PK zq8Bz}-_>Wr`@w_+xj|Z;KWr&*x6#mL$j+xxD!ytV^2Wc-X7m;4XPB)F2c zC7=b2+ylD7<+tk4*I8KtSCP?A{UDa0;io*eKi7uw#}w3BVQr{dteq1@qNViXA3L@a zi9a}y^8VO_rODYYVRd|J_O+8?6I%ErVAbv*itkQ@WJjY)&zsPH2JRQ#h=F zLc-BNJ@62fvP@Y0Utc$YxdV$GwY_OdlpuIAD{(15$)O8^4{k(M%F;s%H%)WT&5yPdQ<=A1wC#M zIw>+qU$3tn$``I|MtEO4}Q8Ppn};H(Hk~*PeK`F*qez5Q6G5EYn`Cz&~+3Qk!;(aL*aGLbBQHm=onT5g{`t)xzBaf9&bK6Tho{&2QC zN~@7g^t;lo={34n^~4)D@hyoki!kE`0sP(_9*NJ(^MAdD<8}yMK^ryZB~YQs_dx%c z{P^_?`ML{yj*Tw7TZim^)f%x2y3~XmjI^8#s-FmSl>`KjzQH0pjo#7Wh~|^5l+7pv zuO{;n7O2e8>*;Cw0v-Q#vR3MSBgw=o7Uqq5X9YLOc_kU|u)xXsnPDlV8I8O0{b~BZ zK|}4gX)C~E8m8`B->+y1ZS75mtjD#95#GSfs#O>yg;{sQ?9$aUIPdGy8 z6cGy>CIG!A_Aj45>|3<}JRumTwH)HI^ErtGlr8h#PXYApLh4aEmYzw%5SzPPYFdxyULH=3qiiyIE++OENUT2rXj*yNT{!l zKmsiZi>iN2MSP6ZBujp4dws&UN^IK~QFk3)mhN9Pi5=(t7;O?}lKeh&4Fs%ZcMV9MTX_N-jw4UBBFp2=hC5{yvCp znk8K?k=KEJ0obU9QKTJQ^hm)|EVHm}41#=OMqy}ls(jbDQvHnjPGxd3Wa4m+&kp7n?u(2@`_87431Se{ z?sdg!miz1%&yRM7Hr3kSHQvD;bU&+G?KdJ>VSaD3@~mu#gY~zD0Y2+?c*Z^SCI*B$ zRnf9yo#(WXEms^>kwDG;V`asL+(!#Uq_Cx(&uCi(_}b4&Y72ZnY`u2m8YM?Dy`8^W z3fu_1v)L9yVW^&Q7M{Ej5lWnxJWr4(R>X*-9my4lhIK|#l+QCWW-xHNlH#4^l);vg zO6%L+co?cC?Noo?Y9=c#ezl$btnZv}nk@BXmBE-drb;mK&>S^H79+oZs5ObDtsw8q zk6*{n?BkX96-;G@aYOUR@8qPG@5T)77X~?^p6^&upXXMvP@1k(bw`q(!L`4%1ch?m z-Lis0F_MCQhv+&KlYqSgfYer&B&#IF&m&dMpL%~JJ~IJE$pL^k$kC)4n;g2x)7xx> z7i?g%r6(HF4J<6g+`;>dT!NYJ2M?Fo*`&9~gCw2J8I=Ts<=dIgsqx_J{dp4B#yn}O z4a2c#wcga4G(vyVtIw;{;=TT%7JU=_!xp1>axE441W^$QXwQ(v^&hmw3NB!TO|u}m_%0g|mz^}}ml zNOLI5wFIn`?z@+E8{^N1k{elc3q((&D`hXqKW|Ti-sI$7r&|j=^ILAHW2?~y@G?rR z@T^;l2af{mc4=$f^9VqCBTnJbv@HV7vDiCD?vk4B4Ec-8y|jt)Elm^q<|i*gY|M)* zS5viBH>EvZDdYMoz9=2I)mkD(fW z7F5dEz-nVQQm*9Q%{7XYDbY+WW#vdsI&DX-%B=6prX^wX{gf{_KETGSvX*y%Xjb)T z+nJkZ*cnU3KmoDjG4IK)(p+3J#k&N=E1wKMcKx!w=ub>OOEX0S?@qeJCE)i&Z!R*~ z=|b+nDht8z4d8}dAox}Nxpc#0=1t_Tpd6#T&+)>SyEHfVfqRpAX=e(9vwttrclj?C zfPR1UG+-XjUv3%Va-WfMZ&6~qt6J%q!k6f>Lo#@F4{|yL&6@D>xu#AF%yRHI9EwFc zfe_gy42{#wT||dKaOcYs!qH zGdU;|dtK#yw?N+v0i6$C-huxkL+~Q3?U6CmPY8fG`&t-F5-TsN#{DD8BcvsBoM`kA>zO{uz41GuXnYn+p z5`r8ZtR@McS;KOj`BBvyJv_8rI~N2qF#%zrJP;X>xmEDE;l9kOcV+{8fj+e{lZGmh zRK{{_8^1r_8zfXUL%0C#O1@gD?<;}ou0i+z=_l;-u#$4r%t@oikNecUaP+$5Sq;|j zqxA__>dIpC*)XQ3&y06|8>ho{rR>V~Vz3+4CZ(0j4&}q@4v&{c$?g0n8Q>cyl7-FZ5qQWHB zwy;cR0uHsxdP7B)<1}IE)brF#PVXH;jKxkeAm$KpkeVn0e`re6Q+|TG-xCLmJ3LoBXFP&_Lb-Ze z8|bgww*Dtu-zo(b!w|Zwv?-l;Sd=gt&*I}nmlpEN;}&?v0y8`wSlTdX2%6RTL>V`w zlba@T72}F^L6(RmnOF+{ViI5JYMQ)@4T}h$$ia=4JOKb=*BQwa>>J}sML#FRc=Fpq1INl2urX%M%X)| zPA!@W%8f6|@o_1vi+L!qJ-v@a zXIMnCN=v2Tb1fZ_`;1gg!kzYpvx1m&#=J;cpsvX=K!sWAB z{oM?g>q~@o9hcn`1W9S->kr+l(|i(tw_1?bQFf{E3-iiO`86U(iz|<_FQuQ3C=vUzIIOSGf zrAQS}qnA!sDg(0f`1m<~cv*0??0TRi1LmJ{g6dy^wZuAVWxx zk-6a+-$=FKo={r^qqc0R(L`v%z@XSzkf)Hv+`UTQOhtYJ-_RuK5=@!s5J#x|K%#Qc zUT%jwws&d*f0Id5iou}y#p}y`6A#S1ie_VK)=!G?>%fi`gj#zv(k?-{xBPb8*}Ovt zfeg(v3|%{C5zlj(cxpsv^QZh2^h(wD4L}}D!lN2-k~3+o!%um`)IP8mi{b|SQ6vX6 zTII8cG+q^!mn^~Q0z;Y_r#MV)xlJ2_ok?2Wog6-6X?x4^USTiW&di~|q z0smDjPc7DB%L5yg#m^`sNFcmKebtOD(AMUso8pRuhX;krRmg#pJS2H!W}yI`O*#ts z8d`qv%;P>Q<`{Q6j=-COj)jX<0b)(+?eK*-U81fvWwQwVx%~mXI6%5;xA?6wTNJ zpF|_rgH{%ktvSi=R_ILL^{Su(Wj}d^&V7>|bp)OitdmXppZL|V?ez+oo zY*L?&6+@J?r=m@BxVv(AUY>lL@~3STXVqlpITDpQ;+3>YnXjF?!auv*OVv8O$0g5d zggheRoD%SFU-Vb1Pik3Oir=Q5$|BC8Ms{+lkg|V^NW|KcX%`Aho5pm$I|M&$NGVNG zb(2wSh_3p*@_NY;TECBawa9*2ud>t$+ZtUz+oKFJQMgehrk)>>g2hV~K@@mSBs#7@ zue@4kxBjc)T_SjpiqKfjN<`kwy!U$C7f-b^tR{FilqDcvweEawPL{+7|IJ}$d8I&J zLBN#8cpa(&(_Er}Wd9tO>rs?cqYXBSl=IzxdaCc;Yc91R?3Tqkf~YvK;4mr$-rXmE zD#(12fshadlgS=wcXPs?`YqJ*66d5?7nH4>`$@A$!}*oZk2O2(`&G6&?Kt zoS$swLOaCfw(U)jzz;lCLhrPF8W99hu!N?3M^M6SrnZjKn~Lunf`d~B$zEG6O<&v2 zzV%B&t=?8PXwN&O=6P%4>>lhOdP`JTJRS{-wEt{6TjW^rz`c_&+5uXcB)Bbk(^hm6 zTVMJTWXEo(6G9}g5^&YkPtBU?P*;#(yw1>m^BlhJq>%U1nQ{9*TFP{`K&!m$p{dC- zOphI%7}}~*Q06FcHY3w}vs!yM%!Rb{?tJkP*T^PbgoUtHwH$=z7*1t>`?=F=T#SJz zg~{szqR?*TO-jIb_`nC}%$#=SA)WzR_45S*3ZK5#ZFq!`*2nkrK+SLQ#?`M4qAUvv;5j`12;qC^6@JKlfIaTHpR-=d@PpW zt??_FWq1yesVU8L+crDcGb>Hb5671k8-P-l#Gj+H`a0XpXy}(*MQ_z9*@O5|ebp~nOQ9Jd!@P2AdwBD_izhD4r3?&`O2y!?WM2wbX0Tk< zY$#87GvfG8k(;1l?Cx9UcXF@6d7?UK>nFIbnQXUNQ}X%t(?N8oG7^EbJMBJ-E?VL< z%wz~mnQ^itJt-E(?0PSlNdeWiBbkmS*j6YtTii1fFrRQtitgqnW#Uw$p+>joeun%c z7pE&Fo$DtvVQG!&HWh50(K0fkCq8b)_svg5ptOLD3>2H9#SFeVl=(9GvMovit)6Z@ zRF@s#=EE6;$s}!`YhEM(wwITtz%L_p9foZBqvr#9;e|0qM!>F( ze`TpM)qQ(a*HN^RoW>Dl0ygkrBroGe&~L*5eAag}bW#vvwXQ z_{LJ#akqUL{gSN=VcdSd2%&aon*|)nCYvledBAP~YY=Y?OBDdS)lf8Sc38I#`gCpR zSvUz0Ffr$N(ZqDzgpA0Cc>9on$A2DwB#y@_TJ4_6qD&D?AYPw4Fze$KA!0ya=d#dE z%Uz{)mzK|MTl7iqGew+U|6od`u+YoXFZE1TuNA>a#SE|KDW*>_DyhIt z0WzKxU~4x-yXeaGG#w|{OaOh8_)Fede&ny z`UOln=Ml@Q#=e1hy$;j<{@q1<7~iIU2>1Q8bErhopda=m%M4zu3meoFudFC-e;a3r z^kn(lk~;?gidBT3lMGkXt*C#fpFYAsZmREh?a;m)!ZFjt0nOM?eM6cG$2v_5auknA zp!sre!SME359P>s^)1UNrft}PoVvBygfzy*oJM?XNG>=fSG3W*L(c5jRI#o8W!&dZ zizfhQ0edDsuQ3cAbsH?d<}){7SS#THLdD#(v7<&2DAzK(hAewON1=zuX<5NNb3S)* z>FOdlHOR;*nrt^=AeC`^Ff~Gd*`(=pFnGQqK$C z(0t83%RK`pZCq@35x2NW2G>S9a6gvv6*nqGx@?d`sak!mXB{S&A}Ro3ro=&}wjf#W zqIHqDHwlg z{Vs)#rHu5&D~Z>Fkt0nQ30^huMw+mdZZ!KW_$xq6(Y$|^)?AW27Bk=r{?XYtS5TAk z4fWDroIj>+nf2gNt7t&EUToeyPI6(H$ko*)b%dFdYIJ6MWVw6>UqR%XM2u{8UttQ$ zMpEu>^F;EtkmH_;pLhA6tN7rRfQ##;7`iz{UMN2DZISrNYI3N{8!~TEh6q(c5F9lO zYW_h#YInNDJi*G{0~)(>XRaYLTI00>x~)77yImNX72TxlEkqfA81x|g20SkdzQ<~X zm46}s@eFErzt&{BZHM$*k`|*d)hl~y?$?GT4!?Il>QQ2lEGe8lZwtgv-0K}Is^0nUhoaKBK zZmI~vT@#0w98Z*1;lb~DPTFlH!|>#s=+-UV8rm-Q+ajF{%B*!VP5H`eBaWCmG;qwN zP#lV(r^9DmPoNi{oy}>~lgQh2R(m-Xn*zwD@^f#)3!It1V=$p_3NLrf4hs2{o~s2& zQbuY|nY(2qJKQ+0WmCMaYdkTbjFd?^9kgHD{SpvrnX1kGVQD+NtnF6>9x<>;6;*8u ztIcGYP(jyl`3^cd_)*03O2NcKguJmkr=@##FH$xB_&F}R;0vXz-7VY5c5L*MO8U34 z*62I--Kd9KCs6qx60QR?NJcuzI6(Q!YV+d*Pg=G%S>~9%+YzmGoj@r=%*N(wuUEDF zC|k};VPdPi()>6rz#0wu&llz1@0b{IRBf=}{Zma;1wzE2H*2vjjc?#8{Ww5j2~9*acsjs@jNKMUJsI}I<@0%?&O*;tI;M-J zT1%WATM#IWicprvwS7IC`YYZHm;ZK-%G=d==viWB4(_IM@?}< zO55iPaP@>*ZS6O?p-s};D707!Pm5>mHbG6UJ=_&H6u(Zu=STXBGVq+q{#yJW0n|t8 z?L!*2e&0z58mC<)TT$Ow5z{EP*DwxG(4tzE#-i^eqOnS^aTrMRWVGk6Jl}qDXM{QA zJV-p34N>LM-8pG$n569o`tEKJ;Xgc7*$6f)0cDuZk6lQ5t-U~ddjP*^FzZ%{l(RO z*h?@kxiEQ!1f%xRH?+eloiNED(P&8nw~4}P{q!1f2c>;EYou(S0*(l~g3WOpYq8kG zsyrcgn9}0=9%j_VW;$*KF6lac1n_6;&E~32^q1XKVo!T$zd)tR^g?*qyqNd!U0LZKa9uzgwU^{kgQEBdb&wk@Z+0yEm>6e2y!P`DX>NF>&B`Fx) zW`_1Qy&z}RgE&R&3GR5?>tGbwPP*h{D%KRz<45P8Gg)_V;62+0b1x#OK2EviY#+B2 zim4}XoVifG0!uq^1fz$o(WT>s3a56h#g-K}TLuMi$|_EiCs+n#B5vGj}_c_SW@AV@P&8^w=>VUnYV|e6jNTcOInpw8ayyj`D;nf8#~GBXzN9h zn%b>K&H+11S}J#H8rLzRd(TJW-jit}UhRsSFS~)9!u+S1QCB(-#RF<+z)50D&UjZ% zDTUW~`W>~PE%EF^FStJlpSZN^Q_5=!>6yCD*e_N0pbhY~27CDkS&6-peR*BOdMT2p z3!keyzLVN$v(T_yD+o&|co!OSyJk4DCQZ_^rK-YbpM+gol1>$M4|bDzpd{aBi}qm* zaMsu|b=8`V;&PB|?B-VVU`Q&8XfFwR0(B9vjfPihMOyScp1Sw!cmjS3nk>NfVRYB7 zXKZPac@lZVej&BOZyunwN&)JE>|=gy-I0B!xa6Wk3P4*Z_q~vWLLPLK`)yEadDiubPRONN^G|{kG zjm8jI+aH!3`F=hDh>4<2aO$cou;DcyVJ1}Byf>;hdNIb?3&?|>+~WYp6;$&qU{xYu z;fjwQ4+H@9b+IJ=bUJmQ98NZVlkR_@nNsAfN9pz2FFuU?Se@$oRH=<$aMM`DC+5|x zbqp~yEx!i8+={dEsv(rWHGo3?gaqOtfk{3=P=Lue>VkO_bA4tL>-8K=yq-Gx8-QUK zkN1(rw^Bt&TmQ4uex+}r-G2nYn6M%8%2%E6mpQP>b#eYemOl9dAtZ2L$Dsh4-@|3G*(tbq4(NRoytQqYfpAlmSG(^)zy=vb2f0ad{A|2gmhUZuALuZW|j z=URt0(EBY+qv>l&DYbIf73Ww@3R;&4B;3<5WpU=ihPe!)l;3}(i~e&qkoeId1_C2+ z=z`ufsTq`QR ztv_&rxFH0Jv1tH7agcaj1JcrHR89*bZf-M1iA08I^Tlv7<7PHn5@4#|gSWiE_MYPZ zq}4$FeN}icKn}@Bn6v=F0=#bq_)iF6-3mJD81{r=9=N1-SQ-R8DLy6<>c}{rD2#j! z;48g?fhW7TI5-@L`>R3zEu01dqY?xp;V5P0f;6sO7Enw?<<919;=7~szVy_S&Mf@E zpqqEzAfD2Ox)4(0)J%1r^rJiBQ5R^3`2HKPFjst>5l$6=;p{E>RVU3639v5#l;K1Oq5y9W^Ll54H&;& zi~X$8?7j%9sUib0Qq7K1V-CmpuEQ}u8~eKGOCUMn328B! zd~ze*8#M!49B}l&;cF3KVPgDe)A>n&;IA=((t#y;LV|Z^_wYHXt&V*`B_b+b&MN_2 zv*z7Z&(h#1;yoLRd}7~)Z8**TPR5u1_=r8O^GK2yrc|oS;{y5jU|RYNYC{2k=u_nD zsI0f5<*3S5quk3*nhaecww*bg>$SJxUN4}bH;++IHzy@xiaSGrrk+AX3@|X3w^r@b ztMTLwiE2~;Dfml?^sh?q%788^P{+x?xj7eiEj=>Jbk~5XSKy-a>DnVU_d%%oj-BBa z>Twg%^2%;L^S^D`6$yPLRow#tgy8(n zi$1+6&TAnohLe0-HOEGW(7{z=M}u3(W{C?BODWq>U(YP@{{GPEJQ!PP+17pbgAed^YI?ex>N6JB;m(d; zP&~F?{QB0d0C4%D0=nwL1QJUcIGTyFjtG&l#-hWD>GO(L5-scIaNu50F!v{EAqvq;OxKMhJ#Z8rkvY{uVAqX>;(=X)D-*tEJzMw^*M;W<<9YXIFNG=%ATSmg0k$>^ zvABw8&TT_segTRV1`0ysMJQ03`5y(@hVF9HA!ShRHgil9R|b|NJ6a{Z#KzU1l60&5 zU$FE)PN;xMs*{was5(6{r?IXKq*W;=X>e8iiDCMiq)m60U*(U&@c)Jr-~oQTk9cQy zl^V8@Ap!BysVx^wHJKB^@{S}6o6LGS4Oww=A8kC>jCfmbL!wrvS9?XpL_x{o($dlH zuD(1C^T<+x9!qjrdwcri;VOEydZShGEx_KE3qRkQ6K`sUBLoy%VJv6E_HY8I9Z9QP zQ4~pEEj9!~K2T7HUS^4UdU2toxVO_ER#8!ruwV|9qCXyY`f^pIs}G7%TO{b_?oKZe z6#^9rk_N9gQJlbZ4C|#5U$;%V8iwk2B+(@|Dh zqako&;r1s$xc_-!FfBd(Mm0*eS`44!0t%E#)cK4loe&Y3+k3--#a&jnlC`2QY!4cy z31xzoBy^$gwkEId<7V>v59S-w%M}bYE&l0GA?+_0Xe20`92l{~8XVRQ&ArP$ zD$b3gKmp+*72*C%2w&&%%LKpJ>k!I~^xfA<=g7|lX**+#`|Fe$kCuBwW0xJQ0;5}b zdFb|bBXsot`X40vY5QSDBTyIh&rPrAA>qDUuDOohqpxBUNRyCowB0Ri*>uUlqcH8r zs<6UIjrABW5z1IM2GZ7HM3GhI@~a6-l;Dcl_+&2(y8JVWe)*K6`9OA^pC8b`P*e=B zR~?Mh%sT*kA$79of@1a8`1t4|6%vDH-=BYv*FW9vg-9p5RmWzU?ac3Oys3c=xj6it z4LN$FL?_y3D9`^I-sk_HIOqgHbj>CUzl3#HGhw$>+1g=4y|8Mqf(`L=uan5eMlGua zqKVt&GvQ%}17(~4-nkK>RoK=?=AaOifsjnwrx+y>7C9KsWFgn%lEEAD>ft_RPF!G~ z#y`K}nV(OxRWchbcxixc#=?DMZVj%@ORDX@TXc~@jG9mzi+D}t{MRg^f1Vd80E#mV z{!md&#=azS3VZ4$n^C92S*4~s!_Gdx6Mi4lt(DH|3W>(pj!*$Ykj6@082!n;JId{! zlQp24byFT)6ne;~+M$ohji;_wMa8H0#Th&BIyRpvkd=ZT1=+8rUjw>|5S-$es31I6 z(EAJg5W5eT6g-g~cp#vN?Bg!mv-uvr)_omC}CIzl;Tc@-^c;0hA@;#HMuyqYQ2>o9|Xa*gdLp3afQE>-8s zq~a#_AD!GSG;Q=Iy2;wI-5^l7#GXt4ao8}JPt582zlMq8h3HBmQ?-!$pS*-TsA}uy zTVSO94QVhb#T|;87Is#L&Gi)$miXGCt7#J~r!2aB3QyOsDCtRi?7(Ja%$+HupU ziEwPoPWnZDiI_?>?inN<=J!gSr`hX8Be8Xgdax z6crHLoy_w_WDA@n{vWrPBLXmP40SI2?=-n!>>&GA9$;%h6tnn+p{#x=e{k_QrsXTpx|IqtFR>U@V7#{Onk z{2Q2*p!l-pN@>+6qS>|ECFo&3S|i7E>qU%NmcB+Xpdm!WWYw)`Q~9SD7)8J4Elpc$t2B-*p%r zTzz*JeGOAp@Cl$5K||N=U)#$lMs}5WRISFWsym0$EE&T%-0NlupOD|=Nb2jCQS^U}lBN(K`ZTCp%w2~IqUXKI*rB3x z<6XdvUZ!k?{ZIn&*@Nv`^vjPG`{#`WrDOyS3Aui6C+vx(;L^&f7cA{)heVNS*SzTb z-pa0nr8uzdF`FoaPl%E2$YWs)h?dxPa#bZ6PSg-nj_Kw3TL2RQGDvkAiBFUPaiL+4 z`oL9Uiz(M58=)6zs-F#Q&5EIWe8V^n=`rWrR=d30T8bK){Me0M+e(bA_%eCQrN;fI zKbw8*V-kxg<{BupOSoAWWB%nN^xu#QX9fL9US^s`c~65C^MZm@^p)EPZozO1i~jrO z*_o1|7@Ww~Z@pHSif#Sy7?Ju4{v2zF23+AR<;f;og5XU*9W_1`rOFYovsDO;X}1Om zb%P4m#MKm!;)9_!v444O|F;U6HXwu!(eJv$hA7S;o%Zg6T@Oi-ebfuvF>FHh3&3d; zit6b)@FLqjjK(5LFZyTPUApS;8k>|cl!Ad&yYT!d{TyVP5zP5J*?-hCKFLK0 zy8hcF_TM8g@OPFZGQQIwOk-qlA*s z|F#Eqpk*b(h;3>2e(UQwu_)n} z9fvJNfYjw8bPn@1Den_9w*oNDTC*vA7GnT^kPX+?IUe{YcT=t^% zHj;UrFPfyX!2LpLfGSlX zNaxUSL~LW#;h*$tdK$Y7(BzuH>Apbpzkf0&(mza3E`V~`068L-~xVc74Mrn7nTsVrS`qXJ) zY=7F)i*huiWGHGFE{imQ*|2xK|9bp%RCVl5+VTyJlb_%HZ$$<^{!)BIQJ3cC!QkEK z^P{Q8BvI~rQ=C8W72)%h)@DAvw%da7l{Fp6iGeuL|5E;x4YH%Uw0^a8suL_Ku~{GI zjGa`>=_l{1z3|WL;(<=1pOvS|bHGIjjWIXMkdV)lO_g>h+rccA(SX&{aJvvRnXz+} z^(rE|?651t``5a>-Mt=CYq)Y?Zu$ph^kndz+>BY5n^60rF|E;fzWj#PY zoqruqn}AN{;>PgA-HI-xRt{r*-QN^A*GsC4OY+$SB?Xj zk`$Td$`X}udP?eN)g4kZn(%#I{7#elVqz>OG7y`0S&)aRW+mzuZ;-ov4!T+_uAGZ7 zLkcCH!DEI|np&&>9UEEDkyZ?US-M?~CgKMI4Bx(xW3u`mHA}V_oJBfmlej*lVaBEu zny?q>BJ>oqoM~CO! z3dm1?==}^?qs0pz-QKGxGNXZ9egHS3geoKHrxdr+hR$Vb z2b)Xi)$A4me(p%yRiJ1f%Ux~|6kT^`&2g~`YhbPAL?6GBJXRYzYQoiWeBzeIzij;RXpR%7YhCS?49Q9 zPD^11lUtNdF=uC>3K+mul*z_8dz4qqU5?b6tiIs=4MB_`*y=8{0-D`5Sj{1ciokUo zusdPtS4C*V&8z2qQ_*p1SpdVaj$1OTzH@7FNqd7Guwc=4mW}8Y6wROS(qO?nN;uHP z16=9LA5}Pppc?gHI*s{B0;`ebb!p2+4a{f}73|!Ki^Vb>f-RN;YA92^!zu%Jq7OWS zMvIW}mmg4g(2QbiIT4`Rk;nfOD{0AQ#~}q%V&GjM(_KlEmkTBZsK3$m#&7NS>|IGd zr$|C0qWnt3j4q-Q3elIOTH%@{)CN915CPzDMvu>J1LbGC)vE(mn5S=&T zOpNurji7=Wxp^A0O4Y1HAyfrysa#kIW+r*vPt$d^k`XoBrSGpo60?b6Ma0gG0?p&= z`S*tT$S2iz(@z}sM8_c3t39n%vKxBF4JDX3jB~Mkj|0JzmjxArC0{6RnTUrf_yksl zhPrqZutNb6QU1czFys$+u^tnN6U+@PuHwMztIR8qf73Dn%jk9134bY0dzNvQSQ z9GdN{@(-Gd`M#kLz(KQwgkoIQR4|a#obf`*at9y;0(+G2Qi^#Mh?7`JIf_6E)~$(# zW1|Up8YLwC_OOD&F2K0*>#<;eqwz`HTNYhpqDZh2w|xa_bqV__JqH*w8DPIdTL(?Y z`ct<<`n}ogKTsHYx4(SK;Fsa@!GoOYZ>Y3v@kS~LOKmEhu?={^3=S+ar$ZA{t%twH z##Ebq*SS;%KAJ+WNeLQ5lA&)Z=9hMsgCNjfde;%<%{4!`ZS` zo*|iZss4~j^uZk>z#Ab$@U{NCI9=ng*^TvpE*Dy~IOeqI0i(xRv6Ef_+}K{KSy%eA z_1J=@Bc{?o#Yq*5FhmGcydu^l_MGzBsZ~e3gU*%F>bFqThpx(x4N2t~Q44UJ204zs zOJ+flZp^e=ZR-%g?K2Y(Pp5sO-la{KeqZJqm`myKzd0MPID#XNWVp}?6^!(*pBr?s z(X7ms&m_wfnob!hMrdUB7!8n5RlrM=rNyh#9hm0Im`=H)Yma(nXIHlHf}-oVtOFOu z6qH#7XdExpSnLcX;bIj>>2okXRg;h|Bg^vqtohoY<*++4y;}F4)|DFDzxnIw3MNuw zYf7+3zxzqUZZ)RoO;yHmAs<$Qmi6=Br)3c-H9G^?Pn{Ms#r<%9%02fbb-Av_1hEfR zW_p|B!Zwa&4A_AS78h;WmiIJzt6Flf?0(6<@A>9}Bro1i=}7`3i`I%Tp*|O1ST5^U z6C4j^rZ@4lQVrO!&omeEP8*ytwEpRmMY}tXDN4>bPo$_S1-kenySf^G5x%kIkWko( z2v-8BM??)qsv$+TGX5>K>T5T|MCaY1fe=xQ2dGt(7w?G=4^+}?j5e>Bxp3Dyng zm#fG~JzG7`k9w(m;Q@wjEjVj+x&PNx>bc}pZP0*rqZ?T^wy)Sw&>@qTkU*roQ(|{G zh28VvidUT@VV)86Y2(7=n}eEZxqH+Z8+bjbKr&VXJhzjK9cn!pr=<42c3^){EV4-p zJ^Pf^fXF&R%HMgl6A>N`$4z_tYPTAgFx=a{#yDrb(kMp`=;^tAf1K<1cg7m(wy?t` zjTn`PuZ@2SztD%r#S{Ij!O;#2$VVuZ3G1s6eaR;$&Wik_*NsS(3tr2lm zGrDO5m0?;Dg(;b~V3}gnBJ?QgD5#=TuEtdznSH zaD?mz21`?l_F~g=n!*CsN>pixQC!=*r1e{vu*3%w%!IXsujv#Fa!^`jQXa4FR5iDz zN>$j!^2pG3WFL0W1WcrFeVhCg?{upRKT9(84akd*VxEnt6rNJhcKh&ag}mdIsVCbg zPgB$iGaN;m?R* z+q4YaTRjQps7#m2SRAC?6w=Q3G|ATj940QLVkp!%oeT>l4Xd_Zp!R!E7bWpDI0V2P zc3PBBACelHhn{?QYdELba0^pk4bUY$$)#R^-f8lMxz`k11QsK=pu zfXwVXd#@PywYUXbG$ITVG=U$g#5!TN_oga@0uQVudhBln;uNhy;bhQ1)tZ;q#ekWN zrZ4xebEx1W?bU)!dAfWcdyJOlb>$fD?ba^r@X47|u*#(#%8A5eiwjn!S=9PAgu?SV z4r4}(D!nZth`)@gwDQYWFlYEIMv}<^p#6wv(9jl*174}GsFpveXAw_5s$>v!tpT>m z8;a-(4WtRz!qZk<0z2TM3B`#ZPal{YsHofw4lJ*9Sv1P_#lu9&1=VDVn+G$ zkuJhD1CnfE6&K=f(AQ@T-^CgnTec8HZ(!XmHncu?zl-8Xp{^k<(+C|XYK=^H> zUlT{yNAQE?U;HUCrhz2(`E4GQqkQDykDUyuj+4?g?i0t3$cYqMw$xtf4|X>SmL5=} zryYmxXG4PZu9Rq6p#fX7yXWo4FuKiNjtB{|X(0<)zysWY_LCVZ*`I+gr)L?22Hk^tXJYg4z^#&5zK1X&#wfPU zsgNzNwn+kFdMrx`>A1=Kap^gxP}3WMIM;qfXR$^sUdBbYrvXg`T?Xs_f5ORsUqQL2`61*lVCvZvIhS_EmKMB;J0+FQ?f6vVy^bnd2~EmB=Z+aL)6>%+Q3g z`E4R}ec>t@slk$FV7&_soM+o=uVh|JDj2H0s3f75U8Fn0Z{G;KU=N3=pv;feZ|I%? z0e$iAnq8~d#!ywYD!zEke$=5A`A{_qe}UZu)9Lt<&Uue6>IM$Lw0mf*eZA>wX&&4kt1yPbRI= zdbv^N<9q=lr~&abV>}vKsy=e_bjkOOQs^DqjS+IkooG?Y) zz$uz|G1(iVDGR#(v9ko55BJ*>)lr&n6?$h*&r&}@QhT9fWFxNp*F%x49G(}2CJh+= z(U}r9ervA>p~ZSy+6O0!LFehbpap}+o(j!jF|Z~42CM!OBEWRiV+1kK^z@-tL2<0Z z6YZB8iyh)i&}x|hgT81bVZr+D2P)Hx5ZI3Hn@vWTBJT>?_ju;6N9Q#n;kK>RiyO2O zTcOO_KZ6s1{zict{e8O2#FcO7aguXEWXejj3z1Cwi1qCxs-d%C@Ub0P*KC$(9UHyy zUW7H&IDAutgQt9eP&vcxQl2<9nj9h~)j-=<41o?^OGeBiKB`@YGw_c)QLrtTrt{j- zwirXY(=1BD4Fe_}aJ2GsaynXT_vW`;Y+iEbTL{ctga$)u ze54qDm|3qTM)052T;Rgio+EBoJYytHR%Bv>|u~eBFeGpb!lEe8+Vqn*eEM`jZ zIi)+4TC`}GjbjVPGVD&~x~B^K)$k+v(GeMDpWNujQ#O9B(zKBkL6WgsLf5C2e}ps* z7DkZX9jzx5L9?!yzJNF;;$Qw02W?EST@=%7`;O)X?c;Djr*$)N0!4c|0zyrY9Y>p=~-?#-{=~+vKcko_Kiy2)8 zIdtxg)awx;212gE6O+m$2p26hp(mEiavi|zT|%-kG}~I>r2@}u|AmqD!TDahpg)_i zo5%V{@#Yno`A|9EY_&cN`Avh`ZLz574H4D6|A1D+TvVHE5jE+rAw>qwTU@l|h-)}S zTG)ntUAS>@b$#o(dGJQal8F#tyQ-B)iK5>ButuhdmQ!SzgVOq(IOIg)MuH*(C_rB2 zKMK$YGyAh}_rmB?*c{NP^Z!4$`PUCJ2wx*CxUXhvI8`4Axp`W-AQV#Mb8+n;qe^k% z*#cY;*su*mwAJ;=A?9jJv-)UC@tCt}@DYC+^Q)!z)EZ>lzEP|d5(06CkPs5m@nB^o zD0ULO@|K529>&CG+cJI$=9mKt03+%_OK%0Yq-#X z1VGm9Wc=v-5bpTVn*OMji@Z>?C){2`rrn4s&Euj!d5$=oBWwGDG7>DD@9$kFGn>@{ z3n-y)JTgPda11#-ZWrU1b#4t_2=E(r-xwLLecV%*fL^_USiC<(v#WKbrYd|u{8zW) zCYi_WbT9=JMSVwhYFyU^2>~+-ZrAUk!sqen3jFd;-7|?*n$2(Hj2tn9IGtZ33pr%Q z!lxgs3KotNQj^`(^1Fk{rTF*n-&@2!N32Em#$OhZ78;_SHW0iLi=UYy~H#d20IeP^r|vihEUUfp|Mz6k}7osEf5hr@r`om@jzqf ze=TTcbslsX`DQpvHU{JcaAez@7jR6SlQ3Z#F;jiD71Bw)(<}e^o^G3VoKur1=LU91 z`Lz@R=hsf4EeBLI^S*{~#hSiO*{AL8mVP95wts9wmc!?(Dg-Zo*qGOc(Eo^9k4O>d zlX>ML@q>B-r>&@}oLyKl0uvNHn_?zYw9FX#HG(`Y!?<>+@rcI7(?u!X8X6mxOOIzq zZ={kR^pilQYBtQ*_bZNA4m&KIQJBnfza5uypqzgPkPD5zc}wE6}C2MdR#xscaNn+XqnRlb2`*X6+!U))|e9k|=#kL-oo#kKNz zyJmk+0%!}?Wg08_#+vRqyc~DreX|wS(++n(>FD(RnexTCJ@s4PYJ>Yi>R(FvG?H56 z|4Dm^^uhbrF-cds3*v`PeNjS|vVPxk{Cx~6W0z^c>~%IIG_`N<*){gamnax(-)0e* zg0^5ObHx2v4!XGOK%rTzM192i|68$gl z6VH5=Mt}-xalbXs9tK>Zg=(`jtfbAW55OJC&PR)NvYN+1uhG}z(GM=`4lQJsl_wRK zOVfBg-TwW*^+0F~kHbNb@NWk#*h)v?$sI-&qL)K5GL36lu0(&&ayJOVdJJlFWHf*Ww^JFgjwjNEF>w^bk)-5G1T^!B-4iH{` z^W!7zN%ts^R6e|8Us6%S-TSddg_KcHy1``#0SFP~b6&mBYi;%gXysw%Qi$%Zz&wp1 z1DXvIxpKb`rP^PtONIkf|7Tr@d{>mTdKL}@i2RgtLpe}q=Y0nnXQmoqxfsOZKEFal zU|yd@GugHM!K+@9)$B{B;X;72gd*;-{ZI}j;&DmU;$IK%#IDGK*1KP0>C1|YQM~w` zf_Rp5WoYiFKRkHWH<;cnx=@rN(W!lduvt*LM;nu^XF{KO6WmAge|k|V2&6Mrcmm7P zx+6Ai^yBK}4XW`JwKvuo_%oT{1p^zf1)vko(QNeP~3{?x+XTD=7O*Dnt~=g z2d?wR$G{kix9@}|mkMOHLtGYKE3O)0N|IVgh2e9YMd3`rBiE~2l;zw;u|fnYqxR0H zZ6B7!+ShGgls!dVfAXj|$X@1?B#?|u|H*dEN6LEu#T-W45uNX^-S7O2!JLkc4lf;V z?wNEM!;w_Nj*DE<6stah*#AN1CjlQ*V2JsoKIMLhGcdy?CA*SAZ^YYE#3$H7)7pj5 zg1U>T#-HB*&Hh~>gEYd_ZCi)9pCO9=qLAKT8~R`2U_b-qvHyhTru;_&Lm!%qhPUhO zi5BoxLlXWTr_gsr&gkU&MKuS?I{%$W_s{P!ai7vouFn;(4`p8zsr?Q?&o=LTeMJ7P z3iyCI0$W^&jFR$J_ko7e@E}yBn_X$yZzcu2aSs@cGC2j?8zTjb0|OthQ#`ytvsz)$ z+g0!9nG($!$$wM&>i-5i;B@eD`P3`lNw?Cdf2qMC1+6EXL@uhm(djVV^G5x6B2HDu zb%?P~>EAdC!1~S6e74i`MM9g$Z^8f;+F)GqOd{VD>&E?(x|L{2#eA9Tdk@sb@@OzF zSzt$YlN`DW!I+YU6lFie|2-b zSToq5Rcg+yL*Q!y&GU3M+4S{v#?4|uxGH)s0*ngT<)WYb1%_vZ>`Iot)sQgYf$dQ( z85}t50M{TIRnekJ+Px7BlCe6lFLqYvC9?U59}R5fo)Z2?A;k26&LMWV++8q8DrKc| z{Zy}^3r|=saTdEB5@4^pp^--Dca=e55KZaSM)3wXKhvpm^g1 z03G2$4M-L%!NHClM1QI$Tqt~Qvz9*l1~A%VK&Oo?Ju z9R9_9b8S&pY!G^xv%w|eb0qT?kgZ5E{f(*cGT{wBM10=> zPNEDaav*rWuBbl~ay(r@fZ^E*dHQZ#Esl{qZ|fcmKF;-F#oTX%I2oNR%_p{#N*+MP z4)W?6GU2=ZY8?`UpQgT4%y`&pxdqB3fSi8sE0`(k4cxC&s6pT#up&7PMLc~0kM}*O zaXPi?Ou{|iUr|FQU6-HXL%SI^QkjFwqM2H)Hw(Q+e=icQjb&11e1^WZHMO<_xU;_e z^?5CnxtQxY>9F6?sTUJg3g~uqD0|TXiH$(<;r>ANaPYC494n^5LKQbDNy-q^7}P0* zMrhY_Jo#w@-AUNs;R?cNprlxHE9w`Uc-u3VYh(53(R@|4N!_ZBD#-{>RpHP@#TLHs zGSPGW0QbzV`vKL1VBqW|tUs2IsV3wxsgN=ELK6Sk#me>IfbZGCkZF5WW0}bNW{_O5 zq}RNJT2@%CV5Y_3$*OKT;XeJAh2XNh>)^vuV-1Kelhyqn-MZhZQNbUIRqhkofMiYg zufPz=OEir{OW87luiXTivBF?~$DWcng?Q8za3q<>J^wv*65ePz#f^5I)&6Sk-F;GT zsLkqoaa?L0WfL^rs9M~%liWyTH*X9K(PpqF#ACJe<+%hM4bAjMw`HfW{$lqzIkq%h zZizna+TsI>{4|w?`p4U*lfs3TjRoQF*tbk@!H4x_Nyu-Z&^MhRa~IrAmc}SF=Km!+ zgakKgpH!Y|Lzj&O@*U?rIMTex8B9k~@gLV9pJaRwZLobucSAUqKV66|_Gaq6NbwhP z-|3#8!i$^;#y~jZ_}+JehGUr_!!EV!#irQdczuJls*7kwbX^1aUG_f^J_hFd;^HeE z*Vz&tq(_u2G(ggiDc3{VVC74_7Ho!b=vYW(I-QDo1oYkCNqYyxN? z4Q8-dT_E4_X(_$5^SIGfNo!F!-u{)v0Me=hMV!)fQNyymW8nbI@z@IMRguVktL3~T za@;hEj0o~}e{$sPthU2yHAHhrV@hJB)ZYN*6F&h?Z#b?S`T}2{RzdxNCmEL2tZGLA zwtv^nn}0L4k#1v9_DPtfYA^#W^q^a-Zh!bxjSeYg|puCz>A_iFZH@6knFxqWj zNWD7tnOt;D+5H~zua~LlH0`xkg2*T~>jt~)*6Gd(L&yGrfjaL&uj$A6x@0GUhjkK- z&Fq_3@$b0e17LqO9{#i`@%i{+?XkHo_8ZdLO?VcXry+75uF9_J+6nb zuZ*Fw03kedH4|N~CKS*0lwz{6#l==XT1c*D-ZcB=SPXbR*bf|s+>Hfxc(Fq?s$}!e zc5%0)N?OdubvKH0X1qs^i{ryztJ0tK#2f2fFULpT+1URjr`PreDd zy${#KD7?@23Pj0Xl07xCAG~J;+xKzIrui^^uOs!bUx@$#$P-xW?Wdzvz9}slBh$3p zwnHHc@eqr{(=Qbp1MiRj$DJF41k-T^U;TUYn@hEL?>noiE)nYNfn{=wg2zyeY`6q_ zNbu7er<)4-?m(3>l@iHe*PuUoVE7>?KG@>Wrn){6s8Z;GUg)nY8x7k~S8q(rZlC@vJ!@95NRxr~e=Q91~m{W?#1_%#c-$gZFrcuAP&= zDr7C#Q$zc8_870fghYc{kfp|->9zmaE<|#cORv9J`_muD@zS(O3kjln}`Jy)SFZq+@HstZUsR`qJK4(CFu#$4k-b zAN!N%+4%Ycv0i;yB^kjM*ctE7(dgJ3A--GTgk-$}ARjQdgm!G3cHlmD+0bIWWv=GS zw3en!w9(XHix6M55atwlj|t_@5BZZ;IN|^EXp|2a!r!^Ra;L-JORA;aqzP9W3Ja{J zSm}$xDlvz*f--d{b3eV1Ro79e%k2!^MUTw(f#T_h_I96IKV%t>pZT+${a?v_t46p6 zjhM_Fs2|7D6t|1rVttma`L<&ftNTYOT$Zo)3^^+m z&rQJlov)uHfIbbk(k1H&!*}2R+a=@vV?=HILq=oaWYRViT@|4!O>-$ecD9^nVu=K^a$5sYD@nsh zFnkkLJJNy#5k9WNZ}yXml2PdU3EKfBc+LZxY>dG3{PZ%N_A5@OD?clQES`>SxEX;1 z%r``o3~0osyy_Y4CJ~dbE3S8C&z;p~%bcrr>8P}0Y4som#E!(!T9xTl6Hc}B;UtHR?xP;A^V2{*sSk236)v1`om!7Jzg^hWBv$*d3)e#gY^wslXdd*OQ zbOaD38#qad`5WQf1UoA63t&J!*Uscd(M4$Ri3Z2f0KYWt&KpmDtOTrk^CH;-AdHAF z-DjJWDqoyBP17am%d8?>(bFG(UxNGO#g527r)+`M;aeff%;vDp4PD$Ilhb3W~jCRMBc52R3+)+zPn8b0sCa`c`T^9l4#sfA(Pj`R`H+w2W%i#aCJji*s( z`g)=jP3Sv=qy`XSRinH`{mmBx)DfIJi8BNFEI{#^$#D87cNREZz-I>XV-c8M{LeV>3itG{~^~@~zfavcJGN|gZ3Mf#AK*M8LCV|C@hV+Hi*O9Bm z^N+K%IbDK9jmrQr$iD)=p4b@rWarNmBx4$&vb^_OzV9K0@+eA6;&|3`TmEucH-HzZ ztZyfXmM9ZdtvDj{;7KL3tYcQ&`2yY&lxoBt(z0^n-P1sJ=3JCo1kV2?_n?E?v<97e z!s|YBrF*wJKFWH4TFk8NWlJ93yJv&B^w!85l-STW7UkGbZT*e{>Gr=1WtNDrtMk&6 zAbG8a81Dg9_?~(l<06^y>`^fDkbb-NGoW1mea#Hx8SQ~YPYo+hA4ShTPTZ<;j<~(* zxOVn9BaF${R~Ss-JA;gn@l@sk-iIGS?}a$}iCyCFe*Wh3U){$8)$(}_KI>J9oQ(zA zI-QHA;DEDl;KY|vK%TBv)&I|5=c=;&6iV?qi>>GhHx;iCkrh%N*ONb)o0$DQ+!s$ z80gh|I0Z_NiZnD~y%b{qxbf4(7-v$xE*BC(b~zXJxJ)TmJM)DHjGNfr z+6I#`j20cw!!IUEZAV1h@7z~cz#<~-?fi&(UkcA7XYLj!*X6L|lioEpGPHEKxF z4kyiX^;>!|nJjX!z_Ms#`Y!w{McIa{Js19Fu(>fHF!m zT^S0@${T1yw#%lc+jx1K%UKO(=Fxv6+}JAEiicvFoOY$qe$mDQwN4nR?>BHDrWGVp zX?VExj_6lP?>$XQV)7Dn*y@G)U7%{0osig-)qOI@IH7t}VSlFlvC1GW?z!@feN!SX zcgU|yTXBFtX~RdNt2Y3eT6r89)*mrlAG=ND1WT*Un}I56aZ7uIZUzM@^JX;-oX@tn zi~0r0P~#Vt8Fmkz{5K9GNBCfhg2dOV!K4rU z0HV><2mVJ5617M9(NEF>Q*q#Y#(&wO%szl%Vrl)O&5_1&A?9)R z{cfhN(7Xiy0l~TnmcH5N`3`<^JXPPMd>PMw}c(UROA%=+dre11K>V~l9D3PU<#o4-AB<{< zAk;X)sOh>5>9anfGhH!Mp3k`|fc}?9?t|d6Pnw%;9zU)71k<49NhOM#`z=m_g6w`M zMYZjd<`oR^$`c3qEv;Su?5kNPP zNK=S`^D%x{P$U+Wjdy%E7`0Z|4^!c#N(ii0SS!z7G#WXv%Fw3@f2XT^Gn=QVfepGD zNW+@ssZtlpcKTS( zUeWt{{vl9_|CS6G>d)qmX!q%^Ck-6CIM;~A*O}iPZw#w+NcnqeR8O@WF5_ogk|C(Q z3Cz*&HRZ~?XLw5@iuYqqDV+!bXT>*&eXY@wUi|m+xLt2Q4t+M6PIvi z9HEIu{3PCl=>CMO3cR&?hMU#(P9J;4ZT(WB4j8-iL>rSzVXvSU82Uqn-hY~l{g;Ae z>VsA$N#MX(#c0=lOkrcC`RomuVf+O$)kwgjUX$r#0IJCQk7RS&fu?MAxVMt;%Hh!fsj|WAO#06iM{o|<5gJktOKyjQCAK3P6f0bpf zK9vq=UheYE&T6dQ%XmC)-mN}kBgOBRc`NHDwcbkucB^Klv}qGJaV!=tw1coQ(uYGS zP=^YC7=l`5600eXqVa-~?xIkQL@eq#80?d%;El!?RtcBwOw!B}8bV&}fWstCwb8-)PZbG6S0`s*{b zr(yzNog6L3PV+7ud8 zTZ342ARufn`T^oJYIGXfUd0ki&HJaSbe1S7kyYu@wrNk8dUrG9EakR%>tN)JR^t7N+Sk#jI z)eKnaH{#98?>)zIXc?ewpmwMc;rl~$m2phwJOas-azq@wR8dhAaz~+mf1X6w#kqF9 zH$GbVq?Bz|Xz+~Vj`+RsSAhW(=e1H?oyf;38WOp%ciR3%QTt79$J%AhSE!SF%%f{J zi*ef3(!e#@*UC%6-R~QqS2s0o$-pDv(eRshG)ky+2D{g5iGN#` zBF78`lCM_^gsb^o>L@p$c+jay4WO#K&8G&2-hd+M(liOzun#2Z<16Iy4(B2VLF$kG zu?`Z9>IYR-N5>pL{4J}{>dzqpPabZnGuNH1R*I!xL5zSMl>t}yzz+%%5PdqalAFKo z&`C3hV&?3(N2KK@c*p$$N7bP|I|VtSsm((1#BBCTyU~(UlqoaTvF8jVL;kbuIW+?w zQgk)fkjHL$5S&s5>0s_qL2F~-hAbWHxHTQqD>%kBtc{Vu?PN{)s0+2~O@m`-H14Iy zWb;+s$AdvGX!Br8war~sh|cg6)9mw&i$xqFAx-h5-Xp&){mZ_*N2Qz4zAT<`HvKBs zTo@n;8vjU`nTT$(PaUqOU?1qB9|ykk)xjp`NGgVRj@FV?BJXaS_uOxQ8aYJ23&P|1 z4`9V?mz@s&r1RKpc>MY5oi7v6a;VL*g&-EGQ(@J-`)Q!+Rxv~2=0}+)Y~4!m^P?=! z**gWw?V80eNvx9ok55OY73Oi#mnp0KOFmsQvg<(&A^GL{opkSW z=Qlrib*tU6)lcC0E+~>{8RZKSiu(1b3*N@PJGQHQ z>JUSilWnV=TWj`V4Vq9-fC*}rs=g%9eSnPSkInKiDpb=u>~8|x-!tKhwn|EyVMZ0B z=~2U#%XJEzP3OanL0eHJoP^x3}gYUgk(>~huffisJ57G|jnm^=QONdOR`fcNu z)c>7vqAg|Al=eg=%%Y-KvsDX=ccSm-J&>i!&Z00rZ!8gIk4z|lKkd6<@L+TLyp+2I zdqoP5KGI*9kT>U_D%&DB9sH#bK;f0o4>d9dOuHp`} zCb`O`bR1PF&oORB6}_g2L@jKp2;*^GrJKC5Y2ZI^kQW#PF#3MzuHl9)Qn#wcB&NHe z&bIJkG2WL}Lk#*zS*l^&`OYp^WfjTC66L2A#~%~F8mpDOMzv9p!hoZJTf=|&0y*pq zijNobHIh%aajFEnPr-cl3e`D4Nx@D7DvO%yqVwJQ2h*WI*_0athYba@kH2}56i*el zwTe;&Q%x1kb3_-luArAm(S+?N2W)8J@ImcpLMX0>tQ5-ZENsFnC6B!^W>$$ZkDXm_ zbGfvQ5?dJD*J_ForSUkvtF?fitWvR4utM@3eeT})?Ha7ZUPN6uwXCzzqO=uE;jlfiqbhBVA3 z6vFaL(Yo%(YdgZZYd2j70ji5?SSYb#RQ|*2)mGdsiFKcexKQIjjN@ogg3hrkBCvXm zR89>7JQ}x#Y+>Gq|Mb+h*tIoSTanie_GTRX8!vaWn5o%tDp9eeOCAvmK0sb5XT&jY z5n!1^2A0($EgYa?vZ5ri-;z8-mC%!;1S-o*HSbU^h&d(Bs84aRWNn|^dn^ZXW?>bsfj@5>(mOG2{4P8TSME5PIquJi9Abo4 z4M>sLZ-1*&hL(rL!*@#~QQM&l!yu|qr3OUy^}RGHd%u1SG8?_eg@~3UXwco|t}{(Tw7$FkNx%T=rAa45WhTYELV8KLT|god=o04zN& z1TRQ{#mG`2WlaBDoH`!c5d`ZWv6z5kn>?4XT@l9v((X{eUxv z8z0K^FQ3%QsAY>ex<*}Bz3q@h{vJlDkvbG<;hWA4T}iV2kLzU6IyImUDtb4dbGWt* z>l71OkXu@E^7xz+idszeUaXlq8G{-^>T#9c&w)K^7V13SW@HJ|&N@m{WeWdr0Z*UT z`pM7x_%>v{6`rTPBD>s|)s0qBW=ZTW0bW!SXjTO)tIS4tZ?qN$pM>0_{oC6A_+!oY z-||CPDN5&Rl_@B7vBNS0WA2)Ec_kH`JGrse^<0f&W%8{v_x_>DC68HFUdD0<)@2fR zd&_dr)MA5>I^o@yr{4l@RCC$T6E>c-|47IYm2dqKoX6=?vfKNa$c3ty_v_g_BRT_~ zSB*1VocWkoRBg~ZUP4@zI8s>j=|Gj94@fr|GCPXMZJ<#p*ymg2)6dwx&aflc^?+m%U9QOHP;vL)M(W96elYwj6iJL7jCi*n7| z<(k5Qdq0TM4`@in_+BEI(H84L3CW8l6&yiltsp9SOZAdDOPaU}iks7(VCHgz>R97G zgLzAy*m(2-r(h1;%cGS)+T>X?f^NtG%VZ;Vy1#b@mn&bX0N`p6ti*rqJvJPz}0~2_aM7xkr1dL z0>$<7$=EqF8cD69f1Y~Zi)}$-KW!TseNy2zDIPZ6z=a@zOx?IRiSI}%Ty@j|Y|)+o z=mn(QO4filvb+kbr&Fk@obV#emzzgknK{ZB70ocs!gc!b$ znW)!&-jGsAqN=B071ZHE)>|zl3lbz+!d~nJRh%kVSwVDh-AV=6i#LTWk$6sHWWj;m z>dETU{t`5EGaw|{zg>PrG?>KZGyb0yYA;;2sbBvQN3DuR$9Flzyff3vRKjI1N{^zc z#a&#CR;HLWnj%uho$$PDiH-Ux3j(YEnN4M~sp~xGbkDU>n@XR0Nk7$u}>bDKn6% zr=`M`5u3Of>Vm^{vtPb12lA5Zb3J|%_(o!6+QcQT z-sro1dUUlzG#oh&zX}(5(MS|J5KLYn0103LNQ+9O1b-Q#8#S0qRjj5ypp#@;(}0%Y zd&zG~Q9W2IDQqTMP?dgOqFHGMN!{(U3>lzaOVEkFp#@EV90#++>n17Egi4pxON4I1 zmM6#AMd@+JknY9yA1{k8mT)nuSQ=sJ+7bD!YokIVPRBA_hwf&)%sI-V=U zNivu89!_Ze80fLzvvo*nciHPLrmTOt5^w@;@U|e!c&?UMSaOABStVr1w}U9Cpr!Q^WnH=VPCZ%#EnY&Ji#`+Y2iSB1q_HoSOYP~-?0%;ea> z;@J@PkV7mi`b%`%ye@Z!lZU8F>kY=Ugq61kE0jOjd&LP$+lW(A|N4ycrRBlRNDrXp z!a|P@X^{Q@Q`%Ls^Wy%a324a1g4^S6%B(#cmaWp|Q(ck*wOvS7LAiFI?zbC11J2I6 zBE8G92wB!2sGsX)KPh_(k)wbs^KR;NXWa0}dj$uZ7@)@0{NxiJOrWP1Lj89Um<9c? z>E|&&$L1;40ulN$Ou`RL74koi0v5>iIp6oDOohnD2T8153}Py~WUVZ&=%CY}?^%iV_fZz2Ooo%x zG#OauAxXFEKc}VB`k@q4u+gr6bGT3ZClhnb_~mrKLonTC1G-H}=6AxV#}4x}%IxH2 zuKxbf7?kg^h24e5N3BrH>KU2TkPf$gIgZt;qJH)w0uf$(BAw!e5X>rlCn7RVc~S|a zfAZ$}51}6KNALD!1Fn?Q?Bcda4%}CZwDd+eMLic#cnX{(i)q9keM!kBOl2zKhxDGi zIAjg5M!VnJ!{l8YUDROKm&`>+h^-Qrftu6&NC1|Q3=8l!U{}X=0Z#<>RzCnJIVs|Q z1c~kUR?qAGd?%+E#e%GC^MS%)Cyd+N@?%^>G!io4kUl_75)5I3+OJ-x&JgEd=rsIAP)J(P$D2C1fl@i=EC6FI-Ojclz~1$jerrveTBr5>GC70Ch7wshLlK#NlJh z5wPO?^uC)Be644p(kX1xHte;;AizkCZ@1R#`5^?4H~kv@Zm9qf^=1;j5Zxj_XJv2;CY;NIQc){)V0xmDRIJ% z$n5}{Q9K>$K0g3WA!4+wV#uVCF0-4TD(5Yybv(t34b7hKHY3{ZHbO2cb@|im2PjWn zN8Jl+@CTRX68=4WKOdK8-n&c%mFL5@vp&9=&rCKek;ckOD$u3CJqrKO$zh0*)pW~> zFC`2SURT-E3T7dHYxO$5VcRsH1FerOjya!!%qE)uuf4Bu>Zu}+BX1HGYZ{K{{*}6B5W+cx^{Ae5*Dc)H z!>uc}^xw^WWB^K@y0+tp_qSaFw<0yQWD+xaURQ6;Tz5N-OHlx42oPVNg1@L=^&I{2mW?7Z0a-0(=|MA~_MWUl{6F z{NknGX=MgWU0LYg%sA)$i-Bq@KwxcgS5n}_w52Bw2US#s+n_5Nn4WX+Q-yr~Tp$?!d;AoIw9>)+8 zM2tOuwmai}V4q%;TZ6q3Y^zMVY}<$bc+*}PrUEz#s^9$u&MTt8oKOUj_lu2ZL;ClX zx72zaKR-L`v6a+}@yAGd0Php?{ja|T78we#{k~KB_vk?Hx;ASBWF_^#Mh3Ntx@{^J zj)&h1BRAb|D`-@SQ{cJnxL5V z-Q{dboshaKY zuXl^o%dkz%%*g4x4Q~>-`bm3o8vz{2wXFSJ7n|NT^CO)$x%wBq@X~R;+te>Dw-*BU z>jESEH+yU6b&F0ZlZgsGf7Y5}M7vSc>bwQ{X(){T%Z%={?Z}N6H=2Fm%FwL8@ter0w&Qv@O?IF`(tvl&$i%;wl6XSU> zW5r!Rc5yW-KqGUVF4pYP#b(gcsn~R!(N+I0Tiq8roTEJYLj4;r!Z`4`Az5m+i&XbW z7xD5c_0Q>HX*f9v&G8HyHd4RRScmsT8SkoZqv zSAUabfQQ@*<6_$9h>;Ug;vr1T_?+!n7YMV}FHem~uB= z*Q%<+K%u*-9x`2|Tym?zM z#cr=>Km=U5Fsf!j75d#@0PI}CtJbs(g>-AoRV!w!dmM+pfl#GgPl}dsJ|ky?tOGpY z*A%psGStj!v;+VzgNfU3rg2PJ`mGf|F_Q0GLrJtq>f?*nUEM?KR_c8Vyi-1)zw8Os zHhuVz%5C^tn&Zn1gdHIa9zysAcawRjQHi?GNU=UQlVB;V{7mbrYl{E^AK{m3f1ac^ zT)2sf0ax^ue(beXeQ;H2S~1I$j8&6AS^&c+NAAyc%U(^>3c$p8|M-v$S`@fSj(WHL zaCEdCh$V}W%o&rF?ZAro~^{%5xj?&{39-Z_c#cJ6iRquAwiGO&helLUTjm-pY7_ekh>N{XUBp zoap?^K(<+0uS~A9lv|%$-vV76!rlo`+)mh=ptnX zS=8LdfdBh8-Cd-xk%V7N6L5t;lWR7ZUgD+lnvE~W72kbHlm}Z}px&sD>wU}H{Mp5K zU0CYvn?O~DRaRBW0b9Mnl)}q-=`?)N!>xH#_DNpRQXFL25$!J2rLh3g*E*cucXznn z&cku`IQ=G3M5j>nSTM@0A*x$4T<;(E5$Xq1&{hPCmC+;LYFDTu&?xZiSI>`2-{$wc z(C~Iw>i-h_POoX`QqS}DxUcU0a_O*8T9Bc-Nr~NZt7jldo-CKo8LqOH0lHafzD8~E=7s;i72aXrahzams zKXhNdAq{lB;;rGi8GR^@j8UgKfdK9c*;OWo1=A}(QkK1ceDc<-x9Q)^Y@Wk+P9^q! zbD5M7w|$lj90*N)v5bicug8K4#jH#RjId!nwl3PdA6Q0Jke7)tIM>5@deer~N|ieSP3qyf`t-Y|$!AqLTdM&h@<8A+ z8|k$gP}*cWk&SabK@X$%rzQ z*?Ls>I3A@B>}RXpz4rdG)f_$D79Rez4NNCxTSr|e%PhA}ihjq_t@h~M`A+_wQ|PMY zaR)W%2KDEVGbdz{>=`H0f5NMvZ+rShYUT5XBO{$_Z0fyB`=hU?6W&#BXHZRdq{6?u zvwW?|6%AhQ&N|3ojs#AYcOml>XVgna=gr%f-gP$BzYk$3%~>d*YY0%cfWb3g6~q+# zCpQE#0I##6(fhpw_%dsxeqFza2cHG()q91sHSQlth3pIbkUekz4c!g`zQ-y$GV(mE z#_wsZZ|A|BfoHliVYy>1#>j5y`;!F8e|j&04zpP~x26@t`yQ{=oot#QB(cvz4ew9J znoPu)7Fwhs7iK~WC)o|D@e?=4HuJ6SYb)EmwfPhK1|3`f*ca0WLq&g5#2V;(?6zB| z`+cL!Z$kOF$)y$)qanXT_hWz%&#t(9#)@fOT5qeX@a8U7CcMs3)sSEC&;K8WMozx{ zL9yI=Gjus^=F*(L!qWCpGt^X{o!I=Hr(yp)aw6LQ_{!Dpium)^ z>F0V73kVlAZ7gF~pKH#wmdI^3`meBGS6kHCIh3`HCK#xcV8^?J{;P%1LOsf}VT&hb zQ|YT{ud`ZU_k5cu|5?X4VT>9>BfKDOK9lBkU}Y(2nYMR@;8m?Wt zH-OJmT9d@h>+yE6N_T^yx;{1UxALH!ANuwMbvTr>8)|-;%P8MPz5kDt_pRTYS;E&j zp$h%E2^@I+I9DJ2X8Lq@4EoSqx?!Lr@%deuM|gN%17|s zsShtJ%xE@WqUZ6syn=FW-Adg)R(Ps69yrGZDpS z-N@cqTNokaPqhAtbXe>Xhl3r7GQB!@yev(zbr^9>!C8`VB1+r5(p= z!J2>hO1~M7>-I2T~Qvl^TFP1ki@MF5if7EV;Fl+UKrii z+1K6*6IJw_yAD<+o?b;j9wNuVyEvv9M3N~XpUzupHX$~`z2aiLF$dDDnyvQNQ;84+ zRsH=(aw%Gxw3!B7E$-*1vu(@UM%d`eE|)eFaw(+_?2?lG-`4x0PMGX_DORG^o4yUF zw3Un*Ih|~1E_>Kd?M%&JV}D#QeWJ!@jwvB}i;CaUXg1;L<{%Ljn9Bd%<#k^qHZO_A z%nyAQ1pcY(_>&~hoBdq1kKV)Q2a6NgefRV756Ntt4qG;HBgPwH$Q3@->mO#Pvq&I8^EBT*H`Aa+6W?xK}AAeCUG&ZqD5i& zWNl4uzu*YsbC7%o#yeISAL^}RIqpab`-KzgXIp94t8Qzuoxq)3kSTm>iqed(S~~&t z$eu#G8RwVxvI?@YlM1J^Mp1WHKhvjQN4?~b*Vj9{zd5WsC`)@R^}KG6CBo7`^L;wU z`?P6DvrpE|$KrkWHRe;}gtuVE7T(Pf>~Cq9%ZK)|w!y7$uoZfP!Fcj zNW|-N!t9DE$L2wf;c-j2R4nA(+tY(D;HAUE<#uW+8pnOk7Qb`_TA=a&b0aD-?H&Fg zi5PEfS+kf@rB@g4VGp2sz40@B^XexVZE@(YBfEbg9rSzx=v!JHNVDGX7j#K;==016o!9d~v<# zylh6?3KsllE_?FYHtShMwH4%n4}DRD@3}2SML*nM=19dUzkWSaq8-Fnt*NO==yjR7 zt;+jHbZd?7BP-@$lEThxm5hm%(3JmH2IzMHw@u(Ki_2zb&-=&P!dH-JWt6X!T0q6% z?)KN{&FQxPaM2E0|0EL`(l{$CYjt_K;vs_SO@`Vq&tdcWyZf9-d>($#ojWivAPLP@ zlQUJU`K$TYhzQ>-iA3acYm#`z)W+w_r{{Y1!)%38JDTB)2A$9)CT(7K*B6KH-H1_s z7aBfnkllAg2hKYxRFsTPqK)lAgdZx~=b<#Ef|Xxn7+NNWoBVV-jxyGkLog@`UHw;WADhnCH~Wi&KbI=z zMdnSCMAsa*w)BhzJziZP{3&%jsBd~bPT9E%II4icc+DsHGY*Lm%nT%u_ypj(D?zMWdFKYbv~-3dHuWUQV?8KB1}h zICDm6`%qC)p}=>BxpR=pY1_#t45C}inhRLDmmO|4dsPA1j)U=3vm6#%FVN8qJ?TcZ zFgV+CG%r>)(evRUZwrKq24R~&LKnkT3x&TI!{x_oJ^Lt|OuA6_k^!`z#Z-y}Wz9z% zV=lw6_#)$TL;pHya3Vw%M1Mi}wUbIRBMaI*KdUq=(X~cEWfQE;7W<#_oMHghLLt4h z8qbj`QY(ob7+^Js&1T}_;<{cd-QL~?FM6_32+K%)r`Bo#nSfWL$;rxtWU1Cl`&3}J z7{1?S77R5=?~vqtkH#A=l$Djm7@5t)`)9M}LMtZVHK}q~v!->|(QtZQEUq=z=N_ZC z{x&1=Z6|*tv8v^`_G~ruR3@;IM>XA#3ch0LzqjkogpY!ktS;TjySkl(qe{WYJNO*PrCsS#H z2AP!~#>ULsQ0W1-LR1d7H%m-eQ)?O%Ps&GSfrKlm92?7enQ9O95<*?#H)@wa(XL~V zVA4lpB@D6ZG^0yV=y6sd<@IT@&tk&?+>t6pxk=P__c9%a7+M_-`5G_+i1#u>aO@Qv zhN{IBmp$~quoDEi;3{rA2M6Uxc8_{vV@c~$?`oP<(#cYx<5b%iQbkQD@7v5hO{-zh;$=H0@?_%36YxLJ{K=KZP%~_hlC|-J5dw^RWUR(Yqht83udeRV zzVhpl=Yd2tAKG`l4}`3z13%I|)glS#INIsz`g7$5(;24U6+&z1{riqkQ=ut!{C~B= zbQWf+tXfZL{Tzw?_+!^GAn0azZg=;K9Y479I=D9oyWbn^Rs#lWre{`wUJE7qS-<(ERA7S2 z=$DCTj0cc1>7ag;ykPsrD^@6hJ5mgC$`}4f`^~@P1@2vjVF*x-QQnb2Dxf`14hA!a zr@p-#PM#_^5K)NuYAY#7b`<4(yFOX)`m(`kf8PGM=jBVdS%1Md@H1OmTZST^c->cW zu?UDEoQpmV;gljapQ*E8MtyNQn~7LyCS75Gx;Y=nkvf^T*kpdzm84Y@>9|+#@hyvn zgoNQJ(JJ*%CJzW| zaZQCTMDumrg+`w0@PE5IZKoC#FhEn*k0L;sH~Tvo_knJf_Hox~2~G5OPx8^LpEq_c zk~v1)5B(UuVt|PU3|rb^pfgnZq7uK#uB+q=Fk7ec-j+|6Sn&2-{0&`8M1oO89Qk&# z@6pD7V$@2d+PKv3_6Rpc;_o)TbO!-&HV9(Sn4Fxtq9d}<@zhjkbvgv2Kx^N=$rSFN zgh_A(BcqS)=+L&kSn<$$|N9)aZReZaNE@*_f0zUndiupGNc*aKRP_8-=t#U6vxyB5 z2n3$bFW={qVpuOPxupI%cEQ(?^XZ&g_`ucL2WzTo!8Xj03jsdo)f*X+2NXG!NzT12*|@B`HK-xbENn^&~k$0MEDW&Ci?!VxRagcif;)I6b=0NCMQHW2zgx=qiyx?ky%#J1tp!8|}(Rn?FdmlnL#rkVB{=8CN$QxMo zFt&2+$ftN|uOM$wHCnV{V~y|Jvc-mzsa|R-J^Y}(FB|U<{PP9cXP}c(2WGmt^uNGu zi3C`Y>A0U!Yh&XJL5(al(hCW-bWZfv`cFtNj{R}A{*4y|^?go!bCtKHox3PyfU7i7%iMocHHdMVLU1 zDH?B`5A?LOw9L%Py4?*w(=0TQoZ?D#G~HwmJqdwZ5Bn?$s|ZFlR^&@+=yT(?L*%KLgMr(<<>mCfR#7Zn~lI=Y8XgJBs)J>Usa zSk1EJ?XjQCwT$pQr`Vb-m>7tos5qIwWR8k}wiL<0g;ifD&mz{N7;DPmN)bK znY0b4VPbkqF4AhgGe0Nn@;wUrSpus7ZhOIm-a-#qkPFRxbR+Pp?1oIulnEC5Ab-~eIWvho}d$y zGeLGQn}6$^T{kZ>T@4RswsEMNza-8wgA{u+L*KAVzdb3n_UfYiILh+-?9zXK31EM&K^ds=;M> zhxj2Y0Est(o#B`ym3{OCVY#$dGmL5;_nkWGu=jqGRmh*X_ zTQ9;9F5SKPn#widAc)87dSWat{!C*Vi7^~SlOMnhQ$^}LFT45sj>}oiRcMv3wJJKX zbLEm%-YKWB*D{8PM+;`4bJ}EZQWbEL=IkZP-}s~A>)w8o@NKj%{bPybmkeQ8)I1JI zIFKQ{mN5&p~#dD9~-E zAa4b_mOz?kp~2q7C5o&1zOkzM0sW0j+0 z_CGGI%|}#}pkQ6c(wMsORZi{BFBl*_FgV>DY+KyOV}ZZN+S{R93*GI#tE;Pd7Bv+W zV}uk_(Kl$bwN@$T;ZLUr>v5Y+oF`|Yy-nWLsmW!p%00cRnT@-+s6)B|iAC2L< zO&6)79E<@^F#HjXvq|7y)L}zwj@f8_nf-))%KOn#-A+ zQ=j+D_At+*Q9>V~!KRc?PCdG38*~NwRx+1%(L<-X{&?Xb{ef-P_2-^_rLN@{{0`A} z(vjNGp+5N$fD!7)x;b@2jHYlX zsFr{udkKs&sc_oY3V%ki3M|ry8?75QvF~6x&+yF8xy(SJd%Y|!ijRWNjhTD!y#a$7 zO{QIb=KJ^WW)r!(HXTprp943S`n&T5{I?DBkT;fRRR-C#yp20(-CW>EuK|iPP)~cN zuv;ODFS`Ex>^23z7>rQij5ChIGCKV_1f&7MEWOzx+CD@szyF8b$lU@%5tD zMjRcM7fRo&VDfE5c)aVc@ItI_q?hT>m>K6JYMv2zm;PyTwiIsv)rs{|D275xo1nSu z*UwQx#y5fhr>I%c%P#{E@{1jW%;xJ4B0%g?`2(sXL1j&+Yg6Fv6ay-pxR(*vBlgiq z4ghS8U_dPhuzCXhsa*Dx3=xPVA`Po{`$U%hpMhwGUA6~j1N9SUz=muvQt$Z;(o1$C zj@aoJUNg=yo#VwX&SN?q;kLsfQTQ*3PJgQFd!B)~>ZLY*sxD&Z9im`jg3HP^xejDV zOKR%vkT3OD&3kdKfQc18H}1!6)1gAuqSMvxkdP1?R`AKiB0DzCF z)(gvCH%Nm>P;8i9998FIA}B(`<|ZpdZ1@)dsik#?EP+YJXsv-t- z))4F_t-$tlx<(t-2|>S<9o8d_Qo!grMv4;JIHG1zEzd!Hm!WcMC#(jt)7Qeq=!x0t zHlcdMHq%6U-4!0T{bB-9o0=u9OBW=_(Ljq1c%^QQ?4BgfFKuY~xlax&9KE;9Nm<2h zm=Bz8jAH z#Il*FFRM0DmcF7u74|(TW0e@4NEw!&1_FX$NlH{m<)4=Z3k8*iQ|eYnK>CRXvBGiU zv$Z)PrR-hg>j?y;EQA_8{3hZ)^yGEhFcQlBY&i9PU1Rpj>CqiUdJH1(qYjvBTE$4J zV0<~@gXNopCKN6ud<8R!Qx2%sx0bGKboe=hkkU(rf-lpQdkPP0in7h|SptPR**yeJ zm1~H74tse)uhGWF!A)0B8|EDHotpY)X!o}<5d)1WkTHz`Q`hMApMfS!e3$?FrSGM= zaaQGlzTBvAQ|DnJTeDcG5}n7)%dk!gOH01#8!Z@Q+C$QZeVQM10IvWtOHg_H`1}l@VJ0>EMUsf#oE$AX%c2RH zP202+N6NKpnlH6($CF=Wi`a)`(4Nbcj+m$el0y1lli8*b_tYj$+DIf#h+WgmusS^E zlazb+0e+756;t^yy9@bCHL*TY#;r%Ndp0UP7h&J6f_FDkj+BD(m4L8i?2in>w3z); z;Z$PlnNP%Gg?rA^FT;@V#el`>KUkgbn^!BYtDz^L1G;YJdDaTNpH>RT`^Ytp{yA+$ zMMbe@Y%$H67$o?K3cR9@XIthu9l|ZeVQbh}Ni_=dkAZB?!4+bYAn)4S(x|JLfixoXC?~MqnJ{I?ISG{i7c5O5ouNrIo6G znu~DNFKJFW6@g`CFLvy9b^6GOTGx+ zWvLWr3-I@VdBrWDw%f$eMV|(s%W}>~m4*^ULN)B>q%>-ZHS_yu8GP(yK7~{LrIT3!z_KI6 zyk(Es^-NW0(Ntw+I+SI>fsFM}A}ZPL4c+`orvlFf8HYiy8Bi4Nhb=Ro-lMvUD#@M! zIi|R2h1TI@BHjvW>uTUj#ZqyBZB=#MgxFX=+!COB3p`)aX(gU-$ikj3)m^xb1;@{; z<+ABM6@%o+6?tJJonY(y&;F(xfR)%0sYf!(@}obc!=zpN5{1WQ$Zb@^3Hz|)yew6w zCvKF{q>_k#Vt*W^s4~jmYr@jnrDuWgDTE<_|i}n2O zFcb^3#icZodY<$CtAY6EwAzGqQLit2-Lb8%%)<8UeMP$c=3qj)_dQq6H;}rW)i#4T zeKY}#iD>xi#f}8&1$)|AkU}xDxL>{MbTgM2uv%RHT$NuE>Z5n5J2f(oLSM$`aalgZ zABi8m`5Y+=A1)nGX)sY7#?)S8?w%Z$vy5-rA@CfQ?{+z3mZQ3R;Pj#kqLZh|K7EfV zrgV97Nb}4XU|crh5X%@X$AMo{C^jFe6CDrdRp)xRje}D?Ypa-CQvVcoKG0v`Txe8g zJ=ldl_S+MM>H%pokQ52x_Vo7dxM$hOm$vffRw)&F;5~hJX2g5c{)AqY=9*B@35|i( zvjy*BXljp@6NAR|Iqw2cG<&RtP=M6B3Rob*5PB%4hrVz&0J=piyz$7zO!i|zVwdMv zpURnPvXFQVqJ7~Sj91#en3quUoui;b<;NjC?6A#HHg!kkOr z3r*pYjy_HNEa{0#-5M3$9XX?wG23P_~PINpJPMS>t&+dn z?@WsBtyYw)**)?`T_t(seNUdY(dfZq1$tvCt9hVynjl4X!)VEUZcW@*9EsTEc#qQ zmw`I<>1bzzaynXj;{E(P!mdD)^ttvBs1ykF;zEN!>vt+}mbf~fJz+0j*B4?&b8msv z7F5!8*XJ}I6o+@YaAJ;yM@Ke^_MLFsLM8d>r9ivxCe;j_C@+O|=btDP4Dn;__}sOK z1R{PC@NRX(nhvuuOt<1vqY$$*s)qlhn>kWHdeni7tK6TVl#SZ}z2y$8UJap}8uc-t zjc=^zu$89m!J2=aEU_QonNn|Bt>h~Y^tn>}3xvI=c8*>YKFPnl>U^I;v-=8B)(wk8 zos+DTC>qowoh^{lVMiX?ou-qtsDBcF|Ah3@x!U4ebAwDb_$fo3^8y_hOYSNCV9z}I zJ|V4wL)3>I@vdCWK2q6{(%o;_K~C^Xhr+7kT)X&`eZ-mtT2D6L+2QMvmfL2db1eg& zTS88OL#YHuhX_TW_yddOED;7dbb{T)%I6mp~FPqn(&)P@Mb!j*czM74$+fW)oZ%^8;)i+5pK!8l#EQ(fxoOe<__3&_UHZvJIQvCSkdYzjlTn$PxGaW$FaMY$LsWT{0>{F(p9F5-=$(V>|_)1 zUy+cUc{0bi_qmE7Z-B~?@F!BKl1OvKFG!u#j@}VKl$w4)k^G55< z!Gu1i-9|ry65O|9Parp#{SIXf^hGQ@`fi8^Y9&cXM_|NeM06AOovk^bm9M2d?MaH| z)-=I^uaADYiSvm`_4P;4Ej1Mni!!vP(-sCTAiSo(G8ivS(i?^?7{6T70b1b)(DN6fOo(y# z=bVm5G@Q>{tnvqlDB}}4T0f#y0PP_XQUb!mu+UJ&EjAvWmuQ`oSpNI~A(v=0@?6Bl z6J_22gtH-K|2zVHJ%lH<)qcKG_6p7-AQVxQFUGR6BI zs9&I1+t{-WB$%>l`XccGf;L)+Pc%f_jMjsXgER9*1}6utRK1LV{_5ALK!xCj>tKJs z59ze{6KEa1?ad=V75JKqLbP2|qMqO144$imJri1JYy0k2#QNU)Ti72x)+6}I16{5< zUWMf9pAnY&Lc7bA@kQ(|!1~J~d&8eI2@50}!Ss>1pYMIsXZE^v>Ej>!ay|HE1hLoF zcv_6Mg4?gkBS+EG9EOgsl6wWHJMi+@`Jif+GlZTXu$<;8I;kG!_EI^{6nPnwW-#xi zO+~%2B~ff!1w|txTPzGfbfNs)r%QUaw|hhaG1I;IohEra*{pRuW^{YPu(1gF@t*Jg zp#Hm401b)~1TTVHWBLt>%nDV{qOhZV;FwbsQzz^(vB@4iE>4WhFzf&bk?u!#v-#?6 zaMm!|tO!maZgrtr?w<(z58+^F!ax*F7w%yPg1*0X+vVJJT`|(Fnfh%DoKt3E(U( z&5}SR2+xs@PR>2S5*M~4QD^ZB%@jLUOZUBs61cZXMXtOBpmKK-Nbkf|s*GUvB^MhV zD~PYaMg7-jqWuTgr~xg7Ud{zcIa;@@R(YQlj&!j!Ho>}6V-D9^p3E*c@G>P7aFt{gI&C;r;Ri|Go9Pum+Z)jF(cl=5xp}>_6Bw(YkMA_(tLhEN&KPz3U z!pvT>t5h-}Jf`%BWrQwQT1j~iSU90%vS!7`UP}e> zIlJAdI0C+SZVne+N37ao7(_Vwzhj$;-oUwV7TL7ht#x_`uK*uPk)vAfM@T*zZjNHY zE|%z(kMM*R_i?63(2fkHIte_5!XN zNYJW^T6|Dt08XR4qg_7hEPqf}k z0Yfklr{?n2MT{#hU0)G8B>W8Z>Uz)P!&|Y1iY}ley6ZxLQZKw_P}$Y@Zb*aK?2V>> zj*A7^e#t>CJ+ij!GBZ2t|7VqC;G(VsC7avCnF1?gK?n>W?|P*#mn~fIs-TRAS@o@7 zyj7uWfO146HmSDZU-p&Ax19v2OI$AEX&x6S1Egv$ZgYV|lv?{6Xb7W=x*uvtC}r(6 zd(wX}A+S5+0n5+UV#~Tf<+}mvs0)EI zD5P%t`tBd_m~2|6@uZP4NsNcX>g;w-S=|O#2p&x8HY1Q1+e$N@TUHh+(a&A^4N1RO};Zlwk zQoo)e%+UIzG=Pgc`d0-5O$^>YQS)|3%|@p%m0x-lwNSfQYyx?~i`Wzfv7_WemTVn<%m+l@wdWoNQw2vQgVt=Q8?WLV_)Ep;9{7;v;&6tEeG9vn^IAS zm?N%OWzqq&*f<8_RrGXPv*XwVS}Fw6GR4kUq*S`GsXbUY(r%SGCQ^F7%rY+V-Q?2n zx%4kILz3?G%$rH@`w;d-X$IrsF$cb??IFU&C2>m0$853Ktcj4ZsGXq4`)eMF1xczD z=j2EgE8KQ1xzktppN?gYtLy0%EjBfM=2W`q4sqMwHkf57{5<}!B46ia3Z;bc|Q=T85bcHv7z1{P%w2fxclU1;wj$>2ifFia*jr3-WMX2j9hR;f^U zB)Zr=2dNj-Tu);^tFYUN`+cD3_rv=lq83VugabS96*@#N?kbbnp)_3QRR|h7K~}x@ z8B*_tLA$BcP;B`tu>*O9_}i!G=rjm+WTg7B>u)WaMK`6wKP_(C+Ub{u%O+IA<|_{j zN5`+mMis1+*_V*^N+RnOgo=yN45~`I{RoheZWIlYGAPZ-m6WYD{we7&uyXlRBeHu_ zMmaerKt?E%lw4BBQZwkJ_GWdlKc><)du@Z)QTl4;80PMnL_(GhFCYKzTzdjpm2bB4 z?I-#GS*sa&v?8l{6-N!d|C(0XA?wr0mUQT)Sd34gUfqls5|Ws}laRjP@VYsf%GR1{ zHp@~L7DssfVV7rA8u^1o{QY2TIszGuL~@Ue zo8fjrl151IM~|<;Z?^Me(S#EW)69n?KROaK3&^agqL^2Ub~UGva*33`R5L3#$?G6b*23i+>Xj$#ze` z{vT8sj+#qllnX|)KX2kgyH7@^54ehrp&mt!ZRQF+ahS7XA>K6p`lxy?*; z0&TrNk6WYh7W!S6ah`Beah>QvExk@%14-UUzsc#fvBT*nKlgzot=REwe( z($UMW5!1dVQ{x}DPaMNKKi8g@o;m>E$CeLx3y zw)o|7_&6;G!qOXcE1_oDdZ{_Vg%{MAT=}T#I!j687Vz1H6 zniAsK3|>?STd)3`Pk~>=!OIYTIr0yD5zna2(h-q@n?pJ{byb&0RO&;mdc(^ec|YXw znq5aNJV&gc&CUP59x%aN-BE4E`;HA=K0Fnzsu7Xk*w+2)<(^5_=5h|D7p@AY<-8F+UbT6`1s#xo%>nSZ>8Y`NVousnCSzSj2X-Wtn9z^df|( z$fN4;`LCWZO3%|uCVUApmb5dwr5kKC6rm*#c!nHhYcf&v3IAQ6;s&{Z%(y)%gJy_p zpp~m;Y^5@H6b@ONa`1-$YH2nr-`Kh5g=_?rhNN6-@Ud8)VacWaug~N&ze-I4ASi&Ac?1|h4)KWAnh9!;oz?uhU)#KPnnS#E!q+%8CGh>9fr*pBGWS<7moj>c~ zP!!~Ma=)$|Q8|#3PKYdLS5lVd?VUvpi!CvKm_3k|%kZPrjHQEHeJeiY>&z`@Hv#=A zZv8bWt9&K1j4|V1iOD}i$7PgY#M+wS!|=!_srFycu_UBOrTvygpHl-h<2uRsgyAcx z*O7;VXdEB&8dlwn@}KIPacCGBAF`V^PUyWR&k;d87aD_lfr5gR_Y$;yXkr0lD74=)&K>DaqRT zN#`Z@Mn1_dhAwKTiiRK|r_r;_nCi{iU1@q|e&R1h{`>(Eqi&R|if0 zeC~g)^jD_&zq3UCVVeSI(*LhpC4$t($grt7+t2*p(Ix-Dz?h{T&;Ik#e=hd_zr;Te i=l{Qn|Cg83r8h4s!4~nr>k$a>M@mdyv{d+w-~R!sOS1X^ literal 0 HcmV?d00001 diff --git a/docs/source/images/distrib_optimizer/sharding_scheme.png b/docs/source/images/distrib_optimizer/sharding_scheme.png new file mode 100644 index 0000000000000000000000000000000000000000..b07c25b05f9e2e7a2973caa296126c724da9f4ed GIT binary patch literal 99135 zcmeEtWl)^a(k3B5kf6cc3GVI=gAeZR4DOZy!6CT2ySuwH5MXc%?vS7XLfFau?p9r? z{k6ZgYO7Aw)SUCy@$S>z&(o2r%Ce})gvc;3FsSl!Qa~6OxG5MI*bqcmXiYSPa0dGG z)=ffQ0}&B%eMjXx3=BDpyp*_xx6yfzzJvLy$HTlG*$-Qo+@{$z``7z}^Ooh_ zjgI#AAKzi$V8HxG31a-#^_#}`KKg0Y-VNy)~ zj{%^6gXSACjfaILhxw0!^1<@I#uYQ+f-^Dw8b($8U;QKY4@wdI=NjH%;G@3L zNiW(#r2nt}|1JUH|M?}*=^@zJUXdI`9iFDCNsq&6A`MhK6CmyRm96hib!bo$-YZnx zK85UfZ^chKHfc@M?#<<64=PX;PPn@H6K>*9NZcPDZE>AirI!z*;O%o>H+$;Oc_o?Z@FM zD|K`i;Y>P*v)3gy_*C8rB%G4h32Sj=%$Pp?yWvw0$cx8L@DN*{7QN{DCHaiRlnhT@dK3QaM5evh|1 z?;Z+&I@H_&RCqWQo-f+9>mWvUnPG>^tq*8%Cj!-JyVn=Bst0ip3o zA6G8qD4U7jym8{oO;4-S_B4~|N(Fy3r{8cW07eW2@CPTp#e{DEAzWbxz;=sHSkNaM zoUDI1;EeZIqDKKH^CHuZMcK*+xx^>xH1r(?CbXn+BZUplD_0M`otw&Ki^Wwe5aZkv zirL&xzm2+EW|m#kU3r!yIJ$o*H?0DZ8Ll*G@z0y-bDTmK z`oCq^G0Di*}6&<){+$&^iKM}_Vd9i0l&sa%zD?D*o` z%!JK6x%7TJvFo*D$&AR@w<_Dz97ArmA(n*On?@ba#ScQun-$g~tdeVmdv-PB4~h^s z$wd6pRN_x_Kj-!|nJYYX#89D!;AUA?0YhFx?)(-}(K*$jEm!5p{U#BGxf%uR%WrRM zgvIiSE29cR)K%s726bz0&!N^gBPbjuiu1PEDD^FFl?-AZ{;*R zIurJFI@$B7B|8ufmN!X^OMCi@#o|mZHZm+O_U5#Iuon4`Q>^s#&{Yt2BhA_m#ST~$ z%YxMCtaZwlUm=l{#`;+)c!b}ElEy+Q*Jc9&zfhsVBoCWoS5VI5!kGbkzn*SkMec}^ z&H=k6B_cXD5h+rM%}_(HAex8*{n8XRsgZ4c(On29lAK$-Ap(uu)p4;<%(!NwVvU5SAJCdvYTlKV_nwV!}?sJqu3hKq7%jPdF@UE$27HVF>Z^yVgO_&f8 zPym_AvzSdfQBw1av}ZIcXC+dF&rV&Sg2dTxKlAmlMzF1}alC@10uZrM+S@a4t|^Ks zOC9Fq0qUZe9H~xnlWK2Rtjh5xJjg3Ixcf1SlHF5b#w%2cn@aSCn(gjFT13&4__Y!u z4Jxz%$@A=0W?&|*Ge;{9_jL~jIS7JWdc)beSxUVyTXz|tW!P(P?hF&erJl@DJ@ol8!4@5O9-)}PDSrp8rZ9_Kg#r7q+;m; z+ll0;sx-AIh55kJElFfn?H#hDkUjCSdBjA;i!yy8n(XcZvW_jJ>e{3oSBzyj=~h!35Z=LIE5*!x29%eS|z%B^8!9tg&`XM zlI3J&>-M@~bg;L%_!wW3-M=w8!;Z9K1PPYXx<9AM6x}qOKST*n^`f8Nxo^DOBvCbr zU6ERuNj$_xlS_1{-L$lPLOs};|4R77-F|>$B#L&+xK}Vg>8Fg%=b@9W(fOFttktvT zKrf%Oeg@eYj~4&L#&G@U%%?&=S|lD@9uWc26<3yzx&77B@^jIpvdTXm|BMzEI1|px zXI(hzm8t;G-JJsFNE6#bmNq8u3vfz@P(8Z!wN8GUgk(f3pGs*x#J>MH18<)b!0DQM zUSm^l`MWw^vWFsHfI$oCUSKm@8ztGGomJUu*`X3gx+7qPZiswV+0-TTPwomS@mYgw z^tx8NLtz5hMU50h=r*I-*?}qKBA)goo1WfPtwrP5U)CO_y(2ad%@H}m$;u_5P4t6? z=Dqgft#i}aJ3PE2zpAjXuuD4^(pr1VNDcNhZ$su{a~U<5(#Rjv)ps~JIhU8E;!(Pf3 zmH0_LBxON^UjxD-xpPp|ma@`33O(yfJ$$T|%AX>bTP@XbGmV!&swVMTFH-#kkG~+E z85G=yylCk2kS<~UfcyKbu*mnjO+3`z6E3U)$2|Ir+cTYCKs`>3o^HxssqZ~U@=y=~dN zAY+TCIq?)~s}Bh5ZpmleO)(-_8x-0$&g10tdA>17yv#COC;C;->G_txQR_OHIO!FM zSr=_swG~~AqkHN%;gn|!o}~-T?p`(@PNNK&bzvjX{q1-HgR*wH^d)j;)msE2EXIP*EbD|I^VK0{=l0Y zYMYw!vgj+tT%2|8YU)U$zPDuD$Bf|%>(&{>h9PVj78^N|Y+V(TZ<>nfUuHndvkYlb9;T~l>52zgunukGvSQ!`kFAGh^9m~DcQMW?F=W0&sPDz(ie#$dPX!^ zrzw9zj-Zk)as@3TI<8<+^h3_~7IrT8@~HZ(tw`LvM@Kty6|? zY@MU&sV~PLHWV_11vgx<$PIPwWuvk+3s+lXhhM&K!qM0}L87ViU$ss%5Z`%qnZU-) zMFRS?6FK4*Vy0U3BN3~owdBaAsk3#zcX|kUd8C(Dj)Xzzso0}EB2AqXWr+Y}%PeoE z@@$ab`kBl&v&T7@kHoPt2gfxF@bnR0TSK;Z7$3h8#ZC-skL{Kjbn#Jg`-^R$OJwEL z?(Y0y(EU9BBC}QV7gsBLm4~mnq?e$YNoB(Ajrriy=j-=pLl(XR8@r{uKB@MlUn~0D zvwmI4UECOMmVDIeSv`o&d0XQe4R5PO>V?Q$cQzy$f{bp`)IM;3#$=GJh=cEpUIpZ~ zHX*038e4N2Ezj_w7;PQai5~x5P|S%s&)*_P#OT0Xm>LVhPqD$*r)X?!Y9ZH0Hn7oR zA~Un^EPSXT@lRh~lhCt>-)T+ipg}8RJET!E)3si|jb4!axz|tlQ!6#hg%q^~QUw8$ zjfcaNor4}0H6|m1#G5b!%CZyB5wfD$!sL;ws<5Q(#!Gl^ekAL4E|@n20ed4RAJSyx zru91Ix!e(sii)2m!+zOn_Ub^oqrlDco^q$p2uEfS*(>yvQ&~XkF9tPcn#qLz6`#ag zR4K@Tl#=?k&HFrtsBLYQw`GaUc0XT|ugAWR7&H@4x+bI{{^RyZVx~ugR?Hq^y~wtL z_f#WK(5KyZtZ?yc;no}*DFj{8vKrV(ZCl@)hdn8FeXV{mJz%Bgf1%pdZe!zD{_U?H z;~yPT`gAmP{vmLWN%556jJD4yRjB!efa!%cxwAns?_nrN_56H5+9kM}D{KC;N{BqT zXMhm>khTWzsyEkG;KJ1X~lkFxPJ<_Keigox5|q zfaJIG7KeOpH_>Lt{Tr3=Zlzt+XvyhpbU(}9Di4W=fCE#R)=^RwnZ9z&RAWcHc-A<{ z7&Z<4fe_%(@?M%ejZ7>6#VOlLQ=Oi$e*{L}_f=r3TBc9`(cD^`Y8zpj4o#&CD$!ir z=LNY(w2HJX$M@{&eEv$?=pgVZ2%Dole0uF2ZIuK{v>Cc?j+Riw;MQRG7Otym*MK0} zVXEnjlFeLqyE^W9W6EOno=Q)EFeg)`P>SWIu0T3w(UUSiwNL!q!6h5b098OfpWXPo zYO99d7xg!6(;_uVHpeXK6-P_VBuaW#0ZpY~F2Q5KjZitO84xFyil88#&k9p-Lx9-i zQ+Gwoj|fdaXVXZE-~62F09^+*a&+Fm4!6lI9IdHoBa32H?V=oT@@e9DkJht~`GDhv zE^hTm$uD7X%i)R0{L@M;oA5TTGG|w3D1CrKs@Js0cH(`Q(Y=W~FL==6nYJ{nD3t~w zIRvT8WSYMrNm88`T#Y?dIQQ^*z$W!o@mT$2L0SETlC!yXlzgkMKy$037$C-3<#amtflTCsp;s&M$e9c&mD<<5gnkEGqeHnI$&{X*c|__hhS z+PwZzi0#fzb?1#kDQ`N#7|+>LdTpMM{CsDdMG0voXBfB)0#sdO;Tfk?;_}O|!qA^R zPs=9heB{JOS(h%|H5Dewh*;4}U*3?&v{Kzx(s!FOu~>m1s~z_Cp2v$&y`V&|nwbj3 zNoh^7ef*|fCVQXLsMJVgbvbz30>kxW{)kFD^AxfBy?q+0#EXJ#?f4n+byQ4WsXgc; zcFq($TPxC**i@-o9xuT!hY3Wd#p{q$jrcGp@Tc9wq23u9B zm=%pOzWbigkv6G^ay{zW@r7X=ShGXw?8Z;DICSCwJOlM&e>rp>tmAht(wn<4inWOL z)JKi}*pl2+tqEtZ=_fp^SnFNlM%76I@>63qTypXGT>&_|?OwO#CGy|aDhz9rIFVqn zq8TSsu{o$&Cj97>Yg60Gin~z*2FC)I4u?lH#yw=)pzx2c{8SFFgX|k%kT`pyGDSvN-pahh z)Ln<^Emdm4wQNaoJSH#7juOpUaq%7|K#eKSR2-5;uSxb*b0iG8qnf^fEenfKCq}Hi;>b~3K$aUzw`#W?%ARhzY|wNr(|#>2+)jg+7= zA=q<*Zo~|!<>JRT>w#8o^ieZ$X>IGp(ZnkZWT;P=0=JbeRw+o*^H?$@=I|s`q`{J? zxbEI3Y$OmvZL^5Ob&UDG3e0_Byu}j;FMQ$KQxbMRCp*UCGGM_lBfL0WUPYH0T>>;E zV*djt{cFEsW5O2sPFcimaiSr?RX9#B*b#fWc>XZYTq(tb0cVzhUJ&b77@g@IEZ$VH zp=FaL$4emUTBPcppgf~w4D~m0gjCn&cR?1T-FOt?`PVWfax=|e9XBiM&^4X;XA_vp z8F3AIh|xbzj4aSsjctZGin=sd46;O;CEN%H783Hm{rTR7Dsgfc6dYb;I&%XA7}nM4 zc}~!(``sICXqkTENJ10#El0&#m9n`h--Mw3`CiyiZ%IY2Dc1p?A69fCkFQ{*08)hg z=?(c1yWldwI=+N#Iw|s4cGg&~Px(|MHc@N1 zVtNhRkBBP?L~()LItvj zy$gY?{6ZbEEN2WXlVrFnWOwWWWKK!Igm+J8yb&zsW>31EE8M{5ro~ zlEO2|=*wS;e?teG^uu)CMXOV92Ej^O&{9p5Yi_?zr@=N&hvVT~MS5rC}+QHa19lHCuu@j3{foTfwklhZQjhQCu%nCjBQ_ttg^?bfFL5Bek40C(F zR;W{A_I_SIe~EqmL*J=y6Y9ZkK^)vFy!pq5i|Ai~rp0+|v@tVJ+=ekp+~bs<^nG2WN-lT3A<}?bile5B<_)W4tE~Nc=isGX;c4FSaRju5bK+URrSvLomHXB za~&oDk2^jmL3&!TV#8Ws88H@n?$}I87>G%)M$4Gqt7@^FP@wJS#7_6FSia{fPrbW; zr6tucT8E#l_aqRh)nR3nHgn=hvEk1nN$T~_e}u4W4O$?KE*!=$$ex`QA<2J&o@R zItONBr9ijmOI(Ftl{6t5VejVQVv*DrOY2$D zz}HPcAWw#3Nn#`%*sA$lPzSMez^P1o!;`b?B8(hT;%cwute;6YD}i&-x?FqC;nv8f z|16JXO*bKrde4u#x^DJE@A>`O(}o$#xe4aNEoWxz)A4X)RM%Wa<{R$xkhOV;<%InL z6;*n1z0mwkz-7@lSW$%?O3p~qbl6X2usvVLvM zzinBhk?Ut|6l+sHSuw3Q%nm&o3A*)l2-Wh~dqeVDHc)kpDm!33W@MjiVmE&3Q<;Oh zP-{3uhcB6M6&rX+tw8qi2CufP;OyO8!Rp&PnW8y|46eG(neWJ9fo~?A?6|>IDYpiV zaR-#L#t2_DPGBj5>0^hm}c;n|4{j^}z6I;G5!eE;Dk9&_GzFzdOZU^ahg?9Nd6T&IzR|yy z{5oHD?NqN=$0`nfc90sv;ewxPRGvJLnVaf>^P9jdx_8{585M#YuqO;^5F0QmzL?Y< z0o;~X)#&NOxTg4i{76`N6tu@LD@iN^I)-w!b={8Lq|LwA$^;TEV3ok%!C}$Aw1$Tf zG_$cFaC~I4>=syFF&BRL>b9JjX}K?@FUJ{5d^thlx*Qq~3pgw{7KE}+IpLU!23NcxpUXwZKmdhg(3j>81u!bmdVS+IlQU#wIlCD zdCZQjT7Jl-So~@A)qZkcHOld&&*^;CAP{RowR?>BnCLvgiaSN2q?f`iRqq?gSMPIM z;?PX;?#G9hl-<72(@PH=GyBfK{mNmJ5{n&gL z^HJM{AK?_S!{T)3Mgc7CmesfKpzUdJCC0-C)rl6psbUE2HyRyXV@$r9GcPQJ)+#S)J#oeJVM-oAb?lsH+d2j$zpc}Eg8Je zd2{+y5ElGQ0vEV>-I3$8n3Diq=aw-fJ2X@#nI~H?OA3?2_J84&?_wrcXKXL$c4BKD zYGl^D+B}(sft`;jt;PdZEVH+cU74m}1s1Np?p|qYQ?iAMZ=(TzCoyl%m`#|UElLx1 zOKIN73ie+s`I?vu8q3#GC_8Z(?q*#K_0-&@jC4fQ^{L(}=ZXgY_|r8fwbfo!oU@ze zy~I?ebCWpZY^=MzwpkYJx25g8>gGvINk~G&UbwDBUz_bK`REpZxO`uF$!&AP%x%7Q zdMVZH7H7rfYuj6`F+H45sEJ(3GHB73;4y9%J?3DvYU6k_E>Z9M+=_z`zHe~du)sSD zJdfyAPrV*{?r&IGN&Xti<-X~=WnzS2g{#fm>h*yBE=yaW+^YLWEoBbLT|FsZC*ZyJ zKhTUPB*npSdPm(IKkwq|drUhC-W=E8x3`QUYukne@t+EPSWiAW^kzmHNt6Zc*!yv3 zgs`5!?esl0bDqZ-j(wfb95zQBK9-R!=MFXSexDE{QOrxMa&-J^Av;GYXY}322;&Rh zlY3>7H9ByDg*DmX!Jwy_A;(VSRq?LPC831DSU#3$QH-Lk?juia{Zy&%_yc~ZAq4Y! z)3Eck?xU4+_o{nl$@lpgwOJb<8s-tmk~T*)+lP5d#Yg0*KhtHb<0A~Yj){N>uRG~< zix|STatp&)MD94g7pVm}k-2%|GK#m`4~7 z6-hQtINF_|;*ez^5@>3WQ^2uW*!0k_0H}@|Hcy{Ag`#d>m+z z7LB;7srvH#9lhusSdxVH*AG)$d)P&~m9rSZ#+^k@zagR+8qN-2ic|CY-!M$mT553* z-hn}b>HUo6m4_XUQ4_1R>tbV$&q-m(kqnt+M}tvSSyUq4NGGOHEnC=!glq*C5ZkT= zwt$UYdeR|NjOR1b6jsSy%Dgl}adusRAK5|z&2>Sk?CuHey!53ae|dXsb;I)%OBxr` z$K-9UN7kC?uUfMcVb>X(~oZC7Tc=8TLkooLJ$LqluM^AKu0mrvUD-U&FqE;q)kdy(ykvLWU#72 zy?NUzv?>P-AeaI>dr`WMmsLNlNaO?IX0v2*eaRx+zglHF12xZREJ{WzlCqv1;B z+taV-g52bfdXC>5JJEg#oFmbkvp_!B9(SU8Jn>(APaQ8F@=TVyOWnMQ6geZ(%`In% zAch72><|@42dO`BrbU*=`>_H#B!+g@IhTk{M@vumqRT-Bv4pSl0x%@_gSWDo_mPD& z1*e}IQ~8EJjB(evV|6bz{G5Aa4rt`g`q~}lnb{PsiB_=1Cf)-{(h|>|3lB{9assS4 zccy1)ei$eu%LuMEj%uNuraVU0+yQjy*h-XedL~b?>(&VPQRiov?e)phL2QuQ2+>j+ z2T<1B!=@@n9dyn|sQ_IsAKe6!?@~aE;{4vh`FV=U^z3@7EqKHn(7mcs9%DH@(vRwT z%Wt5VrYA(CPR8u!?G!Qg0Ynl0O&yi+tGytDk20aeg1l4`zi86T==-yslMVfgbup#B z3#NGG9w4x;@*YII=`~4@W9}2Ur22BHwR@6J19GREB&HM22>Y62w(l;Y5?+U_O`P5s z%n|&2k?JW{*Ckvci$Ny^Wk4tj2O;UElR`l~14tRB$fx*84SmT&nKs=DL@MsNY#bYH z6daFskyDuCo>a_7j0i>^>7FMh{xVeSbxPie7Cb$hB=c7S+( ztow0*ID9u9_Tdffnnb6zI(mg)x^OODw?OPF02p1zu+9?M1w*%|$hWp2KVdm3R4};? z$wD)p-qfxw{MJynW>v`-P&U0q@Y<7x_N)0NNj~NCfPl3}QyKeU*w@%`k$J}=9Y`u; zOx|8Sdws1b@7OLHBiW8P)QsDO`$za=rc;Y`jmLXMXAzD90ov+Snz55fL z)IR5sk(t#*S*$ceb!iAvW0$&$5dk4Epukt5;Tn1@)~nR@q#B)j$5)lYu(Z}zWf?GM z#FX)#2t|solr^B4!0d_#5URGyU_}G$!Bets`Li+w*rGMuC8`0?`dkqi--qv2R7Y?Q-YQKIE%jN`)7@Regz@j{8PLEz)=Jv(TiQgf0@uhUMKKC8sv63&w^quH+a*2b}M(;tQ( zY2HzEKJP-NNK^ZW=azXed1zk5*40c(g$_)rfonRy26Zv(+eyWhe6MsXYzQ5da{o4U5LC;TRt@KMZ7yw5s-6n>7&M?$9r-sj1gt&GD~}cjOizt`eng(b9g^m zddf$TxwHf=-mtQgYw$|r6GPn*?@m*#|E6z8A*f37T^GlJ4+koV9|Ie+O@jk>=bePi zHijIvPsyRjt?od3Scfy8&=Vp#=3f9x5m9kwkeZ0ou(o!_uBqKuR^^foO0LLgbDSDM z6?}AMAwb=J{xOFF{xl(t%eZ)cps!4*z3iLMYHzt90_UT+FfpA})c<=jajEa}eRUc8 zt?W$hepBhom2yw|%-mrMaoMAQBF7k*+5D>Ip*o&EGwZ~H1TVL8xZRt!=CL5zCb-h= zOJJVb>sU4C8c37#Wbnp_JpZR_(SFI`7tYX_wI)HoR0V5h^cS1O#)|CbqSMv68{e@y zUBk4f<$D?@%9{I?+1!H#AHjDKmniERb3kw z={#<+GrF7$QU#pdjOiFX=ZbO5DJxEuib0D;GP2nG82!_o`=A0~uEq`gf`k3SfR{#V zK+xwU+K$+KQ3Ix0{_nHpe#@`kKiXQu5^4Li9y^7kKO-wSu$A-HDw5N^x+s;m6=O8V zo_}sr_x)Zbso&D=t2?8!NJ-yDTBZKMALTJy9uhKXusdU;KaQ5CV57Y>A@7IeVash# znU|*?Jv>)rFluKJxmHkZRpi#x(LL8+%u2dYJhv9_-nv8xiReY0c2=d%StyD5A;^vI z^h#oP>e1-4gA9hJdOTKJ`W-E|^-H3*T8Rv?q!Gaz`mS^& z#NON-~NQZX5+o*@1;vrdhczce_a!xwEuWA zaCtcCFZJXPhs%Sm(uT{`6`!TMdJ*^A**sU1ldY?}w}*}--y_i1zrL-ffrBg&?n|mx zTC)umqhD#Hb9+Zq@mw?$i#!PhOJjX!i&uHc!*6dVd(BAT?tq$s71yy8>*)78FiXPL zd`DFA!_J?X!&3>V)V}VP+5n{h1;G#oQpf3vL?|@8Hvoh8T!kr*r0n8X(YP+Z-|ZHw*9bv)CeN|||Fym*U?4zd+2OKr*78}b0@mXGna zwK{2>WYaHkRz2S<#{Bg4=Zv=3x3Uanm_BXPszjJVF2dsNTg~&3tQaO%CnBs?(4fpP zlaK`3uRo75d|r^O+;Mx;Fpia`)WeLgu|gVse=Al8pgrOEP4 zECC1^2iKCMBc)vCT-BUDUhYV^F=lo7(sc3(Z>(R|No9qH%*S}FQN!F!?sS4)$V^du zx+_|wNClS1jcNv@=)K;7>J9go_{j8TR;k~m-JI?7eq2@bt1Cr= zEXXuRb#OyJ9S?78eLC5Y-tp^`Q-NxK8Ihrd58#mcW)T95fJ?x6;_PPHZ+EGJj;FgN zY$XDq?J?u^8S;;_TDnJXOON2c9>u>+)BFLxU&qxG3fRmGOext;v>kL;Q54*vYGjjM z(vPlApsyNs5y^{u;ALyGvg1cLpXo4F7MQTgqR$V?dXuzu$i}d$n+`Y5mn+`j_NL-H zM+kf+LX0k}9X&ktp0@Ih>LIaadQQne%!!JhjZ0RG{X0L2Wh+x717e41bPYg56dQLo z3bP|g?p2M=kVt9hx)Nr`ivFC8sy~YUVQQiHH-7abnwX9A&Xh$PeDn`PGFlhmWnbIg zaVP);vb<@oh5;6pmx#=Y3@t@=6m=7W*s@((w~su?j2RR+q48G>US?&V>ZNASZUT~m z%MrPLe@NfP|J@!7wN2lkB+WN<4Zb(a+$#dKgJhg2Q8yp|5ONAoS)tQ!E;Il5_PA=M zR6^)rV^aM2wjre{&rc=u=GX}fHr*XA@lg}oVz;zuH)bOoUOhY>B>qbIj3m3`n3iBGx&a`<}2Dw17 zp~I>jMa>nCn8|x-{QRM;P}gAP><=nCsdZBz_@X@bk;k4t&YcVnm6#7R%F0|X2Ykb( zI=Qp@Id(fLsrE#)xH*-Dx4ugLhN65q*!W<$$^jwFE7hpe!LR{Qv1gn6nHk@(oM-=p z$Hg3hytuehuJp6QEgX;CWsC6I06AqXMr-(uKOk+Dp;bFDA8NiAC5}O8oRg=I!gotl zvsEF%TMGE*P3{$$5R*nq$oNldf??jKW*QBkc%5-I=ltSbOiQ&+C8cnLmHqG0;5i1| zstDjvNB;&uv%K_9-SvEgUnsULaShvP?ekPeR0Po^M+hue2?yy_-gxb(yx@3!6SWY9 zRZVV%5u|hRY*de$#!#vIH7a~@njS_X-g-yPB5loe&uA%F=Q4ka6{%PhfrD>jiTs`B0{LIbf%18Sau;i zwF0Yj`dF#B;AZr?CDYU{Jxx-q{YWsVDcUlL4)}f_$ZpYVD-{>B$$Y7<>v$k z_$_+yMB=yUYn+bJa%rXMByWx!aPa{sWmPz8%t#Bt-NGq?yE_5IIoIpPp$gva;1ukxsiE^i`2-6YUI>?k#;(xqxW;a_{0pX+Qx3UP$8Vj=m-EWm zUrS@R?JI)_V!x%g1bIWXM+K#Gcz2$9iU~FDU;ESn@dv?c8|j3f*&O6fYhu-x z6K{!W)SPTWUwAHY5;u?iW;(qus%r?d{HXAOw6}OM_#!ijLNxn(1v;}fLMHQ81vTs2 zB29qQ)s#6!Gi4u*g&Gn|$j`TX+=e8#+O1vf${NJhtfzR+qw zP^*C+7W-jGRwiHisD%Dm(#>s^czP-!A(R}7Ea5@9_?@4>kVBuZI+!5GuEl&wYC%u= z6Ubv&=WyCg%lq|E5AQ=2yvEZ}T2_WoyLAd7Qsb{;+TYLi-EdXR+OuK0EyeSxlXmtW_T!9m;yS^R(#B7Lg+wyl;3b%o86yhj(VT& zysl^(Q&K&Iz<@|?mDm#XZB{`^tw{1z)=IzN93gsaLx*4HH3Ydj7&Z$?qxqkp)B4RutpY#~c(6Ws47dpJHd>Ht-k0(56|hXu=#F579o)!?D1 zjK4=Nc0FNKwDly~#11`!T+__K<4)|GbZ&S0JEgcum(bsUaTIjm0?%AAkVE<}=wMK8 z?vE&d&={|YN`GEq5*%bmOgVQ;Wq9Ec8B|`Sn10br=+B=PLqjU0O_E+PvMV>!i&P4* zJSgAXiVjWz+1Cl@>R}!8=nqVjsw~bJf?}08JDFO@XGQTCG zMScSfmvLd_4~d=qUXzgWu*@ve6_O075|Ow`EG!5)tQz6BHD29{4nI*u_4|`PG-e2u znJUa<#13~O9#&hvD!nitAMRXlBIxCvhh79DLJ}i5=%p}(J8pl_YwGFiY3OWbZNrLfD-z!F0{Tqn?*TSX* zotv_{BUR16RYN~vQ-WFO){@3ciAQm4|{~$4);rL$y{l6CIVqj3*GF^#mp`!HfEiPEl&Xhajp{(+M zT?!2Bf)q53hwFf69?BT__m(|sXlL2(6wuT7*QLO~l}5raP1{a_igEt4WmF#88Q+Nn z&%ZAP8ol}eP0jf~>~cTR9yZ!PH~9nY54=x5@jm@I`XpjXgCYFSeiSqNFE1~PHPcgn znVKpqC{yJ?=ei*OeN6OE13BVChG#M1Kc~V4+hY{B#|P7a0Gm$ZhZ>Iw|6fyqhfYO3 z0W3%TpDhM&p)JcW1Zw|w4u3aX<8SH`&T5R*e;oxkw8iKD5A%PqY2Ar%R!e9s`)A&6 zxkO%AZyCD)4o89t=5ZjQiVh>QJ^3M=lBd)~hwl2>QT8U}=^BR%64JbA%hTVt&h~S3 zbknDQaFRf;0=!+VthTN$F75nvHfFC;7wVOajqM>V1nZw!NVl?m$G{-s$#@pDC2Q6N zIH{$3oHu3qC{ZwuF8bo18rra2-wb2yGn6D}Z3`{UyB+uwYPsF~NJtS59sPN4)-Uw_ zGWatQRJJ!&CGh`BRyrZ9fsLb2fW7tW_g}XLm-g9Eaw*yQl|v2dEIi9XO%|a$m16l; zO_?LBfG-s_V>B|ewtb1a5bt6 z%CJY~-Jxda&WkPOkH&0f?oP9)kP4v7&S?nA0vx(6H-`6Hp?^xG7zKvj#y%rKhgYTe zPv4gT8z%Wz6>E+Wjq4(70A4#dEh@Jjzt+&{9tIQ65hX~KA?e4|Wb32<*C)uPR(PQP zCYgOFtU_qFgRy6*+0I1Dg+d%1aWp(lO4c31X^G_F9pR!x1AR)ERnG;6_xGQgYyCz| zdQ7b)*>*oA$At1HZ5(fnux}Q*@{p2ZGg$T&R78B)2?ydiKz|ja{#UA=zbeHWMgAOTq_Ay}VFuOitMzkBz#6bI z=$k-V>Tnd|^60J77hf=Cgg@HRrYTG;Id z?7brs?sah2wtE|wEAspJ#L}E}-OpMj`oOmn|G~GrK{2r4*Po9F!lR|kaN9b6b%c2n zLR8RSA-}_lu;74HVbnph%n-Mqm)?p`TFyJ9ca^kK!D5LtxH@d4fVR11S%HJE~ zmqOE5oeb||z?eQ+$|nD+IdjHdii&%Pzazo*R~cfvUcSD3KJzp=iW&t0T^o)IhQnHI1yeZAD%{u8AsE*_eJ z0J7J{R9qt=y9Cv-5+sQcg=SB#dt^H<98zpd?>^1+W@ZWlVs8#9fZBX;O;oeGTd4*1x!2@PRpZB+8!L3d1ZjT z9mT;p(LI{%bP`p5czA{sL|VHMho`{HiuigYUw2@e<5nXo8_s;xQe~U0r|c^R6wiQ$a;C;j zhc$HNhy(p0HV&J;yglzwk(prplo_Wx!2;c<0l}>5@)~%zr zkHy#$Xy&6^(yFAr;#fx3qLfrwl^h(>a}iWc(D~CPV~?(-a(>+JgKc_xs^xoh2P8RW zMwWlP>OtB`{ds>*^40?3(!-&-A`EZW^cGgxI4%rBwaVYI8io_0L}!HqUn4q{AGEB~ zm9=bK&gxm=?l{}Dc{PS)uaxx9ZRHhY(VwUis1qATM=d18Reo%p7N&YH4mM-^JLVU8 zeVFx1J<@uBY*nOI4uRpphwKd#9l94e3q9PMzNqpfh9zW-p1RiBUhgcX60*r0StEN4 zyBHh0i!SvmY&`?pJ1SROpMl}J-7>+U*tzUMF1>vX6}TA7eiNjQHMLdxHK(Bm?q1Rg zxAC}#Vv*`hib>B=p#Kn>eDLruyW3=pPk{N-K3M zx3Q@fW!^?K;_CHGfa!ez@T-o;;>62rHGF1;ONY(ccfMYoJq|#_Snel|7Av)Npk@o3 zl84~d>WYX)q;a&P?RSGCg6jP3sx>r#raiqk=3P2tno(ahU z@qE%~hPk%-k__RaaKST1g=mGU{a*zXpIWZ`%1IPV>`XaCLPKg&9y3&JtO6|+*!gEG z7C_7V^LL^f>v-Ufg`toT$wsmhn$j$2#!W9DuOMezIT1ljT__YdU*E4<+1}b%As?&R z@>@m?eq&Zp0t$s$zboD%@_FUq<()XHq9!lS6?Jjx?&|I46XoUYt8Z_luVgkV6IAdN zS^w5}nAX83ykewpq^D=JUYO?btx;ig#kLm~H!hdc;{3|7nu&#pr3S=)@O`i))JJ*d zWZ{Kb_Y7n>d@|F>fwsFfxG&Lkb92MUa2GvvVfiF1`r7w<_VB@4s=Bd-HArMb*DyWu z3#FR^5ySHk>Roy6XDd_EtE=co2E30-;2ydud1@&dE-xBR z;(XomHT<{>d72oIYrXWM0fXw4y`WvgVvT;O3>hhKQK-IRF0F$5o!{>9o5x?}mlp~j zQNge&OEe6Ej6_}66C6MmzHBXtcC>wGGr?BLJ4;g+A47$#U*n^L7S;DCJ&%|-U%%O{ zmb6d4v*UtaxSabon;GnBs|iYRm#ygtJ)t^SY~A}UW`~WJR-&cWG4s$|kYY+m_Q21A zqaKhQNpOt+vA^vKfl0B2IQGR@C*52&=3#B%*Z<}MApU*EO@iUV>e+0+`-)#}un8={ zjrClBTMm3Yqwv;~%&gkj#~cjeL(2%|i1)}Br<*>Dv?T~EWG*D5PPx-4F)2#oDT$J+ z;5k?4_*yUF%=?8yFln}7_gW8ZwzhKpL3QQ1)H3XwEtALOzQ)|!tmY}l@4n(c7sXWV ztqh>po~{&fh2rwIwzcu_F4_Lc(ZxFOi)k6lbv;_JK1%$ck}1VtL^UwD&wJ*C%w*S^ z1y1nF8p!NUO`K*PnWGafdqU*5-otnUcYNtv`~Wicj14HX^yN zUHAU8>_r2Qhc06;AT|5Saq^EmC|Smh+ZH!T3wR!&qDL&FLYMN)+#m$1woMucFP}@R z<}Q32OF(9t3Gey+OV-@XdyjfHq#(PmCvGqijuz5+@Fv!#QM9ED-%SE!rtmiVg*LS| zhiGw2MI7>%>D5N3uS*TI7YTls;+6)IAHZAoeGR2;t~#u#h&^5T)jsP&Kk(FizV7$C zKGkiSte&l|h8Gs)W=)lf@*k!hG$)x!vB6yY46&``F2ZX0{0MvKGy6GDs96wp-4!iRE|-A zS#w3Gk1dk`y(ge78v{(x)MSx{KbGS)aa1M|4)g^8wC{5bHL@@YcI$qepyh+UnB^EttK>pNt6 zF&$wQB8*VKY17GX8H9;34H1BNsn&jV&(89OFA50vBjkbvx)TXiRk`QgLxZ*GiGBSw zz>VJlkB(JpPVBS#Adfs8fHO4fL?s0vPxF`)w$~8aD z@{=f8N+cUgl=txz`{<4Q@SQ_Y?@1dr zLeK9D3T3lOLCLC~`!lPW!jnnMjE8pX>+a5*!Y}yS{X@R&wK>A0eC(?(M_~s%Ws|76 z6-tbou-W6m&{f^4`!Wej9-!S$58Zd7Q3+ZFf=FXl ze5S1}C&bJdsjx{N5E_HBrW9~Qex-9Z3=Vn7=-1bC-i=PINp4~EN3)w8=S4yG?i_(F zK2Q3zs|QlJHjHFZFsVYFH}{!Y3YZsy$F79 z7`iu%@$zmbd16yjnU?bztrcTC3aZ$bL@4~>#wP_x>)E5^Ky&gNV>q9P_7u$_ohq=g{tUPj3>7PcP~24EG|5le>B+#7(YoNUD$cjza=V;%y`yo^(pp z_2AO@J{ebxqpp`*FCGdFbrFyEec%3Nr7|$0J+Em>m&WVv`t+u-=v(YIA-uiiRv4+G zJUmN&dBFr~mk+m4{?}5TFg(m5d-yLVb=uqPO#L%c^X5!$_0ZswCg4TH;TM=G2D2`- z5G^V*c;TPTW%9}5X_aH|Otf!JDnUwWqU(*<`}}fA@VjR>D09qJaaOe0F0QKXi$ych zGD*Ejzq2n2nyNeZhksPtZuH;(Tr%R&=>g^-Z9yw=puN6q4|M8EAk7DRe9HPCWB{3J zmE2#xC24Cc%q}j^Eoqw;KP`_{Nr77|}E}=I%H4j+a^% zy~-*m_O|gNPmyfE;->Gy8!>V!lf_SwrZf?%BOpJ{t~H7IHs$0%{HyM#q~_(8G>Vx! zFbAJ}nOc@QWjgS+C-8tvX)r&!WX^iil2Wl@Zu&mpm)R_`oRhLUg#-m`^;?Cj9>to1 zkZlqARB;K}_^C?$DSz<;Yikv_@nbF^)j^WxLX*craZx0?J_cnaFnSYFyK4ypNl4+& z`@&2+*MJv<@-bIE-eHb<3CRr+O z49MelMD%sdtpu4%>JtMkc`g^#N(w5i^BFwrMHc&6d!H2N8q_6PGrdB}I2+~fVm`6< zwg^nEND}#!c$BZx@jWF?nVY+B^)D^~jXIo+erOBeGl$n0~vopZLrcY?d4o#-^PzGemb6(85nb zl$3YEYSwQ<6}qRRig7O`zLfRb3tpv&GgmQYP8iDdBkpm^^T4DY>7No6rIjADQa(i; z5KKznvWGpPm!||z0Sn(YrESefn5lLv;8r1mq~l>t)p(EmvRX#%7KI9hj}oE+?s;3j zw($%sOFPgtH7?jww4cznx&7oOr&>6wK~N>Jb>BRes-abN@!^m~L^oQg%IpL^ZKuHo zO(7U7MQZj0HlQ<;J`sj0i3@iG_ULLuyk`$>FKB-;yGy^`Xk!6}3C+=@LOO`SZt8uim7c5;8VcGa9s|+0h;IB$ZUv7aWi#jgYh=bqNgKDZ zLNpuRl75sMh%oUVFb5E8p7$_<$BrEn41$w}xS77!$)t=e=iJvh#U7(P@9pZyR>MgI zXbCf$>fe>_M$jlnYgf0^8RwdO{rr3^zwkLK^Kp!AChzZmUQNHRF)gIS^FgLw27y0OGcn;*9>1$}jmW(VUa7l!wrUBBbikeS`r`CI00wWupG)IT> zXDJ`I%Y|5wao$wrr}TDwa%0?diznuxZ>zBV@#Jc1ESi#PB`<6Zm!X6Ic6bVw`egqm z9>^euipRi_*Qq2T`W+2JZ*HLk zj!lUTE;-*@v&$R?{?9Fl*~hM?PF}L_)A7C;IBNt^hH-ESdxrDWsKOekWc4-hsGUzK zeJ}aWSk?gtfuUY$HnBT8u%v9_9uxyv``s*GYo|V*x#eOmt!TdYVf!hYq}W1bb>-|s zJM~1eZGf}C9@_Jr!TZcV1mj}t*A-J|Ng6N1+;4|LolmhF=5bpzm4YP(u4LUv< z5(SZY1Nq?M1)J$2kP%^t4JEONUfcfO*2eY5nPUZ|bshmMfwjK{15S9$)9=0&y@an( zWC_BFTKFvvNl!z~pAt7K@&=6aR` zeer|vQNolqnfrkrS8{t-2`M*v$}-1_`>G_6?ocw{{5(3Ty1tU+>7&fWkQ(p%$s>=h zpz?ilVgmoJS?p6zO&Rul=IVdGT7`X+#q}zY`p;zQEJaijR=jz$l6~;QLoe^ zvNV(vBdB$?IXk<;bgvKh9@vLy$wNN#P&g@D2p!ab&9Gr!W~`Dv-2!)zEqFw)s~;j0 z*%XH5ve3Z=%6=JRuj<<`#z|4&DpsL{SOF1{1o#Q)b>5(l|1DUOO6ZZ}ZA*igEui}b zY&w!Mr;G9ey}xsT{Et2bm+6e#iUk)h7O)BP1j5z-iVZx8aFR8CEAEcjQXI!K`swN9VZDQF+6J^fiLN> zqkS$SXF3ky$(MO76G#dEZ?Hb8^?J&!wu^J!#MCxE?KQ~XjV5U#=2@HyfHlIQVmc%c zvwJ(r{nFV`&w`*0nh&D&>!bllk`%+{I=M{Pd`CU4Qv9QF0vW{O=u{nVxb+eP1{LF` zYIR$gWmDq!C2!%YO@EMyO=Q6LFB55J99D~Lt6e6UtkFs2Xw(v#2-C-m2&Zy4UrZjG z=YFpa5B+M(n-*2)k$F{jv-zPa=W{L`<>9}b=+5l6>8Sm1iVP_#h#yI_v&rK< zU!#SKI2gg7qCnvCJ2quJo9HiDmL3Z7@o?nvF#K^hJQqJhjPSpN)7kgvK~u?SXDA-p|_opK38bKgla_eaF}T`2YW{@A!w}Zu<)Co_N-wsH9exTRc|omalcz zAPLR?9!B`90k3+$*!U6*PQgBx@vLt|6?>+bUp@vSMgD(R8S}qdZqWz=a0(4Loij~~ z2yi)B_PW>|lU6}%T;haC^S<4~dtAfow`hB~T0p|%ZFUm>+QF9GPL}2wIkIC+LKN}) zy7jsRyOe&i$q5oGtmnGz$xHpnbW)J@qKJPH?Y&Zu0B%O zT^}#O7W(AtibIsOJ#wPagH1+{=W)7y%!Kk$B&(P1HH=p^WnR{m{rFd^k?k0*r%)xK z5A$UY^Hd3<#1@s1yLb696?*i$q6u6W7#KDk1PK4=%~vq9DDBkh9=xvqu*u-B#=fGK z0`Hwb{6~aqPX)j7zdwdbL6FsTXE?>I)CTR<^-E*WD@X)LT2uY$aqr63`;TDf=Zmlk z{*#};jsqU$=4HSIzmlE?w&-{hJ{OVvn=ktJZ#xd~=%ed>XZx&q|M|E;secP!wSeCL zTub)Cqs689(T+M|SQR_PTe^{^HD0`{OAeJhwO<)qTeb@Gdbt5*7%pHGwK`h5TO`P( ztG&V~SIyFSj(*4>R3*6Q{GDu_`cU5b8h%iWU#!2g?xOuU3cMlrd55AX|Ta&Ts5AlgOYG$r=WN%8HreUWiqXhQ?-_W^PPqA}1*q8u^zr+b}_v}48D*=Cz$pJTLG6fL8pBqryixY4|x`a%@c zf5)IGVAAY6`HWD=duY^1F>3R;89U&>zVXo%UIqWGulu5UAJxocj_uNne=1ytgLpbc zdcr>dZ7112Zi62sqaP)f3BjJIlh1s&5yhuFd`}=ba>tP9z`{D;_B?Q0>?2G2U3lx+ zs{1gd+k$2x)Tb$&KCyQXtRwMndjYskDlBw+Htfp%WR`f`?;rB& z!2_IBVpaBWQiz!S^un9N9UP!7=0R*rn~RFb8=rf!U|+uBq+r7m_!= z<=K%f;Ernr^dn1jLo0CZ{SD`w*_d$U$`P_uWX8@Oi<_IcQPna&*Eoi3&qrI5k3Y_& zOnQF$#fkp2f99ZGXfpqqlkkE!c%$N zF9ww|kCUVEt{1`{=nCYkG+_($yF3CY8XxF3z+0vN3@2Y-N$Mq|lg42yDaba)2pKo5`?3n=>JOPJd7v-9kf8XyjB9!l zu%Qk(ChS<~WQnzj)*kgb068rcaR=i$mgse^l7O>l$w>0~;&G+f2rY}rl4<_QbJUfH zE(n5`%}kF5*<-HnZhnBv2%~#GCCWEo6IBK)?KVtx1cjQX?apoLlwpoAldqH%5c5-D zp85L~(0tOO#5)M5QQh|4k#lHwSH8!)U*$r`A1gddd_>|OexCgU_a4H8PIO#ADmF0V z++Soy5A`MW^e2w`#Lp@1x7hYDvCLsmq!0+^n&rruoA=lHYt@cl1@lTi1e@54r{z8G zHk|%wS-C)Ms&LZcLKm@d=M~``lxO;AlnCkmqYxBHTfRVU9O4wtU;!=gTyo>z0sgNi zin6Pe2lOKrb7qrZ_d8o`nKG09m^YQON>Tyz2%a}n{|L$YT6*4s1x!as6fTB(Vf4iM zp;}`{f*;zQbcGp7!uMM)N7|s#pg6dK^}z5S@mg>u{ZsB#muFSs$Np3fee6S> zXe86(nYJc53KEHXhH(SGq$=dzAM<%U$`fjY^=1jkHp!RO`OX;xv^|{-5WG`qia$ko zxk~DMfdCg@<1zhAC(Wk0b)N_rOCyI1Gs;mbgmG%UE#6s!_+@555jV7>5bCrahT-pC z{gZBL|FSp69siT;wxz#TWenAYDL=h>DO+TYK?CKaWq9f(yDt7GY1*ACkd=bgHOB?o z(AlM4hFl?R*lkoU8){gdM2x|-ZqSZ+Z1c#`sNfmI%dsnk=QE%lUtqD5xsM*pmi#*w zMLlozQ7yO2R3bocDWF~{3V4-bu@{zm9usZ$q^`53)sn>Ikyz|VOX-EwJmy%DNPD$Z z6RAgvcKvb`zk=iz(u@ZH*#HyUzPA1U@&X)$xg#Z~JsgB1B6>TngTlH1pfI)sEsh($ zM_K#G)~%Dr-Ll91UX+275x^s`?O~yfr(*X#!^;a_`~U$yd)w1p+f!89y^&VI-O3Q; zQ*YJN5x~pjYBlj#gfPOS$~I}iG8k*@<%DggA*8SN-vfd~PdYDLX&-@nybs||_fQcY zN0)3mfTQ7VI^H{w>G(*Kt=C#flM89kg{1E7IC0kJ#TVlb-%b(+!CD_2mmfFcBd5)k zg1g~#ZJEP|&7M+6pHM@{sz9Ec+jxrKkio^P{~Wsb^b*fsAuDu|Jf!xPLWSw!?kJRw zR}5@+u73x)>~YefdoP$gdfJ1{mfzC&ZZkcL$%k#l6X@c-yWR4SbBz&*BZgI^I9rI>c&pi_1ou$8?NCHwd=#0CJp zrM!B27?58LNz1a%x5cX10D$6jAJ;K5;_o~`0PwlWZamq3Vp5>rpU2s*KY~8RwE$MO zQfI-wSd4Cpcc4zZ$DfTis^a&3eSLHf<8(_c9@qAQ-d9y=DQRgF0Vi=D+TI5)fNK}P zL7n#j<%iSbY?%AgK)~GwfY%*3^W>lar`EmjP}gL8131Ggd+MTW&py6>JQ|vLNb|aN z_jrE!4hR5j0AL>5R&IKNl)s9Dx8-H(g8KK~DIM35Am9e<7D8dJ+nlzveOPx`xdBNl*1EV}hv_~Zg1e_``knIO zgzR!U?$yb|^2$ne@qJNS@S;@r;{l*0Blw~2$yo^P?x3v|{CrEd)q!bXR$3)=Z~z<- zR+`L*y17p7w-(pdY#%mz!e|su@PgkdYw%lRI8Raz<#q7c-P@nJ~7imHXGl$95~gi=wTk7H9|>E=pz+vK1S)?|;8>0V(W> zjG**K5A5YPhOS@_DM>{Ct}^Xb@Y<9Y#Q!t2wxgoN|MZ_bs^&teZKY;hN|PcplT6U5 zTGm6$&IxUBBt*Y@&X{xn^jNZ6lumwWRhrUnCyQsfmQQ>hCwSO>PD>yiiP$F5$|as! zl3ElYYEa)m?DX3K6&*E==5INsMtP7=zs@;dS$-l`tO6Km@cUA1rR~8QWxvLbPsN|z z`$bF7(hi|$_$C2&l|!-WN$MSfaj-Lbp_`g!cQbcIP?wzIm+ahPQ?wF1MQh1@m%>}N zxgk91$~kFo$#1_~olY8dypIzWoc5PJY*x8AIAS(?K2**Pu6dY{qW+ZnmO%J=uUT<_ zv_l-|D9hMie7LE^pghnYVln)KF*c={G)D}=yd7SmhthhG-VTnHCOk!nxqwMROK{rg zJS*-9LTeD$`RnEy!RfqZnb%Fx-7+Yx!bKyladS@+8wrUUh(7bVT)@PkPD%X&)@AOjp4$Zp@BXM6 z;X{9??z-I&GQ-11LAejB{2tWak!ewrJm@ZE-DYNCF>Oe(7cWc`hDxVdh@Z+0 zSfU|=ROk637@<(?3Oomne6N(ZVktqV;U-6C7O!HbNEt}uq9J=|ni-*#=&2o#w7B(+ z9olkaL8D&eetO5TVYO)(+l*|Z_48ZJ4ZB2Epi}?o8gbcBzz9A)BqX>EbS~Kp@tU2U zpl#o|He>nr2>J7no~{L6)6nt?J^>C&0v|zXd{w1H1)Z@R#W_rrwn0>$DvSHF{48jSGUQ9z&I{jkS;NQxn<)TZ zsC2JEghS>`h8mgb?Ue_$(t-glu?f)w&TliNbB5%;M%XmQMTe`7uSrayjht~>=XMm$ z)fPr(ev)BRVOJlSTpY;vmPek19!5Z%*e2+niXf1JeK#z-Vl538oRrEU8EJE zwhndvwq}zQi)7EXVQ~>!3g!pp$kACJ4V3cbKFy}Wnr-+V1%uw#vDNW$s9FU6?zMHL zYF?MeiB^}B4?jdE72OtJ@Q9cK5Ps+M1F3Z;rDZrO-A>K?J02xhi}0mIs%bb)^U9|; z7v>ZFFh|$q!?LWVYQy0A>Vbl!pq}%2R#?;>vLuq%E^!`ib0ve9Vd+EPIAapC=9D*Z zgD!oSh~}piEj9T(VE7sTjDfFUtU}t;xut@Wt1@&1Fg2Wi;*w&gC2udpg^Rv7fD#{5 z;+U!^o~&aLZ)j8sk{(QE&k{Z}%F3;GyW}2nDQ&60JyLOBH6h-+7bIcj<#7>>@u$bM za;-DV9a+v+EAwy~JD;^nWZLB%QrE=8-fD+>i4B{C^yF|&2t{lbJY`)dKQ80$qQwDA zDQq(&Mt-BlQd*ryA7SAvP&Gifq?TOM&je8?CG5)eH~4DG&1db`|7fKJmXZ=f@j5%@ zC}j#sPg5@%Z_!6|gDdV%VG+DYOjhMkFqN|LWF{y3bg~T<&7}{W9MUGdMkvO61}UUZ zLYF&(yQO8aim7l%;q=ah$WU84TNa5WMoOzWNnkd|RJdofsY*+gXgcSUZvK&;J-o+^ z4ix_-Th)_LU;~}on^zUETya#TD+c9l|;}gYpJWWc^rt^p$3*%56Qn&F@ z6tkMOu|YL1zaIs4X`0cxEi2gmSGl>+Zgeps&2@_)M8k_E&_hg550^A0oM7wI&iJBI za|y>;0ZQDbrX$oJJXRKfO!EgDW+bz@oZMF{OiDx2!=w5{)r*eS^m8nib`%66;v~IS ziNi*6)cJ*dL2Jb}{J)Iv6s>^@$=L@?47HvtB?&|GUTk?LTc5tfoX7G;Su z-M-sKb+M35>N1UwBqi7~wsZ_RdsQ37_SeP4l!>Kz6w^p_*^c z!Ap{eJ}km`Hvn`Sa!g5%j%MsB*!!a5EZ$Dal<`m(Fhf1kVDIkC`5yIT*n^~JWH6;+ ztswy6sS;2~4*Tce9t~?xH|TRN@<>oyC>=ue z+J>aDwFEg`yLQ{f#A@;C7MG4hj~=F%#0DVuk*m%WRF?*j6A)Oq6-iz$IS~D0hvqGD zIg&%XmqoC_i+%$C*(v=BciZk7sT#bM{EuMcGe%$squTdFW(OfG7q6AgxvZW@5p-#J z4LDUvD`Vo-dIx_j?6cCWih~s^XS4vRP|yeP20lG(KFM?8m@Rb6zk+1NKMj(-zqNh3 znlDgXx%cB!yY1w=&wFRTjt+D2d3EfwS7n8**TD_oR`v$8+q5S{Rvc+HYRIkNY{M286|5Y((z+4phW2_cb1Hvfq}u7FAQq8 zt}E|8HrXD|Jl*oIbQewSdtFzgkn=0^-Jf=PQGNA&@%1%{Um-b!q$Aqa@&%Q4E_hdi zOV|A;;jT9Td8<`lZw&ek^Jl+-lUF-6nD$O?YMwCXmq}u~4$9ZShM8{!cz>Sl6(l13 zYuEU|oV+oaKb!hB@~kDqDgAXK{pLS51nU!gRjULab7!*y7OQv`YVT-q(eA%jJgyjz znv)nB8gj|&?R@Cz5&XzCrXOg0(sYs)&byyRF!i|Vx7gsY^TlESkqe^xKY4dnQaczP zDM@M1joW=r>WUlJNZFvu+=Ik;Ef(>2fPexbb7eH=L9xtts^HPJcJ>_AL9 zZ8`DU@Od9c%i8U;t#1Z*4oy&&N&xPepAv-E^z^S4&Xj|JlUlXb^#;oyCSh zvQlS3til2f)z$QE&GdjJ`e8L^8H=_CwCK*Hyu;0LjjL_@99=X#_gywwUXV#!PN)5* z&J~Tm_aj5(sLK~hkEe&jwZN5ypl>*=YrmiYz@(MXBVnJrPF>P2~9=j=1MF%PakM@*noy1+Y4~O`ynC-0Xa$~r6fB@dET{}$kc|Q@JH~UqHF82l9(|N+1X=Y|qOc*6fFt#I- z{@r87(MO*TiGzsgitbh( z(iWTOy5H(@S#ZxnE_+>XRKICV3vWBseL4WJ&xVN{T=}>6_mlnZ$+xIvn7H3> z-Tf+o#Oeb89h$b?)r|JEK24Q9F2%JvmScxZAHn;iJx<$BSs&i5Jgp%C>d|nn`Z^Kr zE8CuyRoVDq)nnE}m4i!XO7-R#9F;%ZK=y`x(IQOvfVSkwwasDfLd3?}Q=sx6>4?A` zK2ye5Bf>#cSh1ZHg+Mr|8kd6Vi=Wi-W_;^zGALvq8pK1+pQ;-7Du(~XM9w-KW3 zhp)ua{OYQzHBSiM{W;V0bYT%@J1?{qRlgCtZ0_#-z+H>XOr&sqY=ux$<%kpers#D9 z3>OhXPjx$9cN&om4t5t*7Un&*(cuPF=WEcAMcuc`#p^P&upCrGJ4$87ik=U|Qhp~J z(RCSs6;P2)mdi$^T5i6-XUeof)x?^FgzWYuHAg4QaR52wZDj+lZ`DQmW>qv4q;*Ft z8RLYW2BHZ3UXI4vdW{^6DWE)j#gd}EZ(Nv*?rSD)`bIVUv zW`Vk;WK3h}JY(9+!L1zMy-nf<`9@mEwilPluLpoZutBUc%AK%2Hm+*;PmmtmfE8zy zQA$X*>#xaHst#%#!60#5Qr?%YursLNruien+2s z?zWETWk{u1fp^7hqVN%_pX`g!jZbd|@g}MBBJ}>~GEi&jnR-<;8&IbU{zw^+B@IW~AVRE(vTHPCm=)!Rr~~#^oLBjds)Fi?0!a9^R1|{)S#WH@VtiE}Pj1TdA1l;0{M>LscW|p^u4_tkQk?d=B z_NPpHOP#lPdA;VV8F6HAyFq>12m|VBbN6C>#4Bn#3#2LQ_eNF3tH<_qbWJf0t?#Q7 z-Hr46a zmnj(deyYmPABRF=zp(AmsMvRU#a~vR>>}+3uk0?GTdt%Zm1i}xJFC-&Ru8ml32cxt znj752ptLe5T}6++DlaX~DjNue{Kt3DhVTo%=nP8aoP$@iqAZ!(sJQB{wB2jnK{Yqr zj?Q%vJ<;!8N0-SWu6d*JTFQI(Tc%5W-1qS+TkYWB|6IDjqwy;EIU9lVPGlLDlE0a7 zQuXAt+{IbIoiU#4{dU_Bf9YQ$EdoJiyI3NvXVZMoM*V&}KP`67BFGd|e{_DA^zZGo z{{eQ`VIbdb>nlhpQS7kCZRf6AFRm++(Q-I1P<#^Vjh;Rk@1G7f1;b^VR@3~%q~wfF zQMHC@*7(Cj(*F%y4lJFLgZVQey@I^Or_c9_XPNq7(70;u7LFJ}4XqW4xQg5esdJR| z(L+sm{1RMA(q+d<-Vv4IeRN1E&j~!np&(;YVq1fSaddT7{#^B>P*8O42sqbn_E7nt zEwaw>VU?Xkr;fS-pS!Ah-3VNK~}<0jC**uTd(0@ZM@ zMPZ1BGa%DHcX1ne8J(?Cu1+ch0S3bq0VVhrCIm&f>^<0d&DDDiu_-^Vi1%e(fA2u} zf3#>*iNipTwS=wcB1I|ZS@?$ij|)g`=XH%gh{3c9RyZy4w^kNcEDK0sosL~oz#95p zt6)4)mT1W&lExu0u%O|+#G}KYl|vFYf>Ct9M8eEvR+K{^%-BEWL|IZokkj-lwZ7Dn zs>2j{)&pXSvZ?N(eR-y83ab#W0_m~s?t~`Hx~4t-T$%5aN;j`TNJ#koDzq*ZYS7R@ zfs%%%2v}_J$rul-ep>ab&?izcy!{0&G!r9KG`nwcKwMk5>~1$}6km})rg zlEJV^t-i71pbx+>`PY>4s`{K(fUgE}g zB?fKWw*z3Z`XWX(;QK5XlwBl@G>g1n%BKqd#BttF)mhWJ4c33QJWd{L%bL$ef5xjQ zzWd<;Y`Mn86YMsqVDr;Ek5Sc-S!&I?TlGWY1t#dWK0V%5oZgnG>VQ_RwR4=Ie&EtT z|1uO3DbB^ExVqVBb6fRM4S>U-P^884df}!5UTh`=T?@MSuT6I3KT)wz?wqr~hu6TVhluULX=#LkD`76QyAy2Qp|kdk^&aSf2V zpkwfHF;v{H`nAmw>xM6YZ9uRZ#ur^EwwLv7Zs#N3&KHVlaJb$#%iHQ6o3r{T5;?NU zMTX%#wev#v+kH6->luNRUTkY%D5R$IqwTZ<9*iJ2w_^yP|A_JzlCK=%A0oM!w^~6% zwyo#=C|u??HMejhKqTH9J%Avt*F_84vg^iKFWyNRdrWYzZeGGeAf9)Tclne6v`X6tl z{{V@Y5olfzWUHmCJ;U{9{_zdPH`01D)OQ12(!2dbA8HL_N?*R9fn=CFcq)JezaR|V z`>R-cmjTE7o4Q64q8G?-HT^yNH(Zkz_ze_A&!!NxXh}(U-~c>u=G{i9@lJLLNVn4l z*_exlDh`7(RNfB?d{bEwi}j9X<#MUK^z+VAzKb`1+d_7;!oPR+@?J5Bfb&>L=ZJQ& zs`YHH$t67n_Z{HM6i^$|K>!E&y?uXw9|>^Ni4_d)bXOCTw`%-)F0j!9TY#HDKz^U+ z#n|U(?aRewO9%_5`F6i(=rS}aD(cy+#%649#xq!N;F`PG*Gs3-;COp~zXU8Kb&ch` zWlAt+Dv9*?n8ay}er^#AzknSLXRhAV=*Uw-H7;PbnJfAOAxnC-Ckz|U`-Y%q%?BFy zYaqwG7l-G3lsOG8?es-v-qV^QGFBM(>vQ@&7WI;k7e?UOw&~$`gT;?)GSQRjeQWL- z-#H^fRQBF2UK;xjtc$VhuttH^Pq-G%^5*E29o#J7sF4V zJwA$vur1kp!29}kgoS-6M7xoZks)cQ_-4iGaFdz<3q_2gpDV`Cc@oWu%x<1~EH0W5 zlafrCYoYEYKQt%bQyDdrjVg}+&Q+jKEk_!Y^qb=`IOkPY-W49o=~4b z8-;ywZQo!JTw{t)66ds9_2`O6d)jAjsx$XAkvNzzi?s*^Lc~=UcJM+1sCisG{GAzP zLV}|=eeK7bzzKreM5S##R8DJrEI3CH0_zB%;yC0S!%qzf5#z>^Xj2CRF|#+(#DNg^ zkvogc_uR1co3ndLEygUb1uD|TL7Dlv4jOXR(!lM@Dv7bWg;3;wc>(GwTT`7C3uBcV zA~osDmbst84D5ObBx^Vu!M89@q;+aqi|bz-9yGGQz<_YE#2tSD%&!95@p%Rn1Y(%a ztFUNF{#X9+TWDkdCMQxBi!{>dFF^xesfaoA)oz{ECGjgKTQ%%#)&=F>&gNJ zNNg%e4gtEw#xFztGhVG?(zH1_BpPNK&PxV)%L7vV#$0d4a>q@`+r`+HBq=-ol}_Y+ zg)1{zD3l1U^jj0h5ielAKY(nZ_#AbZ?{-H z2sSqFoMb5N#WVKGBER(@PA0zEzO!sTsKeXJbyL@+`Fh~u3`8&MOUz1e!0tI9u7 zrDGJ2_6A=H0Eo`OeYeCDEsrmk3ndrrLt^7@oH6#E?#H2&@cQ(#r#53!X)O(vwfD=O zh~$qyBU@X$NPlRT!!?eMb+i1genfimRo8){MRnzupY6VL(e15}J9 zVJYz~o9uRUclh54Uv!@)2|YPdly90z7?UFEqvH}<+Ax-TZ*8xW`V~WlH?4`6zNKPB zdmpxoB}%*{;e+EC%b+|TO_kFOQz*R|%|f#)Mv$h=Iga+wVA&76CL;_^!8sds2*#Jw zsa*vYVX!kBg)}F5Ln|>UxNZ+Ujq^)i#e&S*Y~k{ofzJmnQELP-;M*BZ?|B3iHuQbu zwE7k!Y3bKjDjy~ZOCdqYQ9I*E>SrLJEG>gz>huPVwn?=lBZ_?@E{+n#;Vsk;oL$

Nn`wi85R!re zY0Zw;nyg|)_pf+{9^9DFUe&@BC|-0`!mF+ysm>-5{z%Au`0>M znQoPdeOAV*X5pNTWOt5mgCCa8mx_svnl{(ElC6KgS$@4Bf<*j6y8DFiS6*YXcJ{M> zv@}?IrG#Rswn2*3wqe>h9D4_f!htGpfE#1;+#x4a*^!!y!gS(5a#}G`;}xW+n$tqioJQga;*z^xWoz^0Ur_0ef~RF>PjEcK{4g$Uz>6gAyZw~VMky2 zv=Dub)Ai^@j|}3cM)wQ*_2DYIos9Pkc6K)PL%^x9qjgB5e{{YSz^XEnMb?u-XAE>zz? zf#E9%AsH}+{ON)``42H7g5axHJkf-*9O|Dd`76kPh(XkN@=5NCR&9<;<4~M3vCMLJEKYahV7%=8t)&C*Pc9a*% zck>HL;^`**p6leIq-#Bo?;-Vo_hK#ZLHwo;8t)#h_o3&@EtO1;H+ndrP{EBI<~Fvx zlk|T5cSPXRQk!4k92)=?Tp#t{S`8)|7}=zZD*Jd}~G zcvkVj#^vS#?)~XJ@2-v>gB72pnC+9NLsP(w?9*i3Gc*2lJ06GbLJz%AHxHe)3TF55 z0GKVQ=|Q_is%5yLq8VsHg3qJu;n_-^$XGqNw%#2gl!-aXP7kT$aa1Eks%Sa9xw-jl zC&!7;%^(mGYzH>CJsz}`QUEU|WS>B3PoSUVZ&?r#eExDNo;*FY%_IUZ%K#70`^#9Z z=Lr^{GlB|Qj5x7#-)s}9x@L-+v3@z{p-@U#Pv0ymtDItExKuuku&=d$H0&z~BL3CW zVf`Mm+#6v{B&e7T`BA1aOjWzs1qqj1m zMAwY03@%$+_6@t7KMrF0%XGLHJ{D20OipYIm>J=zguh$^JcL$9-Q$<+NEIZ88Mk{r zYE}UK73!!LB;cKqHB2?@>-lA5JVK+4x#2%OxGkp15DoojqhcaL_=R35QszfGdnITk z%`2-ncs$&H8L+4B{TbEy%a6NDfdZi_QEa+JXj^k`@CXf^&$+K^!c%)FJwC2`XQ%9d zL%T1ert5dN77Z3C+XQ2Q5DH30CC{Wcccei^)Xdf6zqiqpK(C4a4_v%k$(sfOi8eFi ziQ6IN`Q(HN|94gEFF@K~Mvc{NdPx(`rEMK>ZANPCX-7-5;bHQGTp?aNtw`);jDJz{ zSM5bw5|WbDqt%Cv)=@w8v;I5#jD7vaghTT%ZqcblZAoD^IlPj*f7GSaPzyqpLngIC zp)Pf7-Q;y^{B-$GNr0yPNzO+`2NxQOHGS6j4y&K0+)}^kiNU<=o3>33|K!w^vhrC+(ky1djsHWsz8b?m%moJ{`$s`HxZV$O?1%0EQE+M0 zzuEedwTx*R>dD;jjcNpT3s}io$;HG^n-D$%D6Z>4WW4vMLWP&$oEg(`DgPuLr5;R! z^|f1+aJygZ3fR)l*C9cgxhYQcfXSUPuhCb^xQ-BMH(g|jtXCkg5j&j?<7fdI;^JYp zy28j_5=H6XxI9Jn*^;AMr)@{~zEsD{`Osnpt|w16NhPXTE?J)go@$7sRXZdg zNIa2D%O&bRbO7b-zjS~&6$j=S@nr)+%&%fOOF7;RyK^S24rhmFf&?0=2xJZ+FcH>m zF!x)!k6Yf#VE;I`57Flzo_7l?yr)CLZ>}TSiNd)duopf*24?K{;sHLfI%9IW<1O68 z?9RvjOG$8t(pQ?eOq~lbc6N5~v`;*VM@4wk3>TSMU|-mIaQ>faLS*~rh_9FqFR?tx zU&89Y!SbozxvX%Ju`a?oA5zoWicM<_3^Ddnr{ULL1nodyFo{1X9K2WyNif*2ox1-e zIF!K_S8b8=c%hskNJo&6oRq}LacSgq*x_%xvx@z7qvih~pPN9a4!YsQ9n8tc=|R#Mx{hO-Pj~2c2xQJaCq%;i@u2-p&zSz3}7*Mz*J4=})hcb~*l3!XenZbzB2A0UOCVgX3ZPMP^d3{o$Pn5~PP#v}zb7=H(}?H1;!G zBqmIOWVXvZR6(VLXP;p0l0TGb#%dv_s%w0y4BqauAgzkg&fwYKjYw8{H$hPlhf$h# z9~_f_`dj^KAZJ|x^8*{N)BGS?xX~WLtG*W_2*3Ot#GB+!D>F1)q=NiH34Qgz07CRj#?a z(59rO5X?+F?6eppY?zwJ>NajPxFwjFegVg}L(by}6?m|i) z<;q#LLoh}Pm!A)xZ)Y&eGD(8*jww=y#(?i5p+C`CARY3g(|PZunHCt&+(C`V#vgAy&V7)^DK* zj^LX9134FUiV0g>M{RFz{0_sn&wQcI<)5t)<6}$%x_5=IW4?Bytg+`>P?zX^S|C~v zlr0{Cd&>krNag>Ft$|n)R!X|6n){s98HQ)D)3lY#=IjoMyZq)i5!d^wJmO3nJA)V85w_`&JVk0B1s zcbo0zIOV>>7X!sR9A?m^-Dacwu+a&m6-8qMA@enj9mmTEfKZe{){lI0)^c;NiIH8w z_u3hnW+N%0u4=jvVrA_;>@Zuvj`!zTA7xz2~gG_gZVO7e_?E`8eJ1 z7`$r#F$T*)UlsTkzuuJpl*cGnuM&6}z&zz2tDVEkL5@U`=5O$qFZ5G}}*tzPM8 zHC&cXTc1CA-nIMu{SGHJMN=No03)<%M6$!Pt2%elO3`FAkNOhL4IyIFk(R_FE7_O` zIC&VIk{#*PKGB2j6|=y4aMqN@g9GY#aW!$8yZyZLQG^XTnU`@WIGhZ?vH960o zU~a*#Gb@ce_r50{Y)G$b_+i;x8kPErdQujJCHapwNC_m0=Hi*N8I8@cJl|QkCcQ&{ zTZgg6TV6F!%jQJ1C`*xBfmT_G6(git+IjF;q^oUNxQ5!9$g}87W5-mzKV~A%cR-<& zrZQDjwpOV~mWyZIN{3-)jZi&Y!ZIv>%T6#AZ?NF6`iv*JaC%8*RuV(j@-acfw)OJc zR&F8kGt#z6Cz>10NFtSZ*K^mWP#gD4z=Y>+G*b_5*~)Hxa%6!frnP zps5AdaHRPYTZ{2K`LPA{?`74#cE|J@Jxtul*D(K4c>CF5?IOiQF4xgK{95H&B+l;9 zd*LGvZH}-m=Gc~y;(_msc=A@LvBP}&hA){6OZL`^YwEVqFfLxPld~v2ad}8yk$s2>`{6SG@S|dWabCqn zrXk|AMT4XI_r+<0rYf~MpGTE9t$v#0xwPu3{=)`{J0rmu-DNR#rxX?ZSy>eB{z#`q zZkO8f#iGq(gtpKAA`8QVdV5Xb{KYC(l}gKA!)fTy){s+~(9Q+VVuf~JTL#XD?@UJYb3W>Z+2(hQ72YwG*(uQWK^&QmKJ^=zONihLz!A+vr`zCGN1w zXwNF~>!cOJWnu`$XxMILCB}PshGR37CPt~;?mmluo_)<#tb=A>bKaJ`vQ58Iz78l~ z26j$u_{8phpy?+F6ll=$=@JHHin6n23V!}OiU1tQ$sH zax`8aaB*npG)HD}EDYC$1NU|pVDrt3ok)^C^exL5@UUV>n)b5m@u4;)F)MjAq z$SpSd0ErVxssbrtp~#^o$h09rM?eUbAwIHh-+gf z&E&{^;HsvD%#M-$LKAR<=bF9s^5o9}LX46}xA(y!(u`}LSe901J*4a3!KQ+=o0|K1vZOQYBgYY~AKx}<#7;PK z^1a9JvF@OBj^+_*tF(=;Jt>3@nSk$%kO%7>q-V+0z2Hc9OlJ9Jjg96oNxbFLaglN2 zng%Ke@6hC1aul_wvt3g0&^CxqTLZe)ad&N9ZOX0`^!#zZGvnoYwxfUB3mvxcSHCzl zV)nk;@_p&h;M+{Uj+S%a>|1`cl|ZX)p7!k3_Uf7pKqJ(BZqsk@btDJ4Ow_Wh(E}I*XjCMYt%@d=A zPE>&9PiwBnNrsAAUJo-o8+75bM15=8InM{>vl7QMyL0;0l^hj(goNP}x?LGP9Zu

RLW*w;_5G$*-CTl{K7W#f7Cs3=I+^-TS*wV@HI z;G^S^$1!5AC}qx|iC-Li+548_80^@5A)q2S_Y4-VXN9>ECF1b$%f2M01+8S``S*&! zHeK_`n@kyAKTeWX;o!qNXyQNj+Wq|9lWp=arGv3I6dzEr`m3OkF6M1Z<15byR_|UU zosiX9qlurbXZw{pZ5HO^5~+(i4-)P_#bL zySJKPGJSMNnag6quy=91&|QtU`=Qbs?!m8+`4MAp_F_4lv+dDC)td|Fy4q7T{_`|` zjR9x!Y1!K@MYVzjo>6f~uF=QhT?R}^7O(q`>iBL;Fvh%T!ISujzi_C3*d#wA^1Yin zR{Xfdueh%M?ZWZZZpi({>(M_q^0~}9_PF4CrXr_@_qXT~y1_46+J(22=}<)n#f}XZ zUCMiywzSE;(E_GE(fVijUf%l|S1A^Fd2!y29<3gL<`IBof5$cEd8~)^+!oGeIz6N0 zTsAJxr^j>3g(p)KXa#N?_1X3&j`+uvooJ8GPqw0H>VQzfB0*rBT-9rcz1A;RdOLCnbzbG;rBH65mR%r~ZF>j{jk-f4=+2ydGl$O~T%~qsaJM z69pd^{DVo5|LsA=Ll*4+JJYB%MhUu4E4MvSV$v@?QX^Az$EP1Ha!4DkfSO)D3Y%Uo z0>eC7?D_gav#)_s2FVv)l;)p5lt~5bbTj88fT~a0!s^DO78l*?K~zoWKqocbvBeJ= zR#evY&YfFtWkO{L!#iH`%)*9cP`p`@J0F z`u!-**^obrD9`myA6Z5^&P-UpSBmD9GdkbM&uEL5xFntV;E#IUN{v_9KyF!YdFIHH zBdS`=udcA1YCDS zq>Qw&15zlwL2_tuKP@Gy=2^@W{>x7h)?%;J648Qw>A z5udizdsqXJwsG(}eJ}SjmeSnjF&Dh{Rc8gJEMhy6n>{wMx{T2=6<6MuG=*`-JXE`9 zndts4sol_}pI<+v;Vb7`D_efh(dakZksAgi-KDB+t3Uk;VO$7ky`J)$mt%FwXa zhE=9|u@pj`#cc)lCVJz`63^ERq{ArOT`>GxTSRT7o5sa4RolJf3)Lmsm|xu_?0XYT ze8`MFHga{3FV7dQf)U-+61X*eBD)Q+A3b+UJe>7r%8L;8=kJzx6Vnd3$zm=inlEo& zN6$}t4sQB@*%W;Ac}2DBTmQXWS@&rVV|&Kc&$W@9pX~k|C=srEhCau0ZrP@=H(J_c z_L;BhbQ~pu>%3;{KL%lOHgMSi+XJiLuHF((Fk#S8$(*k)IU#M*+3FsOqH=0ME;I4^ zmSi1FD;*q1wd05>gK}8u-C6MD(A@4k%;vO_+}~rSiS2@Prs+w?+;!w>uCCw(kF8=M zs(#1xR}mZi0yB1H(S|Bv0zU%q#9x{ZMO+~`gnAfa(au>f`_QF(qZJK5=; z9MHEd)=G_7V8qAN@O}K{@fF6T&T0ECg8?cE`l=IZ?+jQ^Mfk+5+d^S64nK z8FAU#eHK@rzIIV?o54v%;ZBdJ#++YjPDdaKDwBteN!YESXE}e1_4fJ^B9;HBsO35N z`ZGBcoAStKiH!Yr4?1yJHU`dfq%KcZqF36_ldRK2T~X+90=`vBG+%5xwm<5Wen(KJ!n6^PsZoK|GEVkQQ=R5VBhOT2;&M~PZ+9Rgbn>@X2MAegK;hI(& zmXQmtbj@F*kOsLTdVNoOP1;li=ZIyw1?S=Boo-Ca57kR*za=~MvVZKbUWBhil0nBXiD6-LE_aXXRLVfNn7W<84@f-A_!4VLL-~8(2gy;*BC$w=-E8*MaRn}i z2S6x=E7UA&H^Dy~awto5RdXsUJ~KA#@+VWjoLRWMHSe|GFH{?P6oZL`aY5#2U|%(s zWrQmHlFHa~8~oQ-30>X6U8DlB(c_@a1$2sRjGgDA|Mlg1>J7WfVY#&HldK<<{OaV= z%tnNeX{Dc(;>7;J_sScCoL_L0XMK8Ik?&!BnLErf5WoKl&_%iLY z#b^r5Ruup3hG%#u;T=+nGW;(Y6Q|t3#>;oIcM&!SYg|395TZVek?IMd@?Zvi z(j#OKVx)0HxAn(QR)4Hj|HN;JQ4aYA&P8;W<)#`x3^k2rvmfX+sy9;+y4u1E?PiC*bBjj3XXG$|+JTvEn?7~Dr(aI~mH=GZTfK)v&^NFX2boMl`fUNR!_V}45k{6~8iEfRq6Gnnj#oV`E z6x?*gYB=5o=L%NVc`JMfe=ODeXliLp$0YK_^{(a8SVRiqD*%}+z~>Qnen9A|1q*O3 zCiEX|Qu3usdlV%W?~!y_)(B5BuXdqCOiNAT?@em#E(k7o89?R;t0(de7ZqA3dDrFSV1yzF%DrF9m>YqSfRruF?`>hDKdVl#go14&bb~#<;toj$TMEC zeDFoc=Q+B+MVAc>EGj#LgE3Z5uhfL>o!iOxc`9mS;C9T0POaCBW-dl*(cnzCpHISq zI(VgNxrd@gI2bDqZiK$ADP7MBzx8@b)slADN(?w(K^>&JlRoF$=ALilRM6?M>UWzw zVWEui%(LFdbI!cmdS(n!Ya)maB6EXJxYJc%Wh-Myqp(8BKfa&0ip)r#8T?6*$#Jnl zeZ|bc`v$9*qu!2IuZ>|m=*g~1&-dZ`BQ`7yTcfuKX{Et1IrDvGnFN{yZ{6u? z>fj<>U?eJ&L2xv?;UjnP6gVE3t4mxQv~`l07$yR^Wlrs1_G&q%in_hxjX=Wj$A{`- zRYHZ6^FMs#eUr+_q}K>OO&{GPdY4ZfTQkYQkTrFG8CN7~HMe_zHn-&dMbY@K;>!6{ zIwYd_pGF{9HYoh1;QeQSB)7XKkNDr^Omp7eH0GhLh+$LLQRu+Ro~$7pEDCDAJS++7 z@N`;=d#DylGI&mA9^8}TH!1&iAqIUOQ(b5&@cyz^)!uRQm6!w5g$%KQLIaAtfdYAg z`q}O8YqbVvK6;dcyW7fsPuJ!o5b=D;V)18EbD8SN(>kYLNiRq2i?gKXu9kZ_u@=0} zjpphCpPpR{!YRJnN}2KTL~fDiq+gjOjm#l+IYyvz)`Ij2#;jzhaym++*G8la{ovvQ z9#*HylyV`X=8v4noATwxE_}F`h-hER+$fCsmmRy#rDQ!W)A|4RlD*CYPwImfD!=yRX zMWRxjo0i!Tu0X4~P+rK&@OkFsX&C+sWLKcVj6zQtbAAUA*vPrG^mVZ@@(<2Z7m(M5Qhq)J3t}cC&J#udi3p%vE_I*r!6q4CH0J7F1v>X=A7Db zobMYh*Sf}Gb!Dcq-6YRQ>Avk|I2&>jr|6~;VZ5u5Ml|&6gjVH_D*;QoNx2WKY{=<8(*>p7Hap_o% zd5Aum1{pZiTE+=gYP}@k!p^@>-VmyBMLOELXGn?fL>nck;M~KQ&|##vLXy9B{1bmM zHv;#DocJb@YdIB6y1Ym+)it5$jo@@A4^7klfC!1t#g>UY0SZBR$J)F?{6T018j%|! zJ2#^yBKp(__#-R(o|PC5>-%V2@xVRfDcet}kz^Yh7Afdg(q$oc!{|xLNz1~q)3|l@)XO)hT>l-SHB9B*+#nqghj~pfP zMr-6f6?}sPqN}3?VvaQf&}cM@x0R14W`Lpr_=;BJ{U4UF_kMH5GM!ylFW-vqX#KKZ zS&|O|@b#KCKR&araGYI#ndxr5GFL2QiYFZxxlVhyr=d3K*Z=i&ErXNNxr1O>87|(8SU=nY!)4tH(Y#!j!vbcXZZ#mEmTn%r?zm_e@W6l-*=Z|tF^rmtG=9f2GG^$QIcV>yy!FnxtG!vYB)0y!+3!Q@ zoWn4OOyHbJaXpFHy$U&l@&HyM>v7R^zT+i$Y|GZNse!JD&6x z?$&$?nQ=&idf!XEt9+R4nAy2~kI9kA4IJ<eIH0#KVSXvg0Qk?iYP{ z$m!OkJO#GQ<(P*U6!3X5z!3-OStaF`0**TBc`F7IaHAG$`dT2bFQZm(x#C%i`&t^w z9CZbsoK4GK1fmRH7(OC&xl@lu8q+$r1PCJ1NSl{r1)O&>yq^AoS!@6bm(7{cE4?Wo-cV6!dY<1Pk`sO$!^G*_DI4sra zMM&_YXP-%_v(OrlD(v)ZGCvIG=c!9E-x4r%?!2i<)B1YuB_4Hqj5G zd}K|q0lQ;X_)H<4Sm2q0)W>z6B+EW>EwH#;m85`M@H+}kkd5#pJyHWWgTs*E z}o1Ml9K!su-VptpzFt!v+Zn21TOw$xjhIeL>lKQ;3X$H}_bq_7kQ*pDdG zXl#xFz{WU-iW&}+GiWGfCOen#ITZV-Jef@aXl^A_K$-W!l`eW@_291!nigLMI1H?Q z`XKy?t{0*+TY-G3N(v?SpGPbA*@aOgq;P+kRC4>{H-x4tF^n+8`e)Lm{0yJzo4LlY zc38;mThWR^IPhT0zC;No_vIulZd(%VW@3WOlFv0XyKwmG=6fj+0Af$V{iNd-gfsgH zO;lhgF=61j6*VU*hb(gF0FG#?W9@2un6l<#8SH%fV>B;n0V{OMAEz_{6D<_*40uc& z-vMVG1VR~Hb4Ew834OqG$zOgz%*L(JKzwcIJZZ_;h9zwD(6^2WxQCX!8(rdxerq>5 zRt12QT!Tx+u>^Pi8ogb_U%&b8eHWwBO9OKFl1y%7cd(MOCMBSejFdGkA_tL%ApGmh za>FX=L-q6%4p~iou6w8bBE-rOrn4=ja_0%?^B*<#9)5IEH)`OFjQxWo1Fl_qhNx|! zZ%sy5CsVJ+UW50^QpgGtF2||5@J0^Du0tnr`5#+%^UIM4+8;25Fj> z5_bY7&%~J}3=lZk-@$U;3v*+L_v@f~y5N2M=JpENs>S8H5X!-|y%9g7NK56>{$iOvxb*V*EasDx0UCHW56WM9>7vP7=z z9x+MQN*oMHBmBKVTr~Q5SzmCQM*CCZQIP<&2Pn_rWRnJf^~9QSsJBPfyk9Ko8NkYA zmCCi>Bs4@-7JLC*QmRh8@7b63rLE{eH+2ll_KbN9s)V@qkp#_2pK+*Re-11{tVmt< zUPDf|1>)%e1Ew{+4gK@a*UB8BzRZQ(aISTEk>FRgsaa=`%NuMnKFNwxQ(-lJQQ~{C z^^1>ivkuK5|M;-o&E~AH;EsKKwnD~WuJg}1H_V~=r7{>a@DG0BE}!d3?o-5LWP0@$ zzfbrFPg)~0y(@z1h4B&pbm=V<&Zt6*d>r{h7vJeL`fDa`vYb9pmtf-GQ3Y-p;htVErq8bkshUdDLl(cG24oxWK1pH|4*Su?IIz-v7 zaQC@_GkyZH+<1vn!-q(%FjZTomYATDtCS&Wlw z!|}YQ8x>_q4IMI77-=+jC5@Lmps(+CvCqG%E3z_jnmr=9KK+5hv7xL*Mbxl?@PXU+ zUbYIAx1-xkFN*1-gy_N;oW#b{P7pL#t1pl$Vs&*W0^~LpZm<^NTMzdG&#FuZ z#^gHD3{*<=xBDlJGC~2ZXn3@wp_qEh``DJZ_=6LG=jI((9hYD{y27T_l9@y6i?&`g z3rwnJgVSE?_2>spqVhJV1XUU#pBvSGQIUhw+TTjhgax}B!f8{b4!Z003%PC#>XXjw zrOHq|;;^vk6%vPV@6!$uPlICQ9kxT(n*xhv{?-zxi4%q{stL|yuL7k|I?>*d9gn~P zl>^TEZUURMly|!=QP(P9DFmakb8OC7{GZsHxSeqqI^!)JRZSQh08;%M!j0NlPk zW3!=l{&Rpu1wMN`5*_6xEvXvU4}y4I-X-{0Hi4v{A1jA3F8A^hBKtw+#nLJsJ@0}$ z@~Uni4iq@>53wc3*Mo6c?4b{%`B#UNC@fb5!=G^Fh;`VLu5qTfCiDAvI7pm}BTv)M zYAyhJHv3|j0M@?(i0!-8de}b^UMtaS6oaxT6-kO z%ZDO_doWLIa%&g=gki)q8#N2=3Cr`ZP$j&| zrH&;(#?MRl*erAw4PKLQ2rcx5P?-v+JiPA-(Lc^T=||;A&jVM2gM4)IGwwcdt;w;D zwm#U%pQeFQ+wYcDt^UkJ8ss@+Vewwi&&yE0N?0W$U<7S~emmwmJ)_n;(TI@)- z1_UVXFDLU-f*4X6dyY{rp+tX4#wkP1GZE2`M0&|5glcbbvt!iZFk=n5fb`Kle*11*MAn|t`6#Wub_aQ zX-@H`j}5nn##|v{m{u>3?%~XJv-H;fC#)is7%rYuJt~7S*ad;W0thh}SDJYW0Jik{ zrr~lw^H38*@BJVPE5$_97nt|yo?!HRf6cWT_b}MPlu10m{`{%I&)QzNvq5b)m1oDA z3qsbyy?SKLd#84-4VXUic)(-CunFEgZwxK~dKl{GM5S%!P%k#Y5MS{RBsOL^b!-ElrT^azF#JPwk2K6sU zyQ1vRg$sTh>^%tI(yl%J14U8YQlOw#YomKf1IsKzDJ8Vg9K-tpKJJB8am{ma2e zqJZZ#kwALjmE3y=KYgk>k-y)H;C+Z!U_B|eZXI`oe3Is{LV6kYvFZiBKpaf5shiTp zv_YaS<$4ef_)k0Ec5BrkPevg}Q-tke+iUlv8s}#vfJi;a$5-|8K~C z1#g9XJW?~#r2IT$YjkfsOFT^ldLx^QDM^Yti1X!bLLHSdV^?&7l5#?1VfF?YCMwaa zbq?o7@y~uR4@%CEp1!)tYOci>@NXl^oRI+$4Qx<06Rlo}xT!|EWxSqQN(T);I+-ys zkt4p%gJIGLS_DX?irSuooXqQl+-HmqS1W!3obfg-@Ki=pe!s@Ics4~(n~Jf@-*Z&i^!%)^!ov8w(%#T4^5!B7gCgzXI_$3*fC}r z=Z{$C&K2*s2*wOoc5iJaV~Z@!0Bc5j?X2bKU7b+3>=@?LmHk-%H2`E-$6Fz6aD%qI z+8gqlRzZv<)^`4wU>jzb-AKV>p^#{mY-2W;{egB=DLxw`bRs#?kSClGw}XKM!VUp?dQV7H#lR?Qhqy-^R}{d(%SkVQB* zFlAi49dJM1N97(2)n@WYsug%}+A|4N)aHO?nOtB&X4>Xy-7am%4HsbRdYy!AY{&Oa zM1Z5~h_EDh7~>+^_a{9=`t9Y(AK1Ko_}v)8X=M!-8^c2Wp=AzC z_YqN#*V#}S0%-9U-NQ0zZ}R6{sEe0<{1F=YMx3)7e`LFuF0b^h)>o0A4YO#02TUwo zt>O+$mtia5YPUfZg=ku;cXa5~Uns(BKulI`>SV(i#B7i^JAry2H6i2@;5GduEw1;F z&%-G*os>cHj-Xx$8-s)Y`?i+#7ec^jlkN3Au{vVTKr_ZpvJCkgueVbPgFjY?N5Kdc z*&d!|Xb2WPSqvC{k4hH`QcJr0Wvq4?mGL9u6N(V>$}Kpxbbc6HAPX+)&~`1&Eowfr z;QcdWE)L%ID7{{^bl*NJweh8J)Zm82VKSvz!w`#$9p)yMrjsIsjq*kW@1m-8Xm#8< zG6DX)s!#%LGa5YM{wK)`_uahky`3A@KBdpOQHj~APpmPPs;I42aH=cwqn0?V=aenfPJlLp8I4=}atlMGT3 zsR_Zp2M|w$0--{)fLmwcy>Mc?eG<%dkpMgfmaxi=`bEUh&=lIH&G~17yAxnj+H4z$ z9)u)u#T?PXsU=$!E~G7bbNmi2Dh$sYwy+_uFix@&D7i@c^9N6S#_2TFXP_htm#L#M zy7}7kFKg81_JIetIjfQoJ=ESYDbl-t-UVRM6@N-k^K!AaM@V3WF;MYN8s8ChB11x+ z^x4afb!A?khob%*dRcdlL`b8%Quw{`q^$`uuas^t@fO?SGhe8AFGp{8utyRJ9{iA) zwh>TNlQ;@EI#Eu^=r5Hb>C`%X_VZzRSyc&*_ev6B3MQRBo8g25h zk(gX_jXRSu2+_^c{zRBj9VbP5nK6GATuVey4jZC z(}U(GxvM^ZX7RlPMNW6Jzj8`L1K^&RF-_e2!6>NLg)wN#s%C83=6h5=uLi*)a-W3X z{y-f{;m!|h@B82LJpb`K8c?!7H%{~KBy;}S<)M6#cfS6x6K3@ff-8m21%;F zr0>7Y@8AA){UJ!fJ}6w~;{9vE`ip-h26^ThD($;}@#wGD;(!GR$=M_4M6$n>p}(3> z7)XO>&RcT(?GMg~fslB=6Y7ijPZ_uWvJ9PI=EPt2>VNwK?=T=F;#!nea=$6v*_WY^9RDJh`QBE!V}$e)DHD9SJ2z_gYZ#0_CZ_`HFzs0XrPianzFo z6`R_ln)iy$9KU`wK0gVcsn}|L(LTQHr6m{5J<;AAfTpBvJmX zllu2d|3BtZ303`Qj~vhb_7{)-QUCP2d(AMU1m^w!kImpqYZ;WdqHF>Q6kH(J6aE2^ zO(K0D%NN#$dB^UoR?Pp`PFBXZ{7Nhbb$ok((+m7njssNyP21$#KgkoC@Vz=;*aHVV zSdl4-0ZJlQ70U=!&Lo0NZQuikHvTA5ZaOaz=2 zgR;6wty7)l!q?FMokGm4zyOs*cuOER7n3vzBZ6W;?06Oi;>oO7DB1X31+VDkyDolN z1dbY0K9x6OnQd~|$`Gw2w2>2QhTX+>JY}5$<$-Dmwe=IO8=!$mYyKYiw;-?xqFxt= z6T_y$kB2h3EI!3N=F3&%Wpa#>G+`RLJ2L_jB5xnfx(v2>^R=Q81>y+Ka@m2{imG&{ z`#i|lbplQidb(#gd$q1E$S^`V;tfD!81e>kR5$n`JNhT^_1g|7OsGGA5Z$oQ5LYxv z;Qdwe7pp3umk5#++CiEVF-Y``g6gn_m+PheopxTwMj^FQ7dsr?>*F2lz)A;cn{7Mw zyHxSIW}(AKG*%!|MbZYv86!3Op3A;;0)fCTME5XmK{;~!h>_de%}#8Y6fp1h&^zh6 zM-5u9ofFXX?<3LTANQf+iQ+?a_8ycN4`Qnl$b&+GzZ>2_C<(-vF^#YhCL!SCqCe(6P|>nw2O-Nj>Er$^nQT$-W}6q2Xym zmASgSFn|5M9^}-wlUb({i1C}!uL#Ea1Nbd@43+4K*GGxJPRIj!_!3aErPe)~r;C<0 zNtW@LP^3%O;+5>Xu3cLZ_oZ`?$J$=vbNId~o`3(}U|#=Of|df;)i*l95*Hil+`DYo zex*)>3A$C0*F99+`typ#zFU1=wLcg&w?b7rDWR= zn4o_2sg3}dCnxa@gx~^z@jps?m{I~EMzLeKF_^YSE17~kl{sHPRZYE>`!frJS9IbvK z-00zF`~8A=fWnn;_&E9T>hfIP5s%c+cRytz5EKD7;u*cw+u-}h!a!Ag!4Ui80n5aO zT+8*<)X*lhojJfno@TF_cq- zZxgD#d-S^8iVC@tybawr9baT`L{=f;LyML=xzj7zRffg7>oBym)a8}Q0dlW~z9l{6 zDz`du!cF^;Z3i01kF!sey||p&YY@}Gbe3TbK00{9xdDaBoD!l!~ z17O!5lPaBZxra?{fLaVAQ?IBpsc~kB<5`#NkV!r&-tiDTYN~E@GJ1=Km>!*oJGVlf zv;{A7ce9zl>DX^A_?ZQAJa&1O^7l-se(KH`C3?NJ``j z;#kWndmsSF85L@7+PNqUiZ41WSxIA|66iwVr*@qK-_$%^ z*C#nNdi1=c`BFG)xI)F%=8U`iN;q9o-Cj8tiJ;dxW2oyP2?9mNo9)94<-?Lv|21SH zw=#lPT~Nf}s>T|_=A!${F_2T@4*)FJcthdhX7|fI$~K=)u{ItauczuUoOZgR?=d>7 zOKNCOyHCp^5!eh)s)f3A4holJp8Ymn5{yG?z&ogXXmhk&6({bs?p3OduQbdzwD+r2 zU_CoJDpG9g7DU{^0AGa*avj~D`FBeD$)&6HpkT4#%j zB9Csrpo|p8!x)`xVrkR|>PCm0MTqeycm$(jKZt#OZICy&PQm>>}>3 z;l1QBhACg&(X?$l_0>tYki*N636RXfZfEqBE`wWQc9$9?J|GxKgLF6Jlh)C?t`mqZ zULX&afg&b7J{el3L_ip~g9AVkL|ACB2R3&nN~<&6*pNqznSd!S`Cq4cnUV(*^w+D9 zhf!YygS=@8j*k)kaVt6?z_G67-i{-PyRi6tXeckg!qFRmUoeb1)Z&kty%>_)Sr|L%I-;RGDcN{i z_}OD($vx4f(-(VwYeYmIHr<62qgNy*UflDJsT4EB{&2vx7#|e0eKnR*0ObIEkI&2Q zf%URSkOtzy+1e;P$l+8=tNuTo#6+dFL7q{K$QFM-6q|hx5eaEZ6(l9O=^=QL-yz2U z$WzLs)LYs$k+f(X6L0>XM<1IbH+ji_0+YqRvqC7e1CYtqS%3R7kPa+R3FaY+>n7Gg zD@OiI249p0UlITDcX#~nKuy9tmVCO@eoPTy1I)S(VLEyM2% z&%UTSKPBck`%)%F$D7wJbJwcc+_X(dOLcN+u1sm5lg%~40R9>dg7eFc)kLKE7Dqp! zSWdS9|K4SE_|?{(mZ$75$)iBo2OK{u2xPa4jN5i~W6t8KoVC{|OBS_=`6Yxr`i}2L ze35Nz>^A4>%;YgH-1i-B^{yi4+VL%SeP9xo%Lj5~O`o4bWVYCcqQ;}??0zB6&tWDa z5zz5o%o=D0#$z}Ivp3WwBgelkfMOvwahl?KJR5;TYA?+&c3Q(Zn}l}O((At^+b)-Ls~~mN zV;(V$%mO21PWd4`*BT#D7I&>J!sz)FW72};iSfT48ldJls;WQrj|3P zj+L-O(Yr%)%OreZCLG#K_!f-t?jX{4^;!Ct?)cPux8G7^OhzO=4!`ca>COm4b=Et= z*p+-4P|(iSsOItY5Y&2w4w8+`02Bi%@9YRbVDp2Ywex_i_Z`!awtAG~RyC1rVzH#S z8x(s^1C7oAp?6!O4;BJ-C7cRiDWXAC@I%9o&ozeF$9Ttg2T9^<*9Z7N>hvR?FziVH zcw2-yN2hVNCxzVIdoK-*imicp$p>eQ5APZ|EWS{(R%LeQUFJQG!^xKRm;1%_J9P-D zeUK;+0+R1|)1$+)hS&ThQ}y3I8uc^7C6O%H!53J}MV6)7*(DYn&D!`UQT#uAz(;pl zu=TodxjpDCT1Y!i=!}gCSfN0ObKYI^NYXg`dwcSxc{@ODl1gwFK-EjbtSBo%0QZ?t z26X5K zx;+WBOqH;`{cT=JOu*>_2;|Ezp)6k$l`U1d)~gchIM6O5Z&jM#YwQ&9)&=umw7R1r zz;kKRJUX)XzQ!8jC^9C};|iF>t~Dfk9%I79pJ7~?uS@+8pwKpH^aH8HV44@^5Z-c9 zTu$@Py@QWl)-epWja=v(NO%c9{23*(^D^k3G~F#Tz2_a$7#^RLL9}d;uOS!YP;n!K zb_~Aj!Wr-HxqW>l-}N(p5NCv@c*oF1)E|IWE67Ug`5%q10xP}e)PhB9jN`-@LPzXM z@dBBOrlt>~(c8t(zY|d-MdAxM{U6-Dc{r5+zds&X$`YaMOG&aX$uicm6&0fFgvc&C z!w^X+OOz3^rR)r{FEffLW6L%eOvpNR#xP@MzW2O8%Q@FM=lwh9@85OxSJ&0uJ@@^3 zzFyDgW4mRCHCFPUedKE$)uodQX_GHlsyg4Bz_;?WI?B4H+6gcZGJnwZ*S(gt`8Rv7 zjt?zQ=zEC)I??Vq`DDe*FKU>cU76*zl7-gjUzV@AP0v}(eqH2B2m5!`OYs)nVnBB= zd&uv?>D$Je_vn~C^iN;1Pkc2^A$P8WwPeQe0?ql(^R&zZFTOI#K?N1eK7fZ`nW*)&On(?P#K^!?_!-wja$LC{Jav@>bP7>U0`tmUu8uRC9P1wI1H-+M}TnvuIW0XaK)3cwcu2Hsl zf6jchttmPiK*9<$=*Q~{Q!gvsm@sXFvwO@FHUMv5G8KK4a{FMckRBdq|k7=uzl%u z1kBB<$n4DD)RA3D>2s-QVuugaH7NM}3{P=9EB|?fi`_R(RR#<&=!$}k-0P}oPVN9! z70&kS)k5lD;&%>v!!FRcC>glAbzb#NFnmRAr)4j^ad?);@#Dr*P+u(En1RCl1EVsW|kW{jCWMi2jqj1JA`7b0el5XWHF{ z6>jFw04RrhBge*PtOVrn_jyK|lWZJ9CzSfyI)2YGrJOEF_5b=d-t83sijD^0?{os% zrsU%56Hfr_P+vuFH5>`qeFFx@bZ5A^*~uMA;N|ovwC(TIzPvc}w>c1xL_2;fF^DGK z?W=RWth(8p`5akspT+n)^46zPmp9)_kTrm+Cg^w|Yz7D(cyEx9*UzOa4amZNdC`T7 z%YXF(K$F+uDg0^CcAWFfdOY91_>GXFjPlj#c%qBRSCInTM~U?d;W6O&3*2%&FPV6< zdv4nPbUBF^&NZnWchY!k!Ko~Nm(s;)5?9$UCF$VM&dJ`|`ORh5FkieuY)v&-sD^e! z6RA5i;F4i<(%(ulmPogCO4|Hu02k13XlTDzLJH4vvCF(+@Hod9$NK4eEw?5hvV8w= ze%;Ja@EnW=ru#CQdhV>0o{Hd~wFLHnaGRkzb4OD}t0Nmw)aJ5WhPl%|cZI*9ISAJu#- z|4Wba`=Y@6qa>*aw}Pv-=R#+jPKZBY$rXA6NpX6K2@7$}WJ>ON|J>{xibQ>;fVH=< z;%)NTOoe*`*0K+uIQX^WZtV2U2kcB=jsXH@_@7A`YjLt4-&R|EM>YSaASMT`s6Hls z*8DzGldVhuGmyItD4^~?XZ-}32}bEFoIO2m`vRaD7jJ)KXiJyp7d-jpI<01n%!Tv1 zL(i&de$n8&fZ(cmN445fkIO~(D8PzgAG+YK3fz8S2BRJaFid;df#{v+j9LBf-T+i( zc8t#SxsdIS@}Gd3@H`cQ+6Is;{R`7n-01st&E}X!Sy{MLZvD?NdwvX^mEt@=Lch95Kd#i{b&{5*>om`Mt6dtu z>M%VK5xa-$PpE@n=L@zJ2WJ2fAYI{kD3Ge4$#Nr66j0?p5-fkDyaDXim_=oQ!73hK zK$+~A@%dRylENrp{cm26QZWMj4Dsj!-52oyZvfhK{*Xg-0>&i)G09*d{2`D2>1kaa zkFywyy@`%QJh0Kz>5XY9KMZmX&jipg`6b2}`Y*q5kplif$B5hsDEppBJQ`^FI{_Cj zXNIKn4VfE^8vO4Va{0rM*6sz*C7Jly$L1tfs#-T+-AJI$T_oMO46?87Qhgt9 zecx5bbZvv99@Xj*04$1IKb(phMsChY&^pl{C~HLo^nca>vK~T2hORj29+D{}(-Qm^ z;A54!gq`L}`^=OUG5p4mHX%Fb#%_ScO^3933^mo@CWzTSBdtQ;VgeY!7T|xre}2<8 z#GQ7=5JmnoXHJC}7sz@nW;eM4pa0R9*)-{5e-0RQE2K6a*|*%Ny=*y+@0EgarDaf> zTpPd1fywTuc{>kUWo2*p!cld}Y8%_61 zKfnrp!+Aet0h};b@`mT#JYN62^`|F*8Z8Z&%jPI&RcESU%-X$awiA6KLWYEq8(o&yeH^`gu$=#}+a zq%3_m*NFeXql*tNC`fasO?J?CYp_z!5%9Y{MY&4`lU6+(qK~Pu7*g^FxeW7{7=~}u z?(0pq+##WXZ;#HyyRf$ezL;Pihhm>NzXx2vpQ`r!5n`J}7wFCGW2y(-U%Md&7*MV) zDfEBI17vf+2n~QRuxg;8?k0TSyO?QVo6lO@E@ld=(Y`G7ojD;iPj+Gg*2GxtA!e-2 z1+0J%O>#Rg@7;{^^jF_F1d;$I$Ku6vqnQ$_Wj7a8*TC~;7liPDoq~(wx*iJN_4d|< zv%JAMDH@mFLRt?3g}JmD!=Uy+7zq95+1%*WFL6YT%v00BnwKVKyf%9i9WMotqLGW@XJbII~OPtC}ig%i)&5Y&K~iMfZYvV5K( zv4j>-%Wy&AH&CDwxox`|&TVWAP}2V&@7R{(rdvbz;$`{iPc(%x#_A9C)_~ll7PVyF zS_U;*`fg!fOFh#!eO*b&dBp-I7j6U5L<{O5r3#d5;ZQ+ZbwR63dZJ?*H=iA_7S3+VOjm!S5VQsqAGa@n4bR~JCfpOZ@-u;8pk1%!gX&#Hr`rM{! zwx4N3F}!QHFJ5~1(zkqg#mMEKH2T1=+G zuT5mAj04}3}E9A)>N-nxNtldl~2C#rckEplJ`wC z<_d>wi^SMcTRO*Z~1BdJQRb~6zc$uj_UHNQZTjR~PfM%o+IX271k+5_fveSLvq zgTo*jsWSG?nW6DH)-KcX<7lgv!~eHKKOTEd77WhQ9J6iYbNatV3DbYLcCl0 zZP(tM-`uCdXHW5NiM>>lBA=7(MVYoMzcXXReo@hjCMCMylm-YjHAeX|H1X#5M;b2M z7w-yaycVIIPkAc%O6k2f5kQQ3mT83etsc<&3tR-nF`r^PZ{uFjRKeb}Gp#ce2=s^s zgJ&j6s#E6*7@gP#uRskiZcC#9fX}d5xQRi}c-ka#c~<{K55!#dQTtMWN7EhJaB)CV zxFt5ha8z8=bL|`(n-D1I2Fu@z3d#U7@pUMJ-kC%R#a|NFobRhwO~G@M?>j4rYSVRt zBC_&4!-E}{Ir4Ka-1{x0Q`dQqeT;So7^i$Mw#+${DxGmGKa;`4|0E|G$i~rA;J7+8 z;vY5O0_X+VJARwY=<=XA`OUM=Tx5u2#k~q>ddxmyhW{df;#BrmVw9;%{0`GFlA5siMi+qh z0<7HKtpARR{$~#3J~hy_{m%5>UtPYA5ip(^ef!&a>tB;V%qu#1NowfK9PoBi{HgQ? z)u&C(N<+*mZ2t4wO0j_2!@yLD^>22!f8i8EOLR=nZyW+i1f9U(%LS{WX6pX#h$(PH zMEOK5I^_QKB%l~&lA~m-6+^;6x|K%C{>plQFoSGNA#<6|;_e<%&6tx9a zG;@(J^xxO`-}E^{o>YB`J#DVz|M{+{k-rvt3upe9$M_$3gAFx1w%oX5?|)e{|8cLT zslda1i^T_{=NGD^MiN4 z2E4yYS(-t)|MOi@L!%`>`oH+Q+2+6fs7k6jbWH5`(f?Tfc)@>jbvZgADDrQp4 zNH`FrZNHBo44lj61a{P4bfDFvuX~38;>ilcb|()0x&Dmd8ej41|AchJWYO{DJ^T<` zbK2W5@jlNbk(gd;Il7sLm^l5BE0A}7bm)5c!<6SJ@DzfJ%ngf14Ec5N6j1VM3WTFw z_T?hl_|@P3l}k(|G`D|*{PQfv9M_*@J5l|q+)pYZf z2VCSmI0hbZ3@GMlJdv~DyoU)Obl)NW32h#lSs1R9Zwt^Go=e#pg-n_2lIZyC<@jar$Q#XwX<)TR1W!dsohYf%Ks`!1X3}TYaO>X(37F41g3EdtLoA z`JW2y+xA>%p;Ixi&HJYrgtDsdo3Fx#O`H-Rbv&i1$uo-1x5;a2?fI;4nD;E^01&(& z6e#uGRbViDzm00e*-Qx%h~F{mwwI3`&l*(<`13;*ezX0NfS6`-4cD4G^rxx7pp_Wz zgT@nSi1%hCK;nr%*@7~+L+qM0nkxM`SMCxqB?>pzmH}`|a4W^=9S`GUjSpZ48Lr~* z?LQqg@e@fZvpfLo)*uh~m!GpLPl)u%ce0G#UC=rmqyg+r!4GKYSP?AzXQK$mt@88F zVBfY=;7p>FJxJ|F@%tN;njRLHyNV7xsyIETAr{UTG(Yj=1s03nrs9~8ly2k#HNejo zeza7*1t-Q=HqL@V0wjI>LWQhhTY%EeQWbEEbpq2aFQ96GRexiWvWT><688Z$w#ol0 z_{?eg^OUWL4X#@ah)U@c%dq31^7PdJ*8kj*v}Spq4d9Z!I)SV5spIa8k}(57{-?Bv zbN)g!Kp4kUL;uzRK9LHmFjE7&HE&rY0kbSBPO7TuqI69chZ8W@mr3-MWLc?%?grd! z{w!Dwc3^1@FB1~(e#(Gj(Y+wo-k-~AR~n@W*d*rw5;h^80#G|@eE`0u30HPF0JIg8 zPXOZX1uzjy@Fg{86ancdRBpb9$T@en^zbwAZh4EYCU>j?cljzBuovIudLVM%763{{ z+?$Es`t80!?e0O~Bx$`hB$XX|`zhyxOS0NL{(%M~U$}k$&DSxdZW`KIsi@qdaDZ<% zJljK6lmPyuJb7Sj#Vg%P`A^Nl3Ih!xoab;=Zg-Om>>1CgM%D_zmQ77PM83OdP(@W| z%>nP^43%sINHP^?gh1yopsnJ607x~efml#||HvAehSr``wdBtxFKX}r-oXs04}`> zQezxcc`Jh|sv1+gh`mDw+!JpP5UDP4ROMeM)f;&H1)j!VOaC>yGPz3nfHZ=M-2@;*jQ+^<}imW#PdRo*72$z7V|HHh53 z!*ejz<21pa_dE|Fc-|mNMf!Mm;fGnWkjn=JU5Y>Ba~jSh{_d>cUHO6Ok+>N1yX*v3 z0eK^h=pdmOfMYj8tEZUaq^P2RPT=hbQX@XypXNV9GDn7XQl*OkVxCUThKqZC-loJ- zL&ri3yr(AATxluTaDMY{=+&}LVO6>HD$p>Uw;oCV zk6!1BSVyb4!v$?%({`6$Bpm~=7u!CGnjfobcJi4AR3~pAK+6)}zG9!++e|r?M_vJ> ziyR2iLvaZ7a*a(?GI>Dr*?@WlL?JdXQ%7gbo$ zb2z+{9q=&oQI~hX5shlR`90^J>xeb&gA<-`E zjzsYX4sna&sEq6n@r{Pv!sT}=K9aNn4ez|3{+gCcVWO@-VwCBx&u}5=41h)EWFb^Y zBDjXl?}E-=jil_iI(7A6;?Cwl_@j)-22p#uc4;5?pVDx$opI@xzs2M7(f^y(hzSRS zTxy9}^tH_S_;2iX~>n#w_lp>STEo#a=-=@3mhy0L+f9~WNexp2?200EjWWOXg; zUBE4{w7KScEj%yH9*EkMaIL7iR4+VdcIv!mEOXBO>owhpzl54p6%${Yf>lPu&+>PJ z20jP?QlH1oL%jg0aB^^gnTHA}hpOYFTHUEL5F2pZ*>UAqU@sl)AE&Qz(=xLxH!;N7 z?)<(JI`A_3~Vfc&AnTcS~7X`cQy|Q^Ud`y^B zqhlBUv>+i}12ED24BbxhwTcg3J?4OF8F0mgj?F9FrDIDpHrwKrdnEjgvD=ELd|p*! zK@r|?w3KZrXnCE-9}uYrg^SMFP3P@5o3p{*^#ai>0^+To9QeV+rh!u4BlvJ`9i^W~ z^pW$0Z?xU#!+raJt*Vfwn_n(iD8{t8gY-kM+Bof}YhtQmF08shK%((hz5=y5o3r9k zP(0qA5}zX?5Q0&W%X{+2ov?*_T;8yA?5c^R@}Hd(Urf$++k(eez7}BKyNrUQxrwVH z+dalilBN!|?rps6%PO8L`}Fb5&f3J4lHa>|q%Pwm=0(}8{e-;S;#`-oYJZkj_lJA7 zZB{itqdICzNW;hXd$q$a+k>R6p>ivw)fX18YatclFQ9TZ#VMa7B^c+oABwS(U!Cl^ zm)vzr&0^j%(rzboSOoS#T@zX>J~OZMI%O`hTHM0fIQLeD$pm+XN!EkjmxKFuU3b*D ziUF~(AsrO_Tb~*ho)kY>ciR|7<2x#qTr9*P$0}idu$MD{r99puyHujLVh3QvLi|BP zBNzC(dkjr{5c?&?mkbQv$PRazQ#7AHEzrHW6@P`-_Sr{@$SdA&C+-$r%o<3X>(N=^ z8hNVcs#I_FMxOrOS>(;v4f*| zhO@^2e+C~zEQ!_S-s{X7a>$l-d>?trk?6H|g~?T~QP|aCi|KZ{&y<2YW~v(=;bxR%FRw-83AA_mg;Rd?ZmbhenXl#kEn0MO(Nl ze8+A*L?(1Heq(F7((mXPzFY4-c%W53?J5jifZlh^TpBy5Du&`JF@(wyHqsE=9Y?}v zM-c84l91zP%ziWmXw{GLppW3i-hw?D>}Gs^_719jGk8Rp+!-A<5JIjqA=a@?!WZ|J zA=-~Hwo3<*8>j+QQn@q=OY-5D%JZ@9sjo#q1c;H*TVIybUu)TBx@V@N7b6{dBclCU z))pa8N`;RbK}9$f8Pv8LLN0CjgB^(J$8dBlx?~r%Ebp-&KwVL2w~zq zAV_8JydI=7PLnu|>T$t@*E^tgkKl5%{f!&#xsOW&QK(7Cp0{vu!!LD((ZQ?Dv>vofX0-hg3)Jo-oRgX1yk>L@;;I?9zf3vK%QLjO;~vh` z{wUjzP~?|IG>1l%ZzdVhdae*1kSJ1sXwqgD!5g{twQT&C6ch#9Egx)!e?;K%-tv=hQnf0R(pH|FF7H!Wc#d5~*{SZEyf*&NEL~P+n@Mc7FKvZUF?d++}r17C^ zKD}Ve_ z-lRPoJUEE(As^M?Bdc8?7-=*j?=8R9qRW!yJ&lF%h11I>kq=uIs?-~`L(n9I_kbdn z&!$Yr(`5625|5VXYzYOYe;psBZz&sM-o4glCpbzXH4j2IHT=@o!DSa^PvR`27A=?f zV`_~fzBMi4?st~BRusz$u{xqaS+EU{Tw325J0%OS;3T#be^5{L8USX%aQ=wp3B$-P z!`s*ko2~3h6_(LnNA{Om6eI&->l3~AA=4u$HUyij&KU)XJ;Gm?ePbMrMeTQ#L*-dpfLo>1YfS7FG$<)w3Q_|4aCWTH< z;V?U-tb(o68GBL-y*EsV%MS6Oi+lAj$c9E_WY6d14aNfw#U|$YhvwgAYB^ zV76=v+*OU(YZ+{avu_YA9B424d}bYzI2I!aOV(ZVuuwTv0+}EQTZaN5M`XWKc)E`3NB)&uc~v(Muv1|aG576Ms%q^lE$#PhYMOE^ zWkG{d{5gf`uLnlfaBGddPf+CEQAK;)u8pt3SWV6sAKm33RAoZ}xZI~qbEtooU}xV@ z3Ti%++dG&iVB!TCvxJPJ@RZA}@`aJ!7rndrn3q6jj3^1R`w#GT3zHVUm^kFW@ z5E)GC8COhjBd8YZjdQ+hZ`g=E+^g~%tL(AU1{sWfHMR$*9#|2F>pyt~qAdldwC-1- zQ>qw!P%Tx~6@z74*$71o^}^jR?+VsZdrfne+NX+krE8od%;22VEsBPg&No+Q12yG2 z6BcpX8^uFP8aW{fcjsS1++Z>cA|4~fy{EFE;+k)2)f z=3b%W@QQXqpf)~mBs8)ow2vfCT9^NO1UoLWv85IrR z2_z-r)S-4eI7bT3ybv>GvYevEA>CaSBOnw=@9}TDq+L*li zs`cd~5^qkiHhDo?3s1vs=xOmzmy;h1(MJ^%6xKv1QNoRz8C%m>kVRe)$sHGbOvX@% zeuN8=8{KgDYbYwn@&QQuCEnCB6H zja=1Qep^iJLCp+;37Np~N1l-XhvQ>k&Xg#b zaJP()sdr};8%!%+0%L7vZ7r(5Fzc%b&)B^iHHCWef-4cBspmdPg|*W8gFITS>Qd;x z%W-_N!|0rQk7X#+*@9ylljx;)rQTR|G(a{D{BHTWyMKxomU{VXE92=r{5IObc1XXW z!Ei#6ju(p-@^)qtB-llSFgolEElU69Ja%G=PMO0o-}j>vBt&@(??38=wn)~tGu~i zdhl+zVC7DkpQ!8}YAHa-ZEI}(5TpDo%Z3Uc#t6Shnd_FGFg1_1r8RFKR6q~>I#zd_ z>ZbLYd3l_wy=%DiDDHJd6zTwmx}P~vuwYs(kuva=LjuGg_@&-(*{4%E#AS5wwbS0{ zVHxK<8>v8BDPse~K;F?N_w4NwA{IaXNly+|@kuY`OKRUOsK6L-UYYxp$K0#J<=|Si z;Ap*d>y1I=iiU#;sgh+Mmt+-c z9zJ5d7tD!Kqi|~PaGG~M5Al|=9Xpr4LEAVf`;&XQvK%4YvW1Ma^q(o&Y$*${hlFN& z=`#Rb)!IO>2P&yI0Bzg6(nef}K29;o*+SMYb0M*Z4=|RwQ;kIf_+*ms;mPQSRhHo4 zvfmKYXw@XzJXj|uK}<(FN8R37HbBtbI;S2sP`_fxb zR@q*bY@nd+k_UW>t$m>m>z;5ThtXKjk=8>u0UBx1-ueNtY>$EeW`_U%jWxt-ebO9P zs7wqN^dpw3pfkQ#6ywWOTD9be2&=sN7)$YY@2r+>3fmu6V->f)Y0XdyHBgOihglLJp8CtLl{}h(+Gmw8*$(AZtBHt zOfeuLoKKHhl9&j1%%bm-Ovc9SMb{W@ofTUj5$y`oxwOmt;Fmtm z(6=vNe)>RdXX$JAzOxcR0EVSBUK_35RN9pqPxV zcqS9;G5T<(r(As%HTT`Da)$4Jpub@xP_i?|svol{xkQT$zJ%hj;W*QxiaUb?HQ4&n zG)k_;o(wip*?wDv0lSqEbB>qH25Wh^|rE=ep< zIwrfJJo__ko4qu@Gwf1=T|_J*E=JJ&dWedeV@*YBKQ8j&X;Z< zB(@vY;Wqio`1f%I$&Q1+_lopfp=-kiy==gGdBDgHHX`PYos^VM=+ASm7ejXJVSzyN zop#pxP1SENv}2~kj}iDe>7SJymOUzs2c?P!@hWXj<9Uw zlldaWPV7^9bzCoY0$<$11*YBD^l@B{L{IPhXbi9>4Q(-dp4Wkqt!eG)udDmg?-3$Y zLgb=$UM~;c4;4(Cc#_#7f#fIvJ2$USbNDG%2>_q1g5TZ#5fbt5?W%d1zPjA8j3OzT zzOeRcpz)w!p{~lMp)v3%dN~v+@y71kot_dlpvVoO9=Gp?xwi^jDsI~20{R9Sk zsj|j$3tK+&? zOKq$q=RWTKO7g<_qB0s93DswPjGjoQR@I_|GO<>{uEB07%Iw=Ib+dA>)+m{t(!EBA zw=g)XXs=rN#z;7-aV=)C=&%-*qKkI1-}CZnU%Cd%#I7d?2fLznrfkH{%c6D;5FRyJ zF-6be~Q2m zcc<1^Z zcl&*Cp%J4M%$hKDj3D{v^cZUo!pTj+h)9TP&VtWlg*3wN*UxiMjP2{Mx9EITV#S4x;n#HXm^#vKcC(V$9_NUz(Y3}1s{oSTL#oY!R8*T8h6JF?IPd{xm< zWnmT5ANCP^m1)z>E=<|wQ+$8eE5EC>^x48kslwrQnS87z;r61{)x0K;{S2P&bnhB$ z`&>=qQ6~93^7J!M>~3R4G8(WuU!?_SpOv3U6xUQ)NElo+$v!9`dDHMD6tUCz zC5k5o+8=f!Xttrje`EY0DZ3|}7{J5ZnSJ&|`EmAuo)WN{@VWt7cH#e4L z1TSksKToF15zbBN!}6|TtU|T|_^X@7T=;@tp%d`T&JQB_O=5k!zmvN$-JHjPyn?KU z{WpV}c8F0tKY6niBs<*_+p2jB@Asjbazp7XloxL8Px@TS^X|UWm2@O!D0ZcX_LaBYxGJR&-K3kQ)KV%w}|Y9$C|G4@x|H5sZ$|7Q~0w%@SW6w zX9MlMetEWec`90U3oq=^2nWmM&6|8pW(~G_%Nsv@!zh z(OaHc%Qkmit5)B%lpX~Lir(+*DX5)P3r<}f!fk-GDq(xBN^UBSyJC*DyecPDl$}aX zmLsK1M4P}1ehW(>S2@g-b1X%}+IdflsgN7UPKjj23`A;}GRj;uFzl*1WI=KlN>H=Ksy+DEOk;eU)%9nS!d53LS)-GD$&)x75rSmhf&?^3J73N^=$CJEUCt?)|hKGIG**&`*%ENTfG(NjmV zL8TY&h3|cLJpR5{BV@uiqnhl6wZ}nR-7nSmG1T9h+QQYhfP5r$v4kkK%l{rUPuZpD zKu*lO9`#d=)ytTDIkkiP96YFMAdeo6uV}Z9*4yY+;8>q#Yw*=>Ow(OP&vqz$oQPzz zM?8SMBNmn0VBm{H(p%k!m%a?*O>y5XmJ9b^oa}O zz-AcXGCI;6?LGsm4Si15`K`=-Wfqrb(6=wR6^VMj#}Tq9r%}McF9ca8s!)!`v=QM* zirH%Nj*r3DTlplgrA+Xr zzUQKo%4AiIUl4YiW8H(-VJF?_XPK3g=iV+4tjJ1YOGUo_h(ae8*DaqR?u;%9e1N4s zI01fy)09yo1+>_uL3$7p5k_U^xDBb# zv#~(?megMAqLs^&hmb^YPeL~~{M4|)V?S)gYxnW7i#ez~d5#>L2_ty$Sk-Ql!SsGB$1esOp0kJL`-u7zgxDF16qjCxP&-F0K{E0YC(nqyH5 zrKPwEl_;$((34DM9cKI3jj`)9x2s@kq;{X)`dyI0aq}JaQH^rm-t?)rzVV_0&zmamQN zR@qjD;%H6DM$?S{jN;mK70WH}o1J?9>VdA&$#P79{ES<QJVKesQxTE#Q0&k`Bmv2jJD#;M0F;t++Y zKT4tyaoLVX!j#P=$M)aRngD|TCq8;F7}I)`BuvQc*Y%@R3VH!wK&nris21{fYG+&s zW(>TcV={p%!Q}Re22EUt#^-^Hiv8M&)d_i z*qdL^qxA`X1get;y{gmHbV86lAucar%{vq1x;rw6_&qlSb8WVqsGo@ubeRDsiI*pO(F86QSB8A9fffb1%#}kD*5lJ(#()9HF zPR{Ds_Cc=OSFTKq9B&w7d2}BPCJoa<>7B9$(1T(%1>#pwqO#Ix{1gfva#u;hBYy*( z%Llo*=c{ULWK{%7nrmn*Cbl}syfm-9PRt0uK7Rl~1v%)HZ@`>}4>q%HqORk|Ev@^c zDap|Z+UOX>Mn_;$0SD*!;^#c|h8feXEKdQ*7CvfgdSZ$@(OSim>Bc70N?ql`HF=fV zO1WAYG#YbY|HyvBT<~;KWbO2$bTx+qkDlhe5}n}1){PYdPjz0;nkJwogOe^CLal}H zq&Hph6->DfMZTV&0TdDxzc5B4958J>PS?be%kjB0M`&&yd>3%T;NhQ1p&XV z!LSdPzk`+1aJAIKI6yi>`6nFY-WKmHwDa-cN7%W+B)vAILr<9~zfbYNq;Eq;>XUII zz-jD9d@M@+;Hj%5lfX)>^5ZTYX@m&_JRM$l%B3Gu-$Ghhji0iqN7|V9#HQIPmUBAY z3bGw<9BgTL>{{aGkNa9x3N8bAVd{fW8c~M-EJKkeHvgDKVmX|!h&6JZ$o;QHLSub`}8^ayRm0N zlRu&rp&q~nuU!)IW!bjjJ05NR&l<#krysw88yRoJ=qBO}_bIMrRlcoj8PW{q(@vRS zcNilZzeEB2r~7uD zOlUX&iJG8<`aiv{2AGS>JVHel>+MSF&r`>x_`GfKE^MPIHZbzR?#?cJS_wL zd8WIeE*8sSDtEE4*hW|6hL`Ck5kma+Q*HL7CZqUTzFcYne+^$H{n1qMMpu07hED`+ z$SQYO+M~efSPN&a9W2DuIw)-}dNSyj?2qAXGQB#}`TFDoNH|BtEpKTyXg6@)8<;!TJLe;hpK|j!bIN?iWq_PiE#E$X&^odLvF&M$2=h{eJuGXKnA`Z z>|JULO%*#;{P`nnP<;7U30?oKJY-DU1sJG94i& z>M8`nRBbff1Wu|dKa}#}45wp0<9nk=TTw8rA%mq2rx!R2?dzn-8RKj7m@0B6M-SHJ zIBrJ=4LXWy{}AOM^s(H}jl@1h2t<}&87H-?M8I(eherpT)Q)sg>l*}S*_=3VFIn9L zJW(B3X(ok+rtz)Wf{}jOw}%7D7fezos{c_X8IzU(rK)}!pj7?qpk47Q6gJqz@%%Cb zn|?{9?Z}Hf*bgJkpnQbhO~`9&Hs(xJC(c)sgboE?EO*1ejx&n8ohIZzCbNem!VAzb zW!I661VAGi3TQBUfXTFyKRURAdmUB?!C0ba->zJn=erS2x}OhU+%SGPfhWq|rWXMd znv+R##9F!Rx125R2flR~@cqww-viZa3LsMQb{u-5JX|6U4iaWAjh6SM&6Vkyn0cV= zLs91~R?=y%fg^_Di=lH49Q;hax%GJ}>NQT28MH?91ml4#Iu2>q6U14cJy4qzngu)R zlrQFRAC~Ui8*0d2Y^iqg?v&7c>Gvs)0hc#dP{ z6J_6ovR_L~Ax5Pd^zZ(8($kPl^nLZOQt$%0H~yKUi4~^wR)_LoR@0Dl%gY@jZWJ98k*k2e4Tf{CLWi+y>(tVXWQ+4;VGgS$TDFKZMs z+z&IDwe<$uJ)9bHyIE`B#^1epB3Lwi5-5mAaaE0Di^%e~Ha%cM?o|VCt0j!38+38K z^^5>mda(T2fta#1I`9{EBETH-p!gngJ-*dU3JfmRt1SYk!hakXV(tP!6@Z3nN-E72 zjZgzq!eb~g_wkTsZ|Bb1ykMIxK21sH1XhXn|Bx=w&^AIl^l5F04PhRB_}_YX8RLsP zyGMk8md1^-=6dZ6M!7RNe%6}=qCN+IRR9DVOg#E|!8G;;Bu(rYN_@8SS!-+~PIJt1 zvQx5MF;tVVcyUb46yq;C2T)&-3#~Ae=G!PDbXD;8xgjTk9B~rS&Z*puN4!?%d=@J9*0?oz_^YU9KPx%{yx z8JbjxZm^t5?D;rYh2LXMmAG);>}4_K^{dl(Vw#{NQV`~#P=hV{6fQShvRcpPNE&m5 zZ1y9;wjuNbavhSy-=ROhH3`%gGi5 zKKBRmvhwj>GKrrG_%(@rO82_v&3d{AjoPi# zdw9l(B)@KxxgSIgv{40@ipA@L-YK;eSKX|h;r8?%vQ z*hDJniS{H(oUzh9qdb}1VfWr7`AN0$&U7p+?KvS*ln0L=cnx7&l*NKzumH>GX??;I z=pykyTB`k=EzI$1BIOYp&{z zgw|M*9hN+;LtBevy|E9Ess^PI=!}l)Ej+8CNj*fU$Qk3e4D$!erv^6q2|Jiw7U*K!0upX3cV=Gya)apf_5Bg zWWXuP&{W25x*}o4d$ig~EM`OAD9+XN7rl33->)dR+J_v_=c2H)_&{QdH>%17p6fSL zGKpu3@!jPY{0w}JI3gA3JE)v+`PnAT4I^*&KPf70bF6#KZJJpI6uCF|o4qf+7}#^Q zq3y`Qzet?eZX}OmvQqBs`60$d7HYk({y+BqJRIuw{~yL9N;FZF5GoZVBwN{~2#Jz? zE!oGuFGC0|q|Lr3W8e3&Qz^p8GJ_dQVlcKah8bh-Q`h@?f8O`?{#@1Xcii`J+~4E4 zu0OhJyu9W-pXd499_RCUJ`DHvGUp}@k5)%Uu6HOxhB(uNs3-d}@FSi9Mu4;o*YTZe zY~us-P(@at0AJIsqj}~s`u6Xn)YUxui%8C&zKApr*(4rFDfW#!Rg^lqgj?{-dzf}e zSGtGkxf5!)kfp!T9I(654SLD?!s`o@MO?YiA~)ZdhL>~d zU(W^KB@d6*zBYW!JW~9|AY}2h`Vox0gEU^+6DrUHBnX9?ri|mz%e)16f20B*$+mfU1%oZlt;EaLsiKhp{6y|y@S+*g8d_^FnkZ~|1r4vKqT>2s?nIRlm<7zs*CqezV_4?t^;%M3 z>|S**Lgt`WImWz@cLgX%Wg&b-e<#y8OJc}>h1V}IuZxJN%2r7zh%N|x%EKens7YO)wyF{8c>Y@X=--G`U`MT+a{BPkZVgh_c-UaLeruxaec7K1 zzm}6Tcz}ppta9d#KW~c^+>H4Ly@Q& zt=Aw6rfmvw38MmBc<-V@oO2SQ<^0joKxgCj_0uB0pOOnoPvumk$f=nn2(Vxw-~6~n z(yS_;nW=tR9_sABfT9oJj4ebxD%epph|EB~j9=Ha;&p?NI(pl4)mEZ$rL`2qLs&45 zfFM#Z-))WEXJrk;Z(suWs0QFOn)lH^^;^GxT=X;nqzm^Zp=ta=?pYF45@TJfd+- zm~UI_-?_d!J@8d|%y9B_W2~f^@lYFc0n%7%u5H23-Z>DzSD(4 z!oY3KcsUhz1f^Z2#ZJaN)TRyMdnb7tI<;A!OU_kTZ{J4CYQgI92O)JfbDr&T7BzBt zcEKYmbMxdcxXn<7jZk;0Hbnpw_c=Jr4vp3hG&nJz8DJm2o{M*^8(K}V`MOkH)O>A@ zs!V1eno4~!czOCsTK!5)adOYAEaoiWCOQ2VuDZsPUiKsz=`$z3Mr%|3`NqudOBeUJ z0zZaewlQzox;D5VM(3nT^BCg)Y}h|&V$3mc;jn`1S`MVs7DD}AXI}0*))ct)YQkR+ z)=4F?J%%iJKr%z#n+n*Q9RBRn=`&Y*VD=pG;&rBQ;kG0xEiJrMo}!d+gH6MUf~4&I zbs_PcrRu(x3fXN%gti88-T|z-?UnbwSII3y1b2 z#)*Lov;*A5gjX0!(>Q*oxV4+_NWVq#MX0pMayi{0rg*kXv+*8K&L?$A!H4#^;ivC% z1Qt#$GRYFFSK-}JRu99T5vAkn<(3e)$|L=>r#;Wey&rzn8bx}pp)}Jmeht2J^oDBO z(+~V|J)?o}2x{B2Tk@QS()vpS#EEmuH8+7y%PdI<(ytF^ON-gNq_WT0^Va2(cM2UY zU!X>fZA7hp_w7p>6-P-(NNSuq*IdOtBmLIt+>F^3P4cbgqN_RGorI`w@8o%Y-*rE67EIzv!C@f6i zQ?6f)=XO`qrWEer86)$@x1*l)k`Q(|TPF97=BS;EvR1@l+$yBan&3A^*t3>Z)3n zQVnRaVxA&>yB*~2uY8GX^jKudy|F~fqgtOG!J-EBjjni8pjc)vE!{CS8Vm^r9CMR( z%K5PdH{xr#OC^zkBeDc;Pi7-teToCtp$c+Z%p<7Mq2hIt+!!l+;#Kg2 z>B*h_?y0NEc5L%@Ch>6pr3P=#s8T!ead`ChOmt%@x8-$`9ofR!~9LquSl$4tm3P-Q6LZXecVX;fTZ>@Y~SSD zcW?W1*#gXb{X4j^Em6hAk~8@GEVydQAm~14%kxcxXBNHU!u5Tsf!|bb@|B2ANlR~K zAB<1tZZ8?{WANz0!!Wc}+hUxbbfz*19?*8y&1mvFpR|5)&iYkRZly6hSw~V$Q}pp! z@s~#M7wH$G=$_e~LVOP5RTJ)GNY=8v8es=mPooZAnjXUu+Z49i+-o_c^85Ny*DGc% zfe*(`n%B;5M28c^!>{<8T;`Ta>SKW4n4Gp4#dWxl^%Hosn!3psoOs|bx)h>LJ!p5d zoaU49;|wmh6JN$n`hXV3&sJ~U3>yX!Y2PDlJ}<^Z!`JIp#8q12RcPOdhe$*lFE&ym z6*tya;+g_DrAGU%&nMuxX>c%JUQc{m9|K+YnaS`#`J$Mi?yP5y?Q6MOu(^6}%a2Pu%V8q-=yM=0cCBZYP(TZQu1vJ|;ub9*tF%q$-3<*&T3RI5{dXlV)chVBX zo;&al0$Qt{6_cpIJIs3Bh&_X;_${7Hna_k3-$CxC4&-&0hF*9<7h&fh+8t)TnR_5t zJ~!1IMeK6;pAu4@U{dHD|IuRW6!LDB<3wf2`=`G45X&p4?9HPiOr6#|qb&Nxd8Xq> zVhOGT3U_2mdItAubGgXQgH1o2+t_9WBxtogMedan2vlQ@Bm+?&)yG>Eg7BGXyQ~VqK~~rG_9R(g!NwK|6eP+D=rpKKER{ zwY-Q+sC+wyBuzFmzY6O&6_M7O^`EEqmXZ)jMIi?<1}%E z%Zqb6=bs=IQgCkAzB%r7IA?5$G8q8t&;cc-H8WQ00LVcjyA?L3&67q~#i-VPW7nZR zR1s?XZ10#4@%<|NYwu6a%RhSlm0N{H}>X&V2RZF zqz|v+-B}ht?4OL_RHJXKyM8!b}s2t5egjLpcmmc3=^<;<^> zO(E~sb016|wf8%BK_}f~zpA82F*w6vpX6*T%$~fEYHzmfG3g&QB%0TJ>_WlaVd^Rm z$GrQBq8hiO@;*jB8J`2mn0ViIX8PE3yqj_RZ(LN z_jGvrHKpa3yAoq}^?i&?4l*D2NMM{l%)+6ps@c_aj2bd{y6e7a=vzl4_#t!kM*<)F zL>H}zT9jM`i70**+=~7orK!B}d9QmnI|b5>EUF1dOz_s}&4i;_cZHd`%V|YHuY|e6 zUf-nbJ=bw6RqUnERQL-TxunQ!1Kh${j<;8zz;t*f)gIP<9`u>B&RqCZvPUa;3o|mP zg^~6q3YCv74Q~?=>ZfMlFg0nTSl#R^-Sk6qiU#~gb$Lj}ILG!As~zUkr;G1+zji6P zL20<3+&#t_$K6M}gne}Qv*5=#ov7OGSEf2C9Q1buN4V=^Z#7r=y#d`tb|YwCi84>; z^It*n)dQ&0>9yB)Zc`*uivlrj&IK0uh|V8)`ip_fzPcbv(eoA;f=pNCuKLTKkmEma z`09N5@<-c^y7x9<00rnPfZLhbc<6V!M& zvHNkz=;UnvCbd!wQF~s%)VFXne>Ka^?2Do&*02v4FRRAR3L0tl?z~XMV$?0SX@?u0 z?rMpi=Owt^xE&f_K1UphK@L<|_XozsfAK9{Or8TRPu+-}k}BmQ-k7P*s`lX`QQEcu z&7wX>wej=8#d!6{cVmL3`=qM|?&1uqKf~v+9mp@w?8$lA=adKBE1gL}R+|M6TbPBr zn*P;gNJE=_WajSkVJCuN|B-@XYyY{_Ci2*#2);bm+(|A*+k{-|Pd>ys*KSOUsN7vg zEp&GD2bPN9W{*0uQP^O_+lPye;9A$zFKvW%u9NwnDH44Mcl&wWUgY{ul2S;GHwamS z%#tcYNvsX{z4%c?ME0DPG5==2v%$$g=DEmhG|nGSGD}3Haq^w+q0ACSybcI6j}7n| z_!cv}`2&Q;v3QK?%pCl*pW2yfQWd>ik5}n5?SA$3tX%SduiN#eNm6p*Qp5JbJvN+X z#9DhH*=3#3nO1EU#P3qa=mB|&F2T(>%9kXJ5}Bw@eoH+~o_@>vfs}inAbc4U-Tq6d zi+?kv`40l7rZ4K?1uvQT+HQNYY+vlWOt2Y2b!SyoFP!%s*GoJ%nNl6o z*5NaRvq@PRfYf>i$aHJCEMFv79IKmlOo=o6U^NGqzIE;uJ=#bMH)u$%a^Mv%Z!}$9 zB4H0<%oP&M)4g-u^SygUx;u1nTDV(Qd5FoAxaN%2O5U!K(j{w|9_6{O`Dmgke5A>E zZr~}v$%Rl-lSZWt(b-!;Sj;Sc9>-0 zipL9lvsLs|m|i1JIj;7Z&{Ki4rP38kDeNiHxRj>OdK|20<~hFIYQ(b8CN~su#vAm{ z1&vCk5{X$n-EaNoE)podo$KEJ!uvqSs|d2;iep{*qJqQj!D#)jP|9)aq}f{-sOyP1 zhnXjEBT}a8DD-oxv!e)d28<#lXEMNmI9(uhQvH)+9lSd~OJwFE(FvS}nvN`bYaI*j zTsvs?4e{azr827OtZOWAwdb0AB+ z`wm1mtit-HdzmA9xpM#)xr+sT7}LaoZ#X>o{r)J?xP^LhM0@Wi5y}^lt`woD=M|?)URxkp zw;;FB0te;j2L$sFN`;eI_p?)HmS$Q1oyqO=*8wEdMKn0%UDQ1CQCr&bzO)NXfrwkO znm^QcT#xlR+NFhW1!=t9w>ByjNqk0a_r3|()co5W&W?)7k)9E6o|ZqaQ=BT0F|lYf zDkV2{c`}#Sv#~Lo8XYv~(Tk5&&Qr z;`%gT9NcC?-pYF%v=+VHyT$o1dh26!vUxF5#*GV6_BMPpn|vBn=E{ss_&%Kv@A_9G z+z;biRP|oLE1kB%?8Fferrf}1-%xm;@~sCo7be()cdCQs5^15s!h#g zb68^3kFj9rg7q6$V)}Nz^aez5gMa&7TD<)1EBVAjt+op>v9T$-4Rz73YmIa2h^aE{`vFs(1=V>pJf_< zuK54VH?S4`y35B~d+e|_R|8vvdBQu-+)Pg<^h*7)THNxlO%otm#u@~^-BpY8d7{Pwg@jqBRAi#<>6s?wc5 zch&KKw(38Y^rvHPAV--Kr4z$5h4DXgpgRd220?pTqJB=dI5CwnZ=+?t z5oYe=_z7}|%L#k&Ugw{Q^4}>@b0x!hHHfocUlM%hyt^+ha5nB{+GawUuh7WJ(g`DU z;UB=wU(^2ocpoN_7XLQ9hg{#2Bu`yc2=W7Gs6~#JrzK6e8V>&|g%38?7}wLVzpi?( ztx#753i(xvT5}}WSTy$p!(#~Q_OXIrk4>7E{F0MIkt1Tnut8J`s;U&!u}6(Q`4tS% zOkhfm>0i#%Z?GrHHQ|UiKn31vk)!f&&I*6{l1T1^qqG0WT8#N-OrK5Qw2j$FIJh%G z(ogWrzf-8c`3s8QMdp;mTfWi>QpY2SUy9J-V2$ZZ&ej)b7oltK)PFs*eOqlo8_e0L z`e}(i8`z+Y=Fa6?**kX$kH!a5ckVVJy%8b3aez_Y8L9kH)$xmMG_H11`%qhX8fD)2 zOz>Uax_9BQi-G^HOZ%Hy(lMCS0l2|Ifll(j)%LHSp!q?2SX$)RuY^HEixLLV)i84~ zGZfXg6Z0P0!)v?2GaoqXJ70X$hm zVL|Wuc?<-GWRV60^^ZvZ5$WHm($2p9KWUdXSDulSrOaxjfdZjyQ<;$8&{ikSU}8N^ zu5X8+Ty84WKO68-lEn=P|cV`<+X4(^1&%CCHE>*KTB4X8RAH}UZWPxz-NO7(v=#PLsp z%fZY67B}sEZ0iqxJ*!n~gCL*Py39EFySoZJXlR5^|KL1yf|6lLNNYbueha}}50LwLeFrq$ zl)3DFW7;OP-Z#8IKi=;h^)@_T-PV9E`X?fX?eaHRAc@q+G0aoKWPK zkY78t{3GD}X5dZizkAF7))<*20Cy#rS#kWAj_n_j{=Xs8?Vw$|c2TL}rvi@rd4umf zc5iaBZUgnZx#W;0^S)CLpQ~#=KlerCIQ#5A2JqiGMm?4_=Z5iyx%Yv$o}|AKDOSgY zFka35qQQ{<#s4_u&`a%?&sAPhaQ+_QL)e6rvYyY$E4~4RGGgS{?xRhw)X+E?+?o+- zz0Au}c^3^Gi{dZ;kY$^>R?^azhdlK=PqTm%F|MNw496erqW#q$Jor}R7P6|6`n&V8lWlj>kMYt`US zER7v)D7ueCi(lrr`RDQ9ShT04BaH~gsez{=6zTHDqm z23636*?jLbz3TJVw94pp>A5O+K6Jzg7F25SvO{Oe7iGA9OP0sA1=0tO$DTJVbNc$Q z(&_Z0Sf0Eq}{kd}IBwOE<#BU8@=EVah z$pVABg@ulETz;oMwfx@d*Y0Ww|EzZT#BWLN{%OUM6M|~pp}#es1SS^f$K=fJO%+MGeY-y5HR7nLap57GlMiV&d*K6@a|NIj`@nXrH?F_| zYh-L(2-s_#V#A{YzVTI|%=`!Z&OOt(qu7~gyWjr2o#bn6$q-1!ohM$uFh|<_G+hGA zfF)`qzqrvTTX1cwf>Ej-+y*BGW43#>ur}N0xQMA;2b_@T`Jr+LMydTrFEm$S+%%HJ z?OrBj?U47C^=7I@Tvako5=b9181`ZSK-sPA0TZQ>t(snNCDOHe&?!r^)FLRL2@&t36D-wvr)JopHEOzA!Np zw#B$%OCb?4$6@ilF9M8cTQ1Ri54p4eX5g=;m6psPxp4sM4hAWHC0+=H|skAu)c}+-?jw?^U4pbxF(D1X>(Y_0M zt}2A8&~IiQTX!W|Ma!-{5;7=hrU0gEscOJe&>#ue-`J; zNs!x1+j$%kDQ!fdSqo)<|2hqy*(mK#Vld#!^K&J1j4R-e8y#Tj&j%&!&Y3)ZD%irW zMVUI&Q!r;-yc#>RM}+UMCgwjU>UntP7!++4X9}H1mr9uHq$Q%DOcXM)x~i%wv1j!i zs}}l|mb@LQzp=`(Y6x73?1#62_bg#tLm(;FIux3K0xkf)!2A(cKxtS7)UekJ5Rbmu zd#>LF@I9gxtIb|hW`|S|>hCsO27d8GNK>f8ufz7@Rh4?AMrW0(S z+gqDX9Y+dJe0e1`cyD*cFzSeq(S1OHSZ^oI(r`AKLkvEwQxX*X0Dj|(^ffN0nhY=` zYwnj}jQleipNi~vpTee8kFO*s%&R4dIq&Dc`uAaNuq|ELen|o3GGHaGJG(?sTL!$T z?_j$skDL#IEahb(*yvlrz1tOqqC4oho?ZINkR!VW(T-B3!^wC^58DJcc#MIvPaAs>feN-2&ec*p^O&}I$9&iGR$K?3!{U(X$|SP z*zA24DpB)GLDae=Q8{l&7#CpE+dE>L86}M&f27pcEzxm}4xB(MmP}xKwXvDmX@SE< z4eLh47H7)k0ZEJIU446-B`n+Cs=dmSxy!nK20WRuudT>6?lV1$^jJUUE=?ogQ6gjT)T+xtCwcyNd-X5n zNueuakOUfv?&?ps8kcc4x)|_OsLDl3M}}HZhuazRV>xsQ@F&IykK8GzdYoSo*(T9Z z(YR7fa>gvxn(yev`_~se7VG5i7jTi!IvQOb2Q-t_Qlma?;yP`TBhbl3)n`0%(hPe- zAaqYASP9z7ol>5*NHG!Fa*x>d^nqgeiW zX*BF}*RVFaB(-4G9v6V& ztOLEq*eIg6zau=0jV-ZP&#^>_7s?~+<1#UevFy5C(kWc$;VZY!?iQP660oSy5`L&< zj}TLdAHlwQP%BLyW&+VKG@6RtH#3kQ=ijIM@>;a}$Z(|tHmP#7gy2WrrkDEeAEcCog%Q;7@bJ0BMpX*ZbRy4DHu6+31YO5x0&@&abZb<$f9R%gv4H^L07ap?SbZObcoo#u} zv3y+zHsB;Th(_w;@YBQE1;Tzps3nuSfgs7B=;$Mz<4-M=cMQ`AFctSLKSa8IN~<3A>ofzT%KjkO!9u=j zN@4rOt-5n=WnwfZuRT4I5X%@ZY8ei~wax}OtU~zl`mLGtmxU%JN>X_{RMttC-QjY8 zv=%iMH*>((H+%#J8q3iWo9cXdGdKn;0Zr0qL zDSD@<%f9aA^SM{nLE0l$g_is`;>YI-Ry_|0d?V$!io1Tj${w!mZ{{;WFQw8ySXedO zQQ6Rs?x_9F{gafu6~ws6cmxHke?`;Ql*lFrLtgD12+nNUkGV(I+30k+X3T4poOrD+ zk)1-vrB27ySiVTm5Uzc!G2OBAVVI`R-o9i2iB10^07(JovIxA zB54t>gC@Vau3QavB)V)B}>`MI%639mQn$Q9Y^1e`6w|PSW;<#{o%O z`0f;Ai7)0i?5m&kM(*cia-FB4d}prX`@_`!0J5hn<$+nqVFQrhR2nrQI}eb83Axjfx2~kzdPT zuC-jMnS8cYz{APM*buOm$tyuI(djEW@%O=wq6uvb9hXOwGVgPtb()b>`xy*^3isy< z!$sSD*n{;Z6N-Zzy+?g*G))kynPiDl;S3kx6J(MwWvuU?-%t0?7=e~+mh#pyv*1(T z$ZKP;aDVHa7}C4$Dp4#-0bAVPpiJ@Y+dIA&!g;4x4$6ChLGpD?Z3>g5EoWfYy^>xP zF3%0|I_v$&^9MkOO1b}TRQ5MQt`BU*+=BF5YAXR?S08j0;!$)hV7ESl`EM%Lm6=-EvysN^Op zzF=ps%a}vH>~MU0&1S_R8tp&ORhoAvVinC3XO->-19Cw3q64+=gNLBV$fNcVt>=ct zGecWZqZQQTUeRyDUbMwEIcX6G@%=8W`5*59?`l_og4$})!S0s9ze4>u)0$G9)bSsh8pSUhX4J0pd}TBO zBAsuCEJzxPj<;T_rOoKaD@qLnfp{&^DGz_N%OZX1ke^$!OwC7XE=B##Eb~YGCXy~7 ze8sN#@1!D)lm+Ly1QE8+&ug9Lsgt}^mvT;75X7znD~KT&m+RQ}?UDzLRTy*Hx%`m* zulTjRGWJsHhb=-|nSfg7?h89AE)!~v`51OpoUa{3bpGtZx8kQB>^?BV0!5yOFy0lg zT?QSLqX#a3dmaE`^V?`)4jk`p>bogZn{bpZ@{=QDyzS>5s`Bn>OJ1@;M9@*NGVzj* zi76l+IKyYUDwehx$rLezQ8%|!&otj&i4%5NeXJGG+NGV59#RDihem+Fy9_VEl8o1} z)(+YhS0+s-80b~S>%w9yEPiqt5bQ&zPS530tVnbL2d%e=Eog2p^a%wfuh<3@j19}$ zA@ri~LFl5@d$8*%GD2!a9NmJozUCczyg737C2g3!>B==6rg^;eqXsnC&_UP@tn+z{ zcxuJT2UiX>I@57=&~b?ihB!5Qr#`tV)#RycD#khU>bp6(j?WQfh4b3Z*yf!?3d>LS zY}-aEGcPQ!-qQjjTC)bsdXzCQ5LqKQI6PJj%az%j)JtbS)b7a|UU@GP*efKO0qd40 zL%o#)fvNJ&*b8%Oa_L*4M{E`<0o{}TUR*dh$=GAMVKDKSm%7pJ6#ED;KjuZ%M1>8KZ8cN~EiLo*f+GIIxFHobtpv26_8X*_Vqf zDTQn&Z^=y+46!lVK#?`t9~G4tgp}VHGBZkrF|h?8&BcxEl^-K8dJZK@C!T;!Kz6#! z)HR;*9nsF?|LrEeQLy>V(6PYPq%pJ~zR;#jf249M1fqT=a8|?Ity9-F_9f;Fs}T1a ztIHK^VIRzdXI_j%Ed51e|MQlI0@{qcErCcl7rFEZ^B%P3I2G^1TXgdaxH}K#^$^x0oChGKa zDs_$d(b6~j)@R@iwDd;m%0U}$eLZ)$pvW>v?XJi6c|-fe4{PFtyhlNaLA7$piRv`# zK!aMhEQkEj^Zj}aa4Y_b{_t2Yq#M7H~ zNc%a(b2L6?3{FZ{=P%c8kLQS}kJx$9bUjLMV)0zv0-fLGDPF^NaI2&Ja18wU1M9aw zJ6I4(u(ideA60TP>8MEieg z$zOsZmee0gW1GjkJSYE5{@qZHMd)Ie?T|;;ox+4h<>bFQ~CulB^hiF;`gfA#Hmt`#^al3sgYy>rW1j9y78rNeX;S z=V0})yu&L!5TS=}U<+H!`gR3EVwneUm{Li`=lD}-?LCip>-^&o**VNn>`sXj=H(MR zC{LJ(|7QFT*>$1QIoLg3ZG9H&@i_7D9-n%r=+;KNLRC9&5JpHU{j?xsc1rITNV!dq zSLZ=nb0N#+x%}ecB06xFMUCsYI*~)VM(Ivd2drt(2oT3B`*={Z5|y8 zVKkUAV!SQjlWq}8yDz2s9#~^xHv3CNj62p($8Wc`@j%>89u~#{s*!Hi0T-6ys|V7a zNC@evDX#w|e_~eVDm;M?goxJf1x(69ER=1|rv5!1`8-uI*CO=P+Qe0Ck9Vf7#D+g;-H3@DDn{h2UA%zd_Cd3p6yccrF7&;;C{2uJc zOS1%kybm|*vz=w47iE+(%zwrhcOO7^pIPRbuCVSz2H1-{dG0t(4NC#-6#&!KOx<7` zMkXC;3GcowtNOQK&T+Zm#Sq9B?HVQG6%Xd@)%h_)+ZgGf91uTVZ`{cNZ?I1Vi2HAX zW0@Obp$>f%E|_fz(nSl_pOi zosr4J?bGS1F3FLaq%E1nn#`n=*ULEsW~+cj8v(9!N5gMNoBPe4_7*b}J9W&ZnP&a#QxQZH3w2UqyDclkt=;YjUDw6*EbiAwq$eZA zOvV_4il(FjK`L0wwJvA+ivfIM~uG-;~E9USHL9+BlJht!45~k z3#&(m3%{?WcuvkvCpk-oNrPldxp43lUwBN_%VJ2caKzAN?ePc`K+5 z&cV+oXoo@ueC)hDv19%qJ;&VTa;`5;0qH%Q9B|f=w5z%iP|j4wl)speK$%#IS$`Jr zWF(TZtF{S1wyNd!YrME;zVoH+m?$bnG|^QlZLMj(tP_K{Vs!B<>HnA(CVAiy7mA8H^E+ zQn_R#my|Fs6q#fz)P8uuAe-GY(^T?Dk}tzngw*uKmY@)7wVGr*>)ixn*e1Aj@edRh zfP8MV5(8@Grv$9t8rg9lE!>+aBvl?V8ZRCiLd!7Bh}S=Q=y^N1?ZUWvK4M*`WTK?) zKqr3{o4xi+HJX>8>J$y~&62S0{K_o1mfZs?5?@|&z({MZ1mIelfLu(LmBem-T>r=y z-EW0~yF}138g*ZBB!SFq=)}=md0o;Q0Fk=T1q!&qsR~BwPx_WoN5a9*Lyp4{9N!;1%xR*Deg#3k5+n!87nJ+yvF1!CY z|LK+ebH(;=HN`{(j%H9is?LzO0aD$}rNn!X*6fXm!| zqPY-b>7;fD2sG`d0u2V?5UC_fzQV{JkFE3W%^K8LC`KieX2_(}1i}q_CER^+99$k+ zH!U@G@OIl-Mi@iwDtzcWamXgETZI3+2`>*SqJ)4GeG4KN#g;vkYw1aS2-hM*4srhn zgQ#1(l%(DhB5tR(0xnVoXMjMfMl>C*D0mp_UIVZb%pq3Jw*Zc8Oe(zl{p)*4Py*v! zV9kNv$)wu{{Y*{=F?cv(F;oCn&0D`i9y)68jnfnpwS|WQEU=rPbHI=FcGG|pb_)gF zmv~S~iH2~u_2Irn@k)SBje=?-EWJ_%-6Lxy{Rx5rKUe`m1_JBkqT9Df}Kh+o?wNYO+W!}zLWpJ2>Irf00}7> z0gGBFnlW@n93L^r`=#Kp(WJVivhzA)sKAEn#}g4A2Cp7lgs!3gzRK^qF@Uf`L6)>T z4sj6c+XsuDXp8W!QSJ3jyVtW%l1=CFss!w)vDgHj$9Mvc@^scTu51bKk{ZK&c*Z?4 zb*9mJ0VVh0wIp`+j_uW0l1qiWi5wJtfq0270vxH4UX#Jeg`hv^cbd=1d{)-RtbkdY z^>g$y)sV6s+}#zKu4GB>O^ZxB$OM!PpLS1HLp>ojl0!!S6! zcR$m`k6j+=Ft&@@+!9a2tj*BiP>@iTG?8&<0eb+ytyO*zz?eZLJE2?{mv~#t5o72? zD{JC_5)NyhUx?ZGn4;e<4`QkxK+tKywGa6fuSSm^kXzFx?#XqRUqdPjTco?{*F1bo z3o+9B3hv1CK=qq1GuqEL5JwMJ8cyC$Wz_qqMzeoFXJPOqCu3pGXjsxqXCsH}mP#6V zw&y)Y11Wg(x?SqI3d|`)+~uDXZah5w{91_;@%36pZ=9Lmr>=?=R;1h!Y}2q?rw}&c zqI#{Hdo%W%SAGw)lV>DN_0cvzsxwj9lDc1Gm1=C{zbOt$M z2Q_7x2nQ={s=+xZh_EZ<8X^@9v5Wf$!y&o%btWWOL!A5^&d|WhnqC69;@oBL(`4IN z#!6$#ow}Gac|^EJ#&(IrWd_((>@xvxQ_56fmA%TIfoyk+xBcG}YnAL&xB>mqbeMW{ zSBltE_ix?ry?=HNW4^PAUyEB-w=$3}WXFEj%TdxuZ^FqfeF|!>;nWO)4A{P}6=?}i z9ndAdln`sY0zoVp)||z-00)U+c4u!~oU{yq$m@kk{se5^0UVKv!37}Gbpz#)%ro{0 zPOy`*QVxEVrT_{oGzcKGLT3y$4`2Lpo&<^kuJ^L0czjW-qHdGD4T~~ox-|n{SSYi8 zuND8vYVDj*$}`;OfNBS^IU&EyShhc_yp^dkfVl>8m#+Y|N|qzf zm+utJppJBHmk#~~t`!7WwSPYo-{f}Az)o7!kdbo+-Y?jwj0}-+3<1}!(9_0_% z9C|_QVXc#W(=&sUvIRoi3d;IO4>@W5J?Z6u0~5P&MW;v;2CkW8?J!s{>oPnzs3)`e z?UiF1Y<+ea5v>J|s0|JeKl}-@3o>BE-Smv15XAh)7krN3V8g+J>4~|zlo&gxD-hHx zaFKyFQP$T}iS_Ly0$zXX!7lUDXe6vRHIQVNB;#E)V)yRR!Kzt!QtbY7GiSCzF#{CX zdnYPLO!C@kQWlNbKWHu0;&|9M;^q9kf8a-~Vc@O%c6;eGGSl96V z16PqhfNoQgdf#M=m5X;gu~5!@TYKOanu9qjB%!G;`q_n1|vN;~wqXZa>5m#MbvgV$Qoeu(CE=H2|LdZW_`9NRQ2 z&@)J0IP~|<=Z}iq<02&C8A{1>k3UjSjto)15_r+@zRpUwM2{Qtjq zu!x&jfI>ldact}n;=uAA7bUtpHqg%@1+&A0x)^fa3fufM#Wwn*)ewGM2nWLjCo#ou zY!G$}jAU{$llAfDnyy_hbi_>_xQ@8E3C9QTnv~pq^=7XN%?HP}Pqh|%Xd_4Vvs}LR z=L-&ku9-*ak1{+aQoKh?>6wn8jh!2;Lia$*?zV;3E@f1tdBFvLR4FwE)G~eDF{hxo z=F9tZTnmHkXZ!hcS3lH1;LuM}uU(V2SQrjHaOJu`^wL>;QgZVI%PB=C%sFV$*xBVV zE2UV`S``P(oCepVWbMk{k%_~CdNpJG7o0+c?!JWBV9pq>kIR@|aw3pN8e!x;WBk%A zP0fiC4w#4_&#?>+u9k4?9DYAdNEJ8|7y+r(;5aJLNRw^mjF`lRiydoy*NOf4qt2Iz{F# zaG3H^aaAYA>0*UV=SH0++(Sp5LA_|me%eu{8MEST$NaG|NlQL7azEi}v0BNA-d0tl zV+iCRn`nzL3-o%)%t4O?3uU@%H%gf3xDK87qcc5&W>GrGI#1gYPWO~YDQ=(SRqc0l zb)Ze5<&7%R>aV}LJ1yXu#Zmo1`VdIW<l zl}7yk^+2#iXi#R^O(77<@*CVV_6AnQlYg4llLZtn?Y(o;@c*d~=(y(OKL&sXjy>k* zlw|Ti!~9pijQzePQ?lwE1NJEmM#LpjceO z;WptweY)JS`voo`W+s+el1CE8;W$)b*$&;UeT;Y+PX|q}FX@!rfq}VCH+}vD1M9fp znmfP+wBS!@|6_bGLn7}}GAN;q)7Y34T&blvc8T=U0%%n7aUA($(1h?-vbh-!G@;~m zBYptAir`6b@q>oVyse8AfreO0%yIpq&^ylQ7g!g;4j+kq{&*TF0ZlSOPGQBQ2xUz_?Fh!4@dNpi)az~gWui-JKVwMZBr!d)f5WWp5qN|<#s9v7klG30iJ zQMWl4Eaf5(U`%4!!P&nTILlSVUidyx>i^m25ukR;onnEv83`RHv@ad*O8mEFd z0nLjpD+<6ukVLZ7pZWoys0#z=pHjkfDt%bn5zVKqj%(_GOaP|}XSEP`?!0NvH8{*W zr7Dw0fJN~o>+UH*@Z92OI_IJ3nkAKZ1TMNk;r-eUXw2U?MZ82hQ{N+*jFlX2(&j#K#6tsJkz9&Iu=TcqoIbe@69^3VBzf9XbmlYk(dRW`}| z6@O^qf~ylV(_p6N;r!~0v7jz5w(XsuTm|}CQ$p4T;tAXLcq^}iV#Q6gss5p( z0j<14$DH&M7mcNIC2ru4Y@E+-g`NxNF?gMCm zIRN`Pt8?=M+L9SqPYZ1s48l#Y&=)UPL$N6MLRSPxL9)nIzOx`V=B_b&W)Qtt2I=Az zSl|`X>n0RHDefs<8qfsug`4J0$%qe1RVpq5j>VI)BgBzNMmp8}PBQo;F%(r0XcMTy zw6UsB@F*ZbHjoPj0J$=v-g=AkgNthhdl4`%Ol5Bx;98a~+y(Rq)zfd)nAyrueqA~C zjFoI3|HcRbQUAG33|PNUE^t}` zN=d-Y?WW9wmr`}d_X{nND|Fu{n!e>F^Rxg!ww0B=08lgoQ2gh`q2~B;R@3h@FCzvz z<%O*wZ~AbtxKnPM+A0{2)&6^s@LnCOqqO$2@b#NTG4 zD=_;+UWWC)(6n{v>RH&7x0E?fwM1`})$V^vxC3c!PA}C$B1}5G;p+*LK4z?Hu6!Lt zGj(sPA_BC*GaJvE1$J}E&k2P$A-W5QP)(MdLWWNA^_vi=%}N1FamI6)eh0C=AxUV7 zoLNLeJgp%2AaNJ;2K1r6G4&lDzzOn&557mB-Bt{C`ZIT&^U7g0V| z4-yM>CQ=q%0GB{rT~7*&zr{bg|J4=zOOMa+ZZ{;`b71qWR~&ki zrX%6VL@7U|_jH!8jmFG=sqJXZ(!Sbq6|%QZqBxsZvy%ztTA{!>(X$!ySqRsBXl}o> zq$JwD_0h?jB0N{tya!*)Q}ftl?R(wipiE6x#GMIz50E<}s6bH)sb$rDkMeYm85xda zJCwPN)^J0}qn47_&Va-Eb@D9UWTQ2dz8KnNoj?y45Gu8E+pI5Z54(2OlHds&Tu>4= zx6(ScBl5<3vpf5csxvWuYF>@f+}hgl&{`wQJ1^Ci-70Twtb5*!RHMW)byvKF{eVSv zzTey3SSR1W@8{jWFSptIMR6!iKpm~RnYw)*4FlrR1+G$5O*m8SU(J3Z8(_V&Hp@ns z9ACdPWt-Zh=HR^{nxuCO??jGX^}rf|@k33oJK}F$k2dD~wu8fK%CXU!IcI@L=q)52 z#ESEtfefGFBXgtJM~->iI#yvH zkN@O*=;))&7#r>LWwSj(lP|bz#B=p3t?vw_Y41Mr$kpG=)hb~*6vaL8qHy5uwBY9% z_>$o)BWOC*sPf5a=Ltm_ypiN#y2nqOFc37P{80M@MIZt7NR$nP1$Z9D?3tQp!P14Q z09Xa5BIqKz2!X;(5r3^Jhbd`~z3)xPb!n$gacb|a@lpUSsC7SnXAto8oq{$TK5KVh z?>p~jcG6thN4D;DhVwOssI*&%*!_meyFcC{FT0Zb)#JT+A{L;IM{L)Hvg}(w>V<_8 zVv*9kfP$ulv3?gE!>N}^P&K8&{5lP z3Eld2f=i}5-vO{alEftU+#K#=r)RP*go$ur!A;u91HWV+!g*@4Pww?LH%BYX%8wz} z2tU0L9NhmZQ?$TgQ=elh&FTY(cSGf>gpw0a_4m?L_U(2vY5E~t8S?46RNPvQDtmgV zL}x?yyatzOR*ILJ~i%LU| zi{RbR7a~0R8I)qt6*yeW$fUB`**~Z6`57)jx_8Q6y-&UYH)ArzQI)WR%AD!%rvrya zbtWK91<67fvAC?=0v9RyKIzgKWUu)wgi;XVtT9DSbLscsOj**5BqP9gx~ZIsQz`(T z8IsZW`Hx7{S6(XYFFe=GN@+jr+*>gjCWhi)$QS>Be~Bo~Suv{cgEd5WE*6~k$)j9U z(hb3l+b4!MFDid>MGls`YixI0paRPhMUQJIQAy%Gx_>LN@E>nFAlBlbDRfKGJc|G% zs^G@f@LCEU-9qJjuCAFTxH#ulW%C_HS@9j@l7Cvfjb*%bX)OMAfvU zlW%;3(TTq{g`j;kIq%TC5^y&Q0C{z>N@dCJ`10k;6kmNHs;K@n|Hu0j;K9)6jCi-k z0v6#;SdnU$TE94fpb3Ho9=dXRqA7yCQmCqj@Sx0RmuC=gj+M`AS0TB|7izNUxjt_@ zI8`xd}SO{mx!P86t;k z(TSnGNbq%&_q0_eXiVA<$M59ukFJwVhO$WbtK!Y3_Y(d=KF*t+0Xi1f(*E8GR&r?5 zn5?{$uj4!RfhqAyb!TN_HeB$uu?T)lCpn-jFKGJnUR10hl;A*r+F1d;?gy;O=f&$g zd0B~1%iTtIM|u~%cMNlNEBg`jQ9Q4GitICvRsg4joj+K(c1d!4k z_V(slQou8V#g<pl;i15CW`EvbSi7!K);ZQs`sN>fsLaF(2GEDkaY|r+n*QnG%Q$4~P;sDBjfRd& zD(qDp5#jbGX7wVeKr~OdWiH5YddG2n0PGVvp8kXgSH~w;FCP9eFvOc#fe3fLz1s?6 z8)#Gdm}E>5(ntEm+rqW=XLTHW4#d2AsLdnOUOe_`3;4IlD8y4pa;t-E9QP~St%1I! z<3##2$i_TZSUpb`{$Xu%p<4%r{AvB$9Ks8PRfEmAE6o0=hnEH-J-kK-diNZ!}Rvb zG;IHyne=~}LHk=09@6||GjoumZ;W<-kH2cY)URy4?YR2m9OXW>GNvzTakc%f0#awH zC8w{G8noh|%m1kg8z`g0<&8DSR0jx`Br!xIE6-4f;6Cil#^hPerKUi!Sjvp#eL%Cf zl<~Ec{)hxb=~FKmRs?a;*Ghd?+1?DF)!k0Az{atTD>2+7h$tNq$GKKYOQZvC_s^di z{n8ZNuD_v<<~c|_Oh( zc23la1=(K2l6Y)GYPv4Zy(6v#SW}s9(DMcpS;?X*PP+=yK@O&nwA5ZEPEN)jkpRIJ zJ|?cygA)e*hV661Hu!8_G(O`!;{lkj>#x5ht|t9kWd8G!fMU1Yl*aM^#g-#V99s40 zsVNoo`TkIDusx8v%Ci7evkppfjCnRs~LG3;F8u}8R7t5-I^ms=#f$+LA zs92(Nvg$C5Ze}DXaRG)XXb)1BPW*I1~?3=IMLG|#dE9|b_9id@R%Ss zE*3p8PgfVXWf>Y!y-kaFk2SaPzYmlh5rTCo6$BPlI?i-} z$fpcloLvm_`vtw%ppTe!PvFyNSQ!-t0u#3o0MF<^N8uNcsPy0pi4U~g(x%W-(*mQg zpz!{h2eag3CEh9xlLBqroo;ME+V>Od%$vmh?z6hw!`A`=(4Pi44say3my?gj4dx%o zl>;o)#m^0F$Sgft5jz5($C!y_7a)C;jE~EX1qW6C`48aCibJNz;$mGO=7N=AYl~*+ zLTbCdC9)fpd?(^uK{vRL0>v}$N`=&%PH=4&3olKd6845?bl9EH!lT!Ng-i03TF&)b2+ zr@G~xj>Udy5O-ppgNeiI*HzQePHj9 zJLSGE@a@Dse&!J2U!P8^?@#>zERc`oqSt+c0P6d?Ztt(fyVmKiKD40ZY(Vc0Gv%~* z9Nq49>p;%qBne*Jr-ca(ojtG$eCQiKavbyEdjt1`Xe(ljJP?meFYvJb&=MP}6QDJ) z)AvAr$&^lRs6GPQy$=s|M`!OdMnf#6@bm$t0OB4D70s12p?lNdzB+OE{EvSCvsG`X znaBuPs|c%-JJ$L|J&2l5?=K8%Qeb0vdK#Z|zaMDK)?b``(g(5rUq1)`pVb>=80?n| z$K)%oyDq;-v~_H&saq2)dw6|bfx^AnrQ)?GI|5Dct*bYH^$1IugaJ3ftENNZd~lRmkG0_BHMVB3Q6K8oxGN83G`GW z?R{~8jsWv9*>uEfAm;$kF*ZSn2uUX5XSmPfxFKkVUrE4h)p1~CNFY>htoV|Z{k)=i zTHmRPwdPyJ9<6b$!HAO2cPK5bUj}UtECiOq-b1{iLgOY#AVK#KffaZB27M?8+*>UG0<$z)o1Zc zdwbdSc{A${*Ru6FQ{%ShM&bnhxx2?nwlCEM-oT^OpcC2I3|sWMA>a9^#Ud$uR68G(|~fPygB!rr@IZa2PCPsd}E9Me|QzJ+st` z6N@*qCff{nnZ4ay6zISqi6LJcm@vK|3v3$4;Ex~+6S)>#3PfdoB>{5tHJol@fOSPNNGV5S`-fP(&*#Nb zx8Xw43hM?i62yA<_E)|8$gMk3eAK= zx}?(c1Nw!5cws6*0`HOTvfa15pgV7r z^1{)&K+-HK??mWMzn;_s6zF|s(C8h1)Oat)dny=u!QA*JK;gEyWeR@R^k;V-FtjN5 z@iU#6y5*%Z?$DSj>@pkP2K3?nhaQhxha7wgL2AU9^y4hAv#Vf|DoGsu_+UE!8-D=T z9Ar54?Gi3}OEab|N2|!`{wKj*Y;Dg?$0f<1_N=^7I!@hRy_)+$;5TT;93&>wTViGI zfIn59OXri$r_FR=>^%Py9hQlp!!nQn>63}5RcdjPT{zQfRucjau%Y3M( zHtbYn-H{}Uh5}bmQdR8gK0)fsMXR2HC(g-%cVWYnHC;0vCup~8kKW0?)Pr`{U>
(?~{U{SnwGOpDG5?!>)a&kR1h#C7&UyX$zxH&2^ zk{gnz;sHq&GHHk>_=N<_MeJ$jMES-rl#u#s<3Tkfw9bw%5gjdo8kJVM`Qxw%uar`{ z%1oaB)hPc9_AIXPclJE^ma?(Ge3{y`x_6#hP4cQaYhB=Pm{{QPCx{k!{0%e+w!dWvfRm6;otR?+MAuxbv@PsklxE9!0N$7Dp7w>pAC;lPm+K94pYRAz$NC%XRaRSyk{8x@7c>5!^aXAv{|lm3 zZyaq)BzlFO+wM2&^L6wHTfW{~B}HsBx#jyhWs@brywe>#y}7T!E3WDKi)1}r7x%g3u{S79Lb!*?N)a(OZ zuU`);sBb#vxluLFCVr7PcQHWtQ05oDIDv}FvL8jIBPgB!!XIFlLLMmlEC?pF$j0hv zZhpm`QG?KB?>6V8R9ou7m8i|Zl?m7X-$nr0+gyBZ0jpK^%3-zYA8GRqh+fYWoh~VT z7!=r_!J;jjpDfMcvMC0U**ViBB2_TebmFk#Y6O=jao+q!G?;28ma!{47`K}uHo0Od zSUXEy-!MK%PA|&urV5hD{gnhbwy8(Bu5em0-edEAb=A79Qj=m$S3B>i9(w1axb;lk zVBlZ|iEL67|No~nqGQV{O>HV#dx-HVXG9J0oeWdsN zzHSQWeI&|1Pz5$Vzas&?zqiBnP)F7r`6Bd3L61xe3q0%Nqx@}+Udzn$)q02d{A!}} zxM&sM2DT9-1(QpEEggx?&(IWvx-i|hXL9#PyRj$yuraf7lnKnWm$#7$ z98^K(_sw7f@jdSFf*>pkqioAPFUVs1JTzXQtqUZ}BKJuI_?i9$nFr)%{=O*)Y@(cP z0BA{Q`g_=qrxNt}kB-`cjoDe-4%0wruuK;gY-)AJ>dJ6z6X4vPQE(FmZxw8RNb zQywRPvN#9=jxRKuj9j9rzLB|inpTT<9 zNdpbB6dB3;rG7$L4$g4B))>XDpO`ffg{zIE z1}hw7K(sC@B=9Vl7W3ONLk^_(OJ`85h%Wz!J6TwwPt?;c!Nh&du-zSSIfrpuO>5G9 z?Osyc_sEN>zfoV%WcUfG*Tn2htvl%dKgFK^TN(5Jfvv+(*OTF8Y<{*zwkkkfnhV+Y@yHHtxztreG-h6LfBA`K{hswGL5CjhH>U!BBy3;K2ZcdK3 zRmJ?1sYu3(x1thE+GU+^ZAFHs8mhOCx0L`EVE}+$l^1gN&$8*5 zBbCfQ#d0;R5!|B{5A1`QteC3B(jqw4bGe?@1xJq3AMWk$WZUW(hc3ILNBvOY{rxu+ z>_ralUIB&yS6qzXigVnXzGwqXT;*zxH!QJPBL!Jb(bb07k=h zSLg(-(?_^%yYP+(Pf7s6c5MV9&$pGr@v0NA+HQNI@F-jLn$m8o{=wI9N+Hj3{2S(t zfyf2@aNz^DbGb8i2Ek0g-CL}^ADI=leU6g)QOlx(GluB7wSDWA7uu8hq2mzmlVv{B zqlT4UeU;2zCZ9HtC9N07A2r)85J*LWd5|13yAENMJ9k8K6L^?(BX=)|p7cH$&v5?Q z6n@rk{|0~`P=9lGL3?!PX?*lJa`p>VW`A3+?_Ap01Pj9Ilxu9ROQIgmHF_vrbvi^1mrE`}eL;LIaq8NSeihZct zQU*0uzkDiAI#089I|s_^s_T1od#?mgm#;K@6F$7V+rT6=-j?$=z${{Gny1zRsY$vI z@UumivN9bUzM)YR+z)K5Lc=eD{6mB+{BO91FCB|+dxT6xy zAJ+zJL|T-T^F0G`rK#|=BG|xOGF764AXcT~6-n6ps4kiFLG9eXZ;G(W zVRiih7+xQ%XtJH(JFE(8q)vIL{@y#N#U)(PPa?GQIbzzoYic*-%w-G|n&}xBu!L$& z8T_LFgF&u$v5;e)aoq{WXMeNu4nHlGbpzF2X-2-nqu=URG z!KH+@i}nKgRSqT98?F4-C|MCDBK1sSe&<$f+_25F9^G<>(Qnb9v+Xg#$DHl>p{0aF!8yll{28N1W4`iy3I#JG+y}yN= zil!}dV*Z6q{i1yT)EvS0az{)+vD_wjlF)Fsr>w@5?q*m%QqC0ZG}95%uFeJzO}&Sg zmuKcXFX_CL$Sl;lypV_W!+V4R0ov{%1o>A=~_N?RfeH_B4`Cz_Cby}m6kPY!YtfHKCm!M~W|pp*rC$8j5~C{Ql0_{n6KJPlppcc1N_Gij5 zdOrS|-%zD$qIUW+Nf-6biofqXR@d<7?sijY3>mj2>3q+6prXErNRBo`L_hJT?#vv^NOW-!}xp>lkBsv`f{SdKIt$`P-iCVz=s^ zVDDnLLqJC@O~MEEm)3LfEelvR3vYDk6T#aRj9aERE=A<8WDCZ!l}(ol-(GKx7sF;< zPNb1d91cZSol zYAq5_^XQxe+08dy6XRW*6>mn#Aw8W;H zI*)(?{ZTsO0ZJ`jQa;n>^BeQe7b~EKiNtx%IXP4imAm;XUzlY;>x*~Mu|;Wm0|vSo z%76F^Nawn(cdJij>F-;vv|_2Y!X4m{o(@%vLK(B-cShMb6{id5wj>OOH@#VHM>oYZ z*81?jHLoZ~oZR#p>MJx21t2BkR$6&YP0d2p%PrnJmK%w_mHgrEiRz=IS(1l#S$c;a zare(NXDH=pJKuJK!R}0*INFQLJgXr{qAbWKA=n!2Jn!C=E>EN|uU0)eO~v!TuB~## z&(~cg&AnEYT4m*dKEhatZ1#i?3l(!M6cg^ZrGN#2LqRDRh9?rv}Z&||~Bh|fOf z6!4q<;9dtK8fHv5ho!@mv=o**V%vSk2L80q$-#cIT#jb5NwAd4BDimC2X9s>(X!|3 z`%qJOs^uya`4{2ig>bP-M{&f=)+oT8ZXY7Vi?3g}XCn3b>U1e8*_60l02f{ek1%X6 zxz=7@zT55*!YGMh4q-lXuK3!*Bc6l1e#MIxc+<+N>U8U?a*srp597%uCg_PMJQv8h z>CpMup7g$ee(y~`U*N|*Mt8IUo4V@erw6F)qtv)G=&e3(%bOSj)Yh#d1$9liid;s= zt=>u=dw@zPbN;6KT*TP;r2t$eaexIAr*vWaTOzdpRZ7P#21cm6dg7*A@}1-}Botk` z=i3BUKZ>L%+=bYH>Xovq9r7&6u>!HFawx<-3DxB;t4ODD2ElmLiKlzGorHb73DbnlH&y4*CS)3`eZG5t9)L+`r4RmzSLIUn zHDLGnMqm;CdoxCZCe<+iq(O=GFFx(-XdD+J7Wq3795)jG?3{J*@(t7q&@7y^E!$wfCK8N;yy#)_ zWAZ~G>})m5?F{pCsF)rv+mp_T=j|@n>URZ|B8DRKjKeUoW6p+Xj*SSmkLGqkIP15< zjvmLx>doXAeKSG1+)Uw-D3X9fNby5aYPn#zJvXX*)tkYZpFBXSr`qaCSkCSW=a{`lF8+iK~-icE!LVW8@U? zsDo&AE%F!f!F2WcBocJNY@Jf;O4$*OnI(00Veswih{hhQA^bf7+IKjUHP>SJsz^jI zGi`ME9pc-;4{Qy(hHc5 zf)?E9+w=ofCD2W2d;y)h6B~}=z@UVjg2_E#GH0ajc~PJIJpmw5&)2O)XB2QYY~+OI z=5WyDB97jvc`A?b3+KJ(h{kIODE>8-|9BWMkbJq4t_OWdCtkXWYLUA%7joG=-e{ao z+Q_UV&q#FjKjnUS%68fxGObz6#UR!|HjcivK;V!kC#8c8IIMhY&`ALtjueN8Re@M% zjs?it0*7v$EO?7BFUk0)(ABiS-!BA?1Bp;>o8$*5d)%GtHPsmsWD-OQsflxnsPnR8`{+TgkK@_Ot5V}+&6Ce@KT!;h1qY~*` z_lMz~vrcKILMGsdVJ|)m?^gmqqCml!G`KG*ASIF0MYvYFN3Y^m91n(oQGv^=I4sdM zB|T5*4XDiwG4}>63muUscZCqONxt3H7YTYNe{o?~1oSS5P0?5$dJqiGUp|g38I?C4 z3WDCP(Z8!9K=kex0zkqr!7_uWKotKbdw^4MDUMNnX^{TuxxDn;Cw0`UdiuIN^tOFg(SQniyPIU!d>203${rkBg<&-xzc1bfnrl}h zLHaBRhDXJG!Qll6VwE-JtpE(;hsgM&Rrn;Nc&`7d67L44L*h$2Z`BJ;THF1^Lng$? zal0smG#hmtve$B`=l1TwFKvlVOvBtg!odrA&ID5S3}xmZ04WP7@JG7%jdAXi;|mL zVEQZc9jcq0W(1T%h`zoFg-@LYJnl2ll|mTdH(ew=Fj7y?AcL_O-Q!)p2$auNiOIf) z!IIK7+&zbM-FL>4fSqBykcy?rqig#uiTgmGMP^~S6^wB78YgU z>V0X*)&4mABv(P16F7T+(tbn&KG|IE$Au7yV3=<{SY<;*P;j|RB@7ti6npN9EQbGT zoG{X_mDiL=i~@+VM^0E<^wGAql}ijz#0LyyS?+%53k`ovOU>WcUjef=NRBsuA851N z@lIU%xr|9Mi#i_8=0ntKVqOyM8E8 z>fbiU>(v8QcE0B>4K}8L$B=6e8XI6mnfyWtL>VO(Y^4K=H>M?-{+bkqk*e^Oychma z-a|BZ37M`-QRR)?@Q+_ffNQm~`l2ED0M*#xe%`J3$<&XY!)o~fmurFR@>TozN?g`X z<;yLg?3WXg!X1#l#6zug5hpFpl>YJpkeurHs-=$f^#ym18F44qi_3e?AbmOc^R*zu zJgl;$p(S! zU;Pdg^6DCX(4pxA2eNKh)vhLCyjD{4r{gSMnhY%JpW4*z17?!(yA;?EW`gaj@eyWb zj&Vob24-}Obe4}HPNY%H~NYB3SQ`?Z9)6S48VZ-3B8rWxW!9RXS0*IV_IV(m; zAw#mkRYOcpelW3FI2F@=kA!S(WIxq=1XqbW-N)_YG-#ac&SW^4IxWlf#(wao29ZJ) zFM;m|(&6@KaGrgyJh@GcTlHGwt_t)k?z;T};j-r58aY&D0 zaDPZz({bveqN}X}JHs_(4L{|%?8G$f*Ee>9f;mM72$~+7aA1P@=lzTw!i+5Av2V+V zaN!?v#eNrCU_=Ux9Bi&g1VLh*^x!HCDGE2NH311RekB1!3J4>aj}_43ws=p^m|H}a zB*0QGsfWwN@oH6(bnY=}bV5X0{=OI{fB5JRNw6Bi`{c$LSf&5!cc4&>P0||C_lV4r zon&!m6fC@U6j{XUcU}*Fm&l0aV;L$d@#`}P6}USAllVY9qM#Jm?z<>I!VB{K+F_i{ z3S5^qNAe(6#=Ayl6$(tV{+N7-gd3*?8wW0ebbo0M1hhPYzrnuqLWL`?y4M6-|MptE z;jpu$++(TU7gS4w12D9oSuq-PLvoXbHuQyg%w&uB6#E&@p0l8F+LIqB5SMbnJ*-3m z6z=TZAB~7?f~btG!MFd~90=r8H>_Sn(}xdL-IHzQ`B2Dp&MC{|%le&L!{d=vl(*e+ z3TQ39PIiJ?i{eNa(aQn@fARa#0@_*bJFL`30o-eP4DkH%#r~a+#QHd^w1}q`!n>%E!tzKMLo;CwLe3XWcy13G)4YQr zAi!GliUfR^>`=SsqKEdY@#(|IteXN9at0l{8^0~qZ{t=pp1>YN>I-8G8}wCqihOLJH+THX~s zCPn5!*wLl9_``r)f#D!s9=NR^1p%f*5>2;EzrDQl#G4ib^W2jU1U{2=E9@0?@mIeC zh3e_-DRG#Jk=BN)7v9Pvm0!Ir20xSa2=BMCOVwVYtKWYGq0y6zvQ!Dci_;f+H6{4` ztKWfwmQ~Lx4O+w2&-Fz1-i}Xe?x8MKkKjsrM&W3L9R6k`Sn#coYj>T%D|bjmYDj^? z{MGM3!7@j)J)qB^`ozeY{LN1fe>7?e&#G-sQXFVnj&f%Va(41RK&9c5jAFq=k*EY) z#DO!ic-O##*a?=ZQ5+*Uqd1Q5vM|paqS4RNU=p4&%6O81Ok@-o(kx(}e`yX;&@A_~ zj3=_hjjPgH)$IFXc0cLAi`!imzL|E=TQTlUjOeohV~`D^&%wm($U5*>zXOGAE+&Vc zE6j3;d4Im{(*(5sK)m*Ead-Lch)CGvaTGw@O&=I{gB%z!zbBR60NV9ezXOHRJ`B^( zAfk9~8KnMx!QJJ4 zcJ&WYi~e%D^eXoiU%7&Dtn*thW`jQ-FQ6s*NdH*|d^MNCI|i`hzxo|~_2c`~V#G)k zHFgI@nGlhz(Rm4t(chEi^`mXNaAksxPmKj<4HXd!y8VwB_>=sD0CQNseZ7e0?Iq5` z8(3e9M^<8XYkrFQ%GZZ``maVOA$gMlA@83Q>^~S6c(1Mj2R1gN07usExk=Ib+RRl? zvro=bZ7o?8juObDHf610mM-4bK{g~ZUl>X+AtusH+)EFW9O)En9Riblsjz_?+40cp z8pR{aeaIDiS+IK!shGYrP{?1K1A(5Wj`lFgR>7_(7QKvLzOmnsuk04wZuEIN;3_xg zBC~ngD8MGA;Wk82;>1xOlH;M5zxW*td5L=pBY^BnCsYls4r*~s25k?Cdxm@(HDY%< z9p8D8fFU!c6L($%-a79E+amEBZf3(Zhy?1c8Gen2#m~T>g;N$nFAmvfTZ{;ANzyW% zh-gR{YY@Btr8#sO>^x;6(}2x1_*GYtm7=uui}Im3pVol?&;4_`55F9?&z6WWEXN#y zPv2sLO_1HYZEg%C!;%^DMS&#tIPm~v3b2jMaWRizM55*Q(8zL3rge-0F)2c}9sLro`$LNLI&4A;%RzJU9{*g<^nrzK9W(pcLVI$;wHq@23VIxQHVrOJzKd65+$pX zR;UN|PaS7`gKVE~<`Z{u0Q>d6xVErchCBooH<*1Lhrv4vi1&VN4g@qc*svv#REKv@ z*XJ%4t6VMm!E=Y)kRCD~rJXofC(F3d!6T4nb-Y0IoYYr(&|F%nfypo6rvB>pqXnY9 zS`I>>ysNA~GT$c1SJPVMmg^6=ETQIK9E|Ee_1;AhY&wHU&0LgvP7jk!FBPv-30(ix z??6H8c3YDulDQi>Q|_=N*1x_Gl5exl9I{{IV_BZ}|C-y}x7DrWlMmbBd8(xkJPDUd ziZ|FbtXZE77OpiHu6 zdw?2HAZAAF7WW-vG#Wnt>UW@UHget%cIV5F64y(3N}B`Y&$bR8E^otnEW{x`$1bdx zIk$7a=_?#5{Fio1f7@)quA?VH-I$2tjb)7b`z3cXSdSeZ7h?v_cj*e6_#&YIHkF`k zs{mv~4d`#Z1G^>0mg8fE&%gQ|C|seZ_WyavA#8=QwL~=pE=C;uG-HCt_J=#1Kh5~8 zGlT;FoixDTvIwxxt#o-q5!5%P+xmvZ?OWo=Bks}4|FL54-k8+UhVgOlE*Q^0$S3@b z(0`bYfTPoelFpG4wucWNe);^F7K)lZYH4XHGgIr*vC*NJCUZ(CdEATRhi09bXJ zGIE*5NL+~JLZ__3r!$)$;BW@FenZ&qxlHx?>bQ3ls1I`&nK2UCES`QO3|q^;`W?F= zFlGMcFy%ND>500WTYjUZDv`~&`)8xx5sS^{IGREO$ z?il3x^g?(VN72<#ftrR{>MN98YQ2muE-h{Ol6rdzo{fuyqhFSCt_BbpGYOMi6kRy4 z*apTb6O)=)qsccg5U`@^y#MN;aKf|$hv32U(*?9w`LMN+XoW~9 zgAP{v2VvoUXF(M;?#a(JgDPHpX`F-{SezY?`Tp5NaC`0O=l*lVHZy5)As z2-337n7+&SN*5(f7kdd_FbX+_hf0TLr+v!Lr3 z>4W%5=L%qCJjon8aLk&w;OxQFx^_k5Iz)|sKGFRTbLGe({rVSxNZ_Bna*|5azU}(A z=J6nEs&K`xb$i>Z$G;l@wyti1_FbePdShw$GL%V;fx@WGQJ0kc-4H-+wq7+~USHBu z&&onkdy8p|?%C#3>Uwumna^LG!~f|sqi-8PRv$T4NHtw;GcaH8(1OSe>%7}qv4i-D zUOM!Rca1cyFgsUxeAgaBl=4@08Z$d}4wQDQRhp7hb*!g8Eia#q2#i6GJ}^`Vqvm3CU5D(0hM@YV!^qulAB6Wig@bo|)o}>`yLoc*foK5sncE~U zBrzuy^9`Sg?2eoT{6|3H{^#1j*gz`R2Vwh-O|Kk8Mpra4V2)CseC3M)Cd#`|mX{DW zm+`8+jRACy%{f3z8r-A;%cWe$U{Jc-!%0t-KslqNL*{j0Lo=qtlb(t=%3n#q9A#LU zCe8!M)t{e5$Nv49tMJTW1w6oH0uakx9rYOz&H}^XN@oYaV6{S_EQZ9b1}6U|-E{sN z>2IVl@CsAu1z_>2``V1yrAo?W7BoC*p#X14F+wT485a~kVTXi4FDyWTR^FF$1~=}} zG9B)PesSIFu&eDtt&7I*Y$9(s!6V`ro{ID?G94VhSGef}S9$WO)P6!xq?$sjnE0xg%zqNAg;wR-}1u)6LL zKv%thMZdqPZL1YCE9N*~`tl{6Mwzp-bL@28d`xh&5zJKI@(!NjBx+@ellYr3~lVT!45GqHNxLZxpd92juBwXX1g= zW@yL)z@gM~v>0M^9D)S^Enhh5z54+W=EsRdgHi2(LthEaN1mN_0OuXDb2O(fE31#g zbIVTgkV3%CuE6klkci2#Pqp5v8zTo8-kSC*##@J>W+TjI;s?&ZJf6(Z8>oCWB!-ym=4Rat=l?AokH@SeXx-VBBdPkYjLbal~7 zGb&RTXKZS(ubxK;`N9u^(%Gr~rl+UJ_DfSz{5J*I3 zQHOslmju9b``_azfiC+h5PslI@jL?lAPdj1XdhN6jOHNti;OT#njOSm;*8ka3yGthDM2 znP0y7aMk{#N==D)=CYS8ON=e5{+hnqy<&ORh0{ zDZ00npCs=a=aLuxlR1@Yl%RCduexd_^6ID{r;w!00ldg0{wH8Up#R|GBF>t33UK@367jyvZcq@bz zlklwWX&Yr_n=T`xdU$(<4MA}S@LOHc$QMbIi6u_csCF+w9&jskm}(KR<2~4#sCT@N zt_(qm2c1Sy8740cyv0Aph`Ug;Xkp>G63rozxG(u(Z21M*QcPiTPpJHmnPFec8 zGC_T)LFKtpmlRmk#+PM|`U&}^-@+Cb3^ai|3Vz+nPBiC4Xt4cCbv;ZMqeD#~KEQ?z zhUKe+_@>sRey-c$m2fL(?7a0{KSm&++ytXz9_zbfC|BK{b%!A94*riBrdAZKmlRf4 z&>3(9`XcGhtwlw*-!)1+#RXKTRPEgHQn=lcZ^OD7u$KB8DwKsN=ni2_V_4P#i9$rO z!g853J-8HsfIe^e^d#P+>*XvUMty=Jo6yy-cswW5M}@N_NiRQ^8P18>)aKa{J5!Aq zNcAWStt5iO*8YJ&rd<{X&b%4T6gF2|E-Mvcc1a*vuaFS&>dC4pD`9vhB|0IDWu;uR0J`T=>mBNJNC>P$_e|F6mfUTSrsxjp5YMaaN+* ziUNKjcZg~8T{Q!;XWv)_>}2Y7i^Js`j%CQ=EM+iE0X!C$6D^(DvASVSOgY>%w ziEv0u*=5s4DFYfK@Ck?gV1A{#k-SzDx!83E_~q|N01|>NYI@r2 z?!SbR#^f!fuk;y5e1HpV$Btf198!?MC>S-|X0N6n-J9kK&_OOCnGO_Nq|k@LtEK{v zsD##%2|hsK>cuBEi}(6E{ts{O9nbZ@{*PxQvZcby7TH8rc4m?tWzVK9St091w#>3e zB{O?QLPS}GmraE1WY6#QtaHxqeLLrTKIc8o=l74->-qG0KAw+xUDxBf?$`ZMW?+5x zQkth-h_%NG>j3}ZW>4J|bE8i}>qx&%yGfu8h3I454t5tKm4&a`P;3Ovu=C z)*0xiLw340K5MJTf~1%t7Qt>Ve_(?(Mz$2e|M zT=(8*G24<|PGve#@1as@ato&@os2rX25}aCUD7aM0s}6f)qdCtG5K5q2ah9j?vJ(@ z+I-8V>B1;%&leXL8TPOv!TX|{l z`6*y`B3T;rQlU4uKJ(o92=`l~#&gwS7#_@>Dx3b<%Ts+9=jxe-v+A0SRqgkkN!#O0 zBu?~Q7$5&GVAXK5asL6>zuVdNADp4>1uj85t4Jw7RW^+FFSY!0uj-TxqdiZTQTG6f zU)7vQm`6tYr;dke{M6}C&7YPFjB*qD!8~(p3*g2FbeGwB%kLH<-l2u9$(~=p&-y)A z>NTGyM`wI78f3(0X_1*?@5G+Z2s6d^mgofH)(uRQyL}Xv5T$%0K_Ie}hG|u}0PmL0 zNTVi-4Bf=EhCbp>JKAmk)(k#LL;`z`gEIpLuTr$R27Q;Rke?RLTQ^DXW zJGgoNSTxmnkLL(>Lx*HG(FL%dZbkN84)A=qNYLskJl`4;;1`Ze_FCDPcqE=~%uRbQ z37S7ioKahgG~aw8Z4y3d?a01Sya zHr|2Bx0LTac!YGPbn?oQH|;6v+iOX!>_N zO=DxI@;_G{Dqq9j<4RV<6lS#Uq_!BwQ@q}A`kC2*MaI&2nz5X^sLngD#Z0&5L8$G` z)DmMMbGbgfhp9gxp{B&{dLc~_(>&Lk&TSUqt$~U>y)aDV`pNN`J#y6EfiR_Bu;l-jmk!=vj4jjSeN5+VNTZsOQ=kY=W?4K>Q$`ecDkY zoLx9~RWeWQ9R$SY!k^MCun6N)ZAo_`W7@~xMhu^?$o^}?_o2@uczVgkg9!-!%IbrBPVFdnPCX$yl%e3_mU zyG_jon)>FjN8K8=I8_nTnYZZQDvbZ;g53Ydf)tHtU`XOPQE&lj^MCrn&DvUB;d{Zv z9>&*E9aeMdQfEEheQrhk6y@c*bMy1hq?Zkt7=(vGXph_4W#@sOLBF*Yodkz4o{NW^ zRRE%IZZuN6z-Q$UijxcUOgcZRqY{K~!^o({bwtku4A{Rjg3qN5blY*z6XQ5<-{!Jd z-2`pd*A=67^Y2==qhO6#dm4l7AEcX(;`2IThq6t&!2sss%PP z#z+`-;QLmOsoGU%A%WSq1I2C+gj}qZuf&&}BX2@muZh`|$XhELD3zRJ_;Z(lvKe~H zD+*5|Dw@iNuHO4f-@P_sP_VLAo@}J?&jm>yV+p=nWk25Ht|0cO`jY(vT5t(Ohkb>J zj&1_zUNhk&2hhO~F0aYecCrd0e(FQfLFLoOQ-fV()_=%L>Zn6Z$H?Qv*8VVpj&VJj zO~re(BDCo2D=8SEoof^vh#Glc#1{Qmt{=!G4Bsh9A}DHb#8Eam<&#fZ%Q#=SkCm5s&E?!Z68oOm9H#1h&3CIm{zA{W376WtyGX!(uXTkAMP?74 z(*I&61?Y0Sd6ug)u!?#+KB!KAbn@m5D=Z+KF%a?aeq#R(Fus-EuA#*El}LKze=VH> z%y#RrtZ4~zxjXt#UOrmzawyB&nxmUY;@Z)9|4O~%vF1#jNe*7Gf6Ktbhj&3q8_wge z$G}=nr&KYVM8J^!;Ie)Ourc-kivCJO2bG0usB^$CRnnV;3P2s&=d*m)Mxr=c><(eS zEdsha#`$sZU1)OmyAM|~UcbEB$K$SBzW;?fN=V#yTTe%)HI4K`q{Enman4*MN}z&W z;tBunw-^4S!2u=*Ptf^gM&Ycgnr|sXXzACSiFRJ9mS1(FHp1aMNG81`QMLJf%R?gvU21ALWKW1-aB#U@WOjn)I~|-u2?)LAm&| z9m+ z%si{G++UASp1eEXm>s*HV}*IY{FwakL5vP*J~0dc*8c*t`=cm5!0GkHMxO%&qBWsH ztW26tk!--72hPcP^&4gbKUN3%9k1F*Lm2ctwMX~4;VP!Ax9HnoVEhv_`1dyVpOM7? zO`dOipYSUqOodS+9K%SL?AUmB-g41}{_$%&TcS+R3CfFRC#-rRI*B5Ru&|;*(fs$( zmjp*+mD9M3CYH}7_S?lH7?+K`?R?wocMfk$Q*RG!Dlg{kXJu3u4~gApOhfW!)Sfn} z0N_(jE9-scU$_Q|q;>bBJAD)0JWtxgAl!@~V$Q2S08_Oc{=%{)(>u10F3fDh>qKCC zLKQ<)HOmWSg_DkQ_-NvM=!ViSXRaZ>NDAyLG=Jq9O3=Tv@d)>;9#S!>ChTE_?jc4)3A_7NSeuT3E}-Kz24>^7%)xfN@4=DMu^)g==u1 zW%WH>mk1*6Gmya|Cukx^>9mEH-ye9en`-%~GbLtYy6;tR@Ej-gZkd?CLY51<1@rLX z|54J!SvMkVhi0_l+#+c`^L(MVyne+}2i@Z~EKjQsdeUd)!lmUoch?CJL0Ac$|AoS` zmKauqaCWpH0<)>arM5$)Gx6M1DB{85m3=papk@6oh)sf>A9OkLyD;pdF==WYzkM7E z=u3`GLTF8*b#SnZ%>6j6c47R{+Yy(34Cb3zc`A~`M5TH}5J?{JznnEVNq-Xu+W`vt zFTvb?lN6NX{}g|g5(TQzs)fNg8hyC+`h4NgDcx(5wQA@GQ!>;}6G@0Y9Q42BHT}V~ zDCnQNi>xW9i@%5#;dMEJfRk&O)0J@L%kxiGSAmW1!-u77V+*R!!bkG$w zjE!1VIX0oWUBkM_Cs?<~K|P zOI;P%g1qLSl2IP84T5!arg&$#aN7{K|0!PH`-n@6B`htp7=|MzAS8qiF&mXx*fY6d zURX_hF;{|ri1y&Ou|r0f-1W*z41(qjKkqB>+af?8vs~_eKpjLpOpgr#lnmvetP zG4bazgah3|N|$X2&T4EHycd|Bcgniq_+l7hx8zW?{KI!-h<7@#;vqIO80JAo!kGGC zLx0T<>!>`Eri4W!mxtJKqu9rTz*)qEeI9)c+_#WCLP-YX3Uau&h>F0=3J~}mX4RY{ zzt*z|Ry5S0%z54x7*iN5=L-+H0pw}6y?--2R&%CS9k0&C9oe4zA0`^JR?1>OZ^pao zt~8U$E<9EyPcwa3H>8>q*nepsV5V;}UpR|*kl_BMwC;aNKZmS`{%2870W@yPdJ7x( zF*s>IVeYl0TK-t{eq26srAn^H?u$M3(Dt?9!A6kBRtm03#Q1B7zff}08APk2i6JZ$Sh3P=x0tKb6_@D+A+C3Qhqkc?Bi$6pQcG`(K0@0MxXz?1b-ASOVfgEMda^@i`wr~@ASjR;3%;XUTc|JMn*>WlB!|OATe=9tVDA;5~WXbJjF{j^>HJ71s!z9>quN% zT&aP^<(v0-rESXq2(Z7=M*Wr!=XdcOf&A_PX0kWZ5H((XgG1U{?L3eD()2$9U9Y-v} z6IZe9F~LxJVYw6nDDKoQ-yF0ApH(KUMfg}hgV>|opme9)Oak{uK&)a=rcVY>Mkega`7;IScfIWsk9FV9Py8l#_-22@ar$<1t{Hi#o-L zuKw{BfHQhJ)*5!j5MNI<1Tsz^qVy#S9|&lQoYj(lbFxU5iFgW)I@Ml2cAxh)TmC;Td)j)jw&AUHn|yr8NWMIY$ReXSH8PEzW{V$i-zU%5N*4E=$R~O z?pK)Het#s0cwni%umlu(nbSZ4WTX%6Z#r)T5d)1x{}SK=;Z?TyOMUhbEpJmeZFJ^j zviGxmV=mpiNB}$NAUHTUVzvn3TRb}XlJw?jUPE>u7xi7JKRjsoJLEO^5it)B&nlb* z{ivF_m4xr_TmU70J6&dg4HJMP8o>KoDi+0t&HIQr_%5W;7Jn;0YdLRszzu?kX>RcP z?)%n~F&1YfRg#FJD8x-Zu=u)D5UO3}F_2GJ#8mTo(`_0eehAec*dZ00Qk}F#4L^AR zo6rjzqccw+_&EG}p)VX0IWoE{;eh0Wn=3QR_5>==7&{?vJ>C?h=66g*Vn+gWz;`-Ga(5$En2}vxe`%>BWF=eLNY< z{1YM$T+Pg5+Z`@|64kE*y2X$-kG|_Sn<-Fu#58B3&#K1(S8BW=-|35QPGPNwjIGc2 zw?D+fu>{iMktt>ap|7hsN}3Bh`}4)?_3_F-<|@Q(0>g(nsK0+55ZF2qsG_{0ey;wgb~@p974Bz1WA2y3;q498i=CT~zA{g_6CyY7jBG3p#XH~i zk>@6HQAY5s+bu6{D`b@3_u?NN9;V!5`FfgN!Stl) z2xz=q-TcQWyV`S+DBj9(>}}u?HX5JX3%pE@LeTCN!;T>mdn=8rVRjGXq!8jxm?-oH z2q~z}N_$CxOKPhLs}hrG>O+bLApLw9Wu|cie1eo_D#=x41>ky!?b+ zz}IBQ`B?OUBwwG+&d?j8_A_s)c}`Hpi^~Ew^%qG6kk4i;5na>NB;ULE_WiP5d(w&c z7pVZ?+?Q0FBm$z(g*!VlpPff?#{4+W1^yP~iSRT!9b*;F?rd|d|EToHq5#49?RW32 zOf=@lpfRRLJ!-S&uBZ_l15DG4VK{=`5$Ivx4_7(0A8;j_o6A{PWO5tVUPH*ATOnIP zs%oLQ9s84JASUf#&S?Lt0j&Ek4W{RDDHzj6vwH(i&7H2l^D(E8w~kZSIyz^JJ33fT zct5i?N9A^^knW&#UHv_x`NN1w>t`ts`R0Y!!%Tgx$Pt=N6N^<@gX>$B7zzO1vaV!> zDabrpol|=-x2r#DXOV7c?+a+nUzuEa1~%*jF^mW^D@X5g!&Viy>{eE=>O*G~H60EQ zh3Sa?&s>Jj)ybj`<|}7>T4l|N zk9<4N%OnyjZ%o@R$MeQ^dM-Kng#7@DN&zxXm2>Yun;R5cKFXe{BL(6^$i9$%Sy))% z6O}~99`bNUq`#|+EYI1wf;@itd{6c5cYjPyuvvLElKPk(H?`hk}OnXtPyOK}Ih^<(dzoVEC*AR`%!klG>u=PDMX>Yc>C zfhpUt0;xQWh&=O==VhP1J&pfKXZv1sZojWDU;UFp>_7rrn%9ds56S>IFMi}Ho7w|9 z+|SGQz13H-v-E-6+vM0)^8~gx)j~hhXR{4<;7s-Ps#N#SG|Lri>?gEEe!^u-%j+Oq z&^4mj`292KYRD;`$B+8I628NyD@P1$F3$k{`Nb8&0E=+3=d&ZH0sDJr0r*gnte2=eH_5=8`pT;P6p$}oSqTex z>|l7=u!cLK~;?W^U;}J5vsX9 zjeVdS~XTYmz;ilY=gv3?iH4PcRM8l=cB$n|Gy5<~)ERvO=Yq?rC645&+=(+-%UHw{cK z#4lcejs$EhQUuO1k`KaUH???w`f@WvT+7Az>by^&oPSbA*n?8eS47^RmOz!kK% zex6Y9qIK)V_)j8Af9`U&Q*i9^^;t-RL(Vp6`zSI-izx>F`??P)k~QNRR<18wF0tQL zydJ!5j5I=*8Ock_CZ%$(L-HQ_ja-aOB@c$`_YD}Y(>gSf%NC^c-yc!3o>lR)NyXn{ z7iO=!l3U(7v?+DgV9AZG$*BAG5UJ=r-%yG95v|>V_gb1p)F~D$Mqs%0^mgWbz$;?n zN>=Na09E=YE@3k>t8Zs(8wZA&rq@;@!_Sne7;QE3_1VCbx4zZ;`PD{Y)nVE0hD{a` zlBXn&T_GToEWCQj;{?i`Yi_;YBp!g;m%(^p0=wF-TU=<$oJkl#aEM^n1rrB54RyTk zc{NU&?Autvv=Q8P{N=)!-rQkli&fX|CTT=T6yh=bHg{9^2H<#O)n439`E!@BZMc`8 zOZoyiyB^7K@xA$!4q)VN%i;WYKC_EK<^{6xSzY_2-VXUXX9F3$DgltsSlKSuDgTq! zkkn@n_rWRD=TQBM#i;zvdPajKedhe3&3lPc{>w?sI1x#-K_nw;uO=nI$GU8g@E$hP zmc@&s>NG_SyQgkALNlIN*K8uJp>+XI0I4U(Hng;#1-YqqW^CZtiF{w~A$|KFk0OTG z)|_-($Z5Vf*2{5g?u)O?8mF{VcVbjiB(Ce`U1K6V%}Z3?2{gL3F4h_h?jznX+oeh+ z*FS4p!K!`zt}}V~m6_ij=wRX{qVKVV4-|IRfJmx$UT+CDp7hAZ)1`+z&c}{q{TS;LKdC9s2>}TjPB_pT3AGH+j=z z6kv#(e7U`_i75Nw)rl%ptyuJ&6ZadGhJyk4{6}t4Ec&>=DCe`HRMqWm>3Z6IkiT7hkRZ^pH zAc(sKP~L|66wM-hbDZ3t6wiNPuMTs6+5`(=Wcta~kDfztBJ7U)yv=}pqxr!0RPif2 zc(0$7>qB6(CGYW`PJlS67lO&lHPP7>ft8Yn`v;kUVf{SuKKl2@-6)d107iq5UHi0n z>f^_*Q56fB#m}DWo%a*G+`I_Kk-@rhsMIt?QwxSTQyf(q+2J1 z*r>W}qFQNcaW^G?W4azXR%bmDOJG9$Z2LpMxn~*bSL1UB?)?#XGWvmz==22yx*;y_ zxeXq=sm!ia-y&|&hO<+)V1tx-a;mP4;@{>ytCc!E)8@n3S!xBWyVPlQ=JeIX6J@eU> za_QCJ^Y@OEY)_%nTNs}Dguk!|29Z5$(RafK6IlkAb${*>8Db$=08eyZ*k)X(K?JLD%bRj=NPWF6~J;1vB z8b79&`a**tbAUIzG#Q{}08A51&4bLE4qH#iw6Nc28JR-2N%acsg|b+Eo*Csjxu4?{ zzuGGc-9;iC@HiOykv%wwkQSofn+Qr6*zfU$dD zGrC|AA>_82`DCY}bl1SO^2RDYgxZU=JTSOX!NDNR?nAvXU)48m!HY5+EMgEB)BWve zNq}OZZw*sf!FuyEIYsLcrbZ4;23&Y6n7Qu*(%lE!e57e&Q24WtHl)NVrTJIj%gadQ37$`_dXzT_iS`362dUk>8 zHZs>FH!Qd}$gmdh6hM&mM_tLk9jmEO_NOk%2XM zQCDLW`(L?+5=tIR1Oe^1Y&82hNZEkY3tvCF{KewMiFm8Yy_PIUdtZ4g#e&zCKq$>z z29DcaIXu7sDkDibj(P91ke-){lUMl**I>2P^-j5-`IYbSE7j#h(k;6?QHvoavBdq7 z+%`2eV^!%(tsDZ=wWOEUSE@(->~+PVBNvqk5{^KvaT%j8{)KCp1h{ceP#k|-vcUhH zE?N>ECk%8`iB?1NDu>?;?HP+{FMn|nf1Srbj<1XsyuJqG(bXP(>phIe zpS^|>YBFfU(Z8~PkU-VzFW>GLJI~W!_x#59?0eaIPxs)d6com72JH|}$LFa?d>j@b z!U%o6PF@6|lO@v2c#5p?IKJ+WP!s;f*z6lH1rxqrz9$G%@Pj>T&UGmF#x}(WLPslT z_~_1W9)}q3hc-uk&R>~(k72#U_;b`|TQ%OleQhPMqeXkKtY+B$vknCLX}?vBt*qz^ ze*j6s-xAk~| zXZwr23>Za_V;@HF`jL|os>WHg1U>J8J9-xEh_qA=@{h3Q`8yMccA)XIH&x_+Ak`9N z^Yemg?L49AM(EUkguYKhb{=}lNOrS827eK6*8BbtH^ zn+6(p>!ET`6O;yARCTk~tiT(J5U;`wc@|8$Q(<2HlG(sUV!b~WCns0d{XQ>&4HQGP zSF|E5lP;J_veMejPm!>C$~o#as9aZPOwcqkGB;K6_MviXf~*;CNS{$h%SuD#_EjI< z3YVEI#=GuzklfSV!?uW%qfovWJ2&Ahurs{b-W9=cgZEq%QyOyq0bY>5GHmofl6(p2 z(7pJJ^c7^Y+&y2LnncU}2(QhG7Oi^+52HNRQPQexUWbAbam>e>-sm;q;b;)tM$)+) zn$GYeLhOjnMH-}@wC4zjkQsC&Hu4?RlS)Yy8wcu1B*U6Tj_ncqd8OU zsVP&nMA=rib2e7o}iD=Ii@aw#46g-OP=^;j<>q>ECPTLCjOUBf(=M{dvz9( z=Be54+|7_D_z7bA(wBc!+nBv+>tGO7OmZ$o&dAUT7$$S<{*zovM&X{qo(Sln(m( zer2=}V!ycVQP}`-upix$Wf+odNiVIKc=pw|`jp5k$u5C5W1ncx3_i4P{aY6(v40yEG1VX8|77lWp)|JTA`a8=utYIU;QDX@7*~|-=z&S0W ztgm*ft`;wO)W=N=89d9Jw5vdCQ}1-ra%I-7VZD^J^vxC*Q??2iZK4iAVD3zFGBkNqee z$!G^Opvqi$0=HbaTOg6>J#6V)O)?SxBO~ztLMSYxn3h>}wS0It$$IZS!`l}1Y)c%H zocNu?U5DHBJ!dIVY^n*2LN$XpSmwKjwVVhB#|URk^2$3eoA9+{er}0(?+!ahJ>FDl zj&2_^tS@@@4-GrKz~FrnPcS|~aYwnvn=xL-v5KsRSLuMmWr8({C;}nJH({CeG zL{!$QB@TQw^NXH(Jvn^JeJlnU(B}$6Zx;Xf9XR zS~6mduVI~riTt>C?sFL>is&RS*o{Y;(Nt8NOYRjeX)`_|OnUY!i{FBC2Q>B59-QM4 zcOzF(H*TeMb-35#Ay>pCsU~sR1eaZSmWzJjiaUJCYI$_?TgZ`SJkMso*Owxour<-| z+@9oVn6yIIS0dlwOCU+)JJ8KCT+05eIpuTHm^equk+QAx8g^oPlsyUI z;}h`XB*M$L}@6*l2kC%^s7wwxl*hy#vl%f%r+*w>i(p z$Q6Oju4+n?N0XPhE5?Vnk0nvE;J zHCgaf>%7;bL~AKUl#M`0lfYj7`qCQ8{ zFmEoI%0q2Ksf0G3nB&YmB!|J^H~);e8BCmnqj*N9s3AzxB9%kfo@CbN&9%TtnuF3) zuOn;~UAeRGy}y3dm%2YSxQ#GedDa|iZOr%;-R+{^wKEpcGGeDgpH;oUu47Vuyr8+!RnPLe9WT5E=5hP>v)pP9;aN`F1yww#w&jt_&Dzh#`R}KhJn(Ttd6L>) zO!GdIeqYb|(PP5GC8rlM(|wb|(Dp`A!X@fJm@bn|!IE^a0?d+*ZiYhV`Z<_4r-UOz z+2QN)jtQzST4b)?0uK}qd_Na~8xFIA0(<)VoXfhLCbrgJ+QUBDUKX`2!wx3a z4XG4+cNuI}htffIcHwJ0U#6+}n(>^Py6aa>IPh{d$=T3csWM znAQpgLi-L41xL>%5?h_+m6`1C^eZWLt(CaJcLMrZ3(9hoQ~IZOdoMgRgX2*)Ez3@Y$Mu>I>v0fpw-I2g8PVnOug$4|`6WN?TZCwHJw*DHgBJPXsn&xyJA-(C$V`dspQOD05c$`~rp&zNlga z2Hv|2Z~J#YdA^KUnXmD`wL)22b0O1$$M~GVb=h>&hR|Eri<7?v5f{@hm*0Q(aU#qk zqj%7dBqp_*sw`ECJVi{X+i@xK;UJWseI;i7HoO@G5YO}z-a+Wa5dK#uKEa$aqv<+d zDSw>A!p3vfbIoQ=a`hDUK8r)XN<+hlhEMPh5~b#&N^hWq?WZ zu6H3b17o^@pXl@eCJnV-Nkp)@-vEns$9GL@{+os$r8t9$Z$!M3YMB`t3WH-S?*xT; zDBKA0<0@_2buaN+J4Z>Y6<_@6(@B(N^Fg1~y@PGmwcy>5sE^aHGF_XwKkSVvjlaT3 zHsj;eO{c&j7#6%iI0ws2rbu@G`m;|ou!zs|FX*bk9MfWM+a;*;l*D*^Iy7I9k-m5! zbJ1HuFjsT;$qCK-s~0g6ycq7t3}17(x93fLut2Hv%S2+Dap6&l$X9QUKo3zdDleF0 z!=MrL_;9yBtbr^1CFHR`1yR~+ zOpl#2C29?Y+OWO@?|VL-`6BZ*sh#hK=9v-qBMoLI7yEQOuxwuU3J=BH0UOO$2aV&i~FHhsug6hZ(kIm2l7FHK|&#P!87UN{_;Q)wGe z7TjPKp*dWq)k*TWmc{<{Zn?RiwP~Ic*?vTQSVv7iizq*h%_mL$hoFh&1|AWbI*Dg~ z9cJ>ehKjewF&1_ngKnZkt)5$6?O|(1*P7nUOVapTe;;)67&Xy%px!Mxq8DpMy z6{~7$vRlvpTnnYm7wy^AMO%oD!h9i9m=E(Y_ggo&MU!h=^UG1@i-3*N7g9y7s0v$u z&C*dSkWhTnH>l1v;|a=lQ&~jjo+%wWz;pz}au6|ofwruGwM!g(EQR;p-hl#9ebw1?HxALtM9zSHPf^ZJ>EV0ZtH=N({h0w! z!X5Np=dM23;e6YmQ5q0j5|{pcenxJzqTNzk+ZzAaMP{CN!lj3EVp|*cm1-u~zo%`S zXvtLEi$D=%!@~8Wp}Zd02crvO$i3rhzGN`5n_MTl1hgg7<%e!{hJ#}x-X2#lV~J$6 zYs|DXhsBiNp5DA6bVu5QqD{>hzs69j1DAu7U06P&U5^Diz%=VtQ6V|ZJLWT6ch10P zC?(H^&_xi+{j6dso4tG&{@6UQ=IJY z@T0^R82Jpv*pSxM((y{hCxIxm@9rzqdCjvV`?u{&QSzJ6V|I$*&OLbMzRj z&Izl3ncV+*{c1dU(`vB$YFV5Vrn$_^0k)<3SXyxgViPgc(!1c>WpRW>viB`lp4r^m z9nfB>SX10()r-!qnikbDs;1~@`*KUnMVV0>I!S=_-H&ydpK?_4gWQ!JkVPjxSf4ot z12c8xPD)P9lHYY{~qPk9Of6cPL&C;f3_^?O&G?&;Kg_?j2Rx=(B z?js{rSjbFR1cXFTq#u@V6E7RAqsWK;|KkLst_*2R-1zcFJ@xJEv11vT>>y>(g(-e- zS&6%gRb3oeg(zd@?yx$+^=1M z`;!o{q*+s zauD+RO&F6)&>1s6|Fa246$RtVqWF=@litgTR3G+wTlZa>on0q9O~|-jsM$S3D=Sn{ zqTZ(S*BT7Q#G({OM;UHwUAk$&MjxEB-SnM3!34!KLqd=R+EK1L%QYqC=aww(#dgA{ zI*>;k#=0qTFfxw#ddIG|8+wgw)}d1Zg_+M!$%#^`%wGLQ5SaG;xzj`*=c(-*dG|f`AkS$O!S;Rb2K}p2`PZea-$9YkrKnaRlqdK|4aT4U*OZ@MZ^dR`UEObK0 zUKDL$mF}&O6TPnExJn%?r?EZC>;R2D(-i*F#?@Qecdk}pNW zN;@z6{k#!0FD=*pmEdi7S0o#I%5yGg*1eM?+V|Mbe^DAbl<*l<5>aZIB-`wrlXBtA z;WbKclatp6bK=M9fm!FiUD{?;1sTn{Qj2zFHoc&f_tR_Z&f!!MlKLHV6bXndeL1_5 zVEjv&^;f-guLmL97E2AGMJzP|WJp}rj#-z|T z$U>$ua(=yCx_VD~qr+YI;W<1ZJ%*6MyXp@!OJ&L$?r;dVss*AwXN^jz7b~PmtC$rm zqc;grb+t5KbK@0)w^XpBVRZR9Xz07n{nW|#BqVzmX%K;0EWC*HTX-40!SOYs%-vDF zZk8UMRz98y4bg%`fe7P|CtEUBzkH2spKU0JFm$IxeStG`@R02bqmQ}BBs?6{*zh}U zd}3u>N?n((Kbn77h1ZbomrH}O7N+D&4G^oZy-{7J=APVVE|XTibDj8tUNXg-w z+2r@GFtspmBPV{>zq`XhNe`y+&m9>upTVSY$_Y(9bv%Io!i8Xa(a?+xX3XNiSL?^? ztyjEWC0Iw#gocGR%&%vpr5&-Y@F>pjKb z>cE94^}YPb+_Y1L31Qcz_tB&(|q9np&@?IOPa(bcF-xkY-q?_2gF_OvLy~`zRcdDJUtU~S}orjA{PFwo~ ziR79FFxs*i3SP$o_eSe0!)2tWB`*%H^U)GD)a28>j;IZ&6S8BP3W-NqlZYQb^+Dzv z0inkIr&~1=eBYOIbYDNiq@BE7p2>!EQIDmOJ|b6s&y64Df~&S%`gOaE4|Y8={MJ6? zD8i;TV+!LIx!L)7bQR`yO!8V->eu$k9cH~-m~tBw9GA|UXHY@X+}vzmWf(lpsApXX9N`YY z*!+S=1ey8mB6y$-8)O=KvU3|`3(b>UM)r^Iq&OSqz-R=JZJpFq9N1yau>|}GZsH}>BShOPT;8oi+@tGC5$5;ivA(miC3Iok zyh?DVKmsq8`AUG?gD2gb74*$$e~sU<`QkqUeTj!pQIh~xk{mY0sjN#f1^22C#~9=f zOD0LN-uF_|)Rod)ji-yf!ShL+XAHzmKg)7i#(w$DbE|#ER?UmbRjNiqYaa=l{4}0L zF@saC_~K5Aq(@AZz}-G>tFn9MOjxgft--Ifiwg@)o!$pdR%PhB1yy(mO}<&lK+HTa z%LV{rJoReC*cFix|90LQ>zkO{HH|s%5OMoIc$wc4J1r2o%3$5E> zbvo9TLGa~97eu`<#S<<%FIC+-q zQxH5ACR0Lv6|9xtbK?aoC*dykY@HoW^^!5{CtnHNP>MncrAlB4#WN5m4n1)qai>5{ zneOL%`nGV*dfXGd=&NptIDOE3g^Jx1f-DLT%J{LKXS}jsA(jBvvfF!%w&PnlJsl)9~uEl*|_YNvxB$woPPtuG&qjAj=9}Ep5a<-9sTGO8i?-85d zYP7vKwo6khE>5v?V^ehEsb+X_^yx>m9$XUzeCncShrl5?KrywxjqE`fJ`da|hrzYH z9N9nPhgd2aHxmaC%aKRcIF%GJW@M9XZXh;zIc~m-XTRipw_8iQ`7>Wls}rukPh*F+0{oQ$ESNUCUR%eT(|Qu=1T^VFdT$Z`f2n<#ZKVC{qlw{NzP=O)3ze4 zMa$Y_Uf<7GA8zOmdp3Zz-?bJ-s8N~4pj`F(_w=HobNTlzn$fGPj#ejw(0B)p z$)_YzV)U?A@${9H0*m!k5bjxeb>rQ}cVSA5U1I(dXGfb?s_F1c*O;d0E>rn?=$_1} zbLBL2%9+S!R}oFQ?NjT+rnerIyJtUoLnUHnCPc<^w5`6xar9k*R&1eEkr9Qmg3PE) zw6~oR?`b$Sq`@umI90^F@55T3`{Qr0It+~D{gRdA;i(JBYG0RaE^wUtl%F^gB~yUk z>S4CJoUgj9Z(@?F%h_+Z9*xGdWrk$Uk~7wXwK zrB-7ogE4l0UfZoFr^FqhDix0K?vUW(`ikee^^ISoOvjSv&uz_$69j`qPNDTqDb)h;%hA1DU+@uj+3Z~V_(iC7cD(y5!jE#>!FJ?S^M!g z17F7ptd1cGpPMDu$;W@!jZ{|N`=vzbIkPpJPd@o1$bQ6B*?WjEcB#2mGu^CQ)2{Sp|>BrQ0iWk%$ zxIcj3cFrn|UMormD#ml}TmA~~%qEYzF*457Q+m*nz)baf(Bq3DP)(HK$6a$LAMGI+ z(iSJPrBlw!xt5ic&4spEXL^(KylL`wjQn(dQsL@17cuXusE4Bj_M>x>7Z*$fu(h_< zk83A++eLSqJuyaZ3kfXr>)$AtmApHrldsL$%2wARu621)l&`dXpTK~!FC)K77|LY)~~rISLF8`Lw33G$gbnd zAxM6pg4~&9giPJ)85$1DG1r$=*Rzf^d0TEXgdRKCs8AUUQGf8|1fmiJjhUErts`@t zx%Er&ogj%qcg&5P;MS$Ini|yZ=CWQ~UzGPNj;ULh#)fSI_OaF^DhZ5S>V0Kq7@5(w z3)!(r9%GbZo2Po)S>?=0uT+hF`h>IXV$4g(`c3G^LPq^RyQu`vy~&Zu%PSWB2^ctFDsimVhbpw z?Rjw7ZNZ{kt-}_f=n}_za_35i0mm0)Q1ll}{2$gHa%(eQc5s1npY}XZSdIODxJ`4J zHqxFHBZn9=R&zf4^rcia#gA886x&NjqkiZfypF}r+Ve2b*Uxc2-DAtwB^Wv)GD^Ix z#)NLN7Sf{&tnN&~^^R82tHR${f#I(;ZriUfM4kErdybyDvwoD`2GF+^=_E!>nA&nGQ~ z{Wu+`al%bRsnH%GJTKiR^>7q|uWv#OCdT^xB3oaduBo~illjG$MFV07l{a-|z06e0 zG_92tf?C!QfliATD_ z{)NNZQzPCC@ieroN3*MTL_d42Zq@ppdyHX}4XYbNmorHFbo1onw`Aq;h+C1%C5nn~tEToB|GT#<_9 zPr1Tn_rq=~j!rN3-ZJ;Y6$JjCHl1h>!q z{ao1c0#!`07&yqI2Beuo3!RH;xH9$ks!mT5!5cxe*Uu#yKF+JHHM|r;uAOu*9yqGF zxzGRFZ#;ffO+|uxuJ2q>I&n?+kBa{Bk(2e@$sfNx1Pg#w>7#7#SEWo-+f_a>N|fq( z4({`m!NjMImp?+Q6g9{l?sZylFa zw5nh;(;%OGqOj-JODnbb~ZV$DPZ)&)w(lbI*6~_uuT<2Q#vJjC=NZ#LM<*ke#t@u#q0#0o*m@MeqCZ`t+5k=~Nl+!B$;^^|#YJHMAF}|g zXRA$+%A0l>`cdJOlA~k0yX7{Gb*c>EpaXb9f-n}9|2CWapk`C_tCY$#1A}&#+hc0S zg+HMhiOlh=uR-&AFibsnGu3FN1f4g~NNqa@z}aE{Eu8(|Hmo#SvkwPcsvZ&%R(Y_P zOC+bbSzdK~xA3`NTr?>1GL^h;8=vQ-|d>v4gmtLAFnVh_z z09uh0mJ!E)d;a!?Lq73Yz)>^<&#RoNrK%bLO-|4hH26`j)3BB45q}3tJsJZi1kIS+ ziQmAi{BI}DPlj0fz?nj8yS5ed7ZLaWsEVhz7 zuB?J3-&%>}mYN>*NRCbrBXR^IV_2HcvK(BrL@pw5clu=OtuT>lJB4D^zRcnwQN^wJ zII=NL?Bi?>@xo-I2rC9zz`B8`#Q)@_b%vo|VCB#F67WZ#L*o*PrO2RQsn<-nD5zt{ zdzmX2%)%ErwV8L7B`<2B{IVNEXhL+J{IMY9)B8SGDinhi|9=Wt!#)B$+u}nPq`mEP zsH^b5ZugNC?G{(;Ox)d!aH_~V9k!=4q@VZFl+DiFF37yK+hn(kVzH;ui-105OrF_7 zD^%L(r~Fi?>R`{|T6hKAK68E{!P^h|Sa%97L9MilzP9SLpSoJ}*BFzv!b*lBUr$ui z@GyifV-#}l-QvV!;UJv?a}Zq|uUTONkKGSL!ZJb#mf~-R6Dm)UT$?#}xjn_ltn%s<<=Q!*11{n#+;(vO`U9NCFX0w(MQ_mkzUqpfeU+;KB}bP*~V zmT=dXzv+caM^`qZsJ~0jbh-puazp-ej!z0^^_!wo?Oz!Y`{7{> zJwa)1!0xi$_mkE1l|DA9apC0CP3uBQL7EUOB`~37@z1LTpL@htIT&SL>J=&!`pKW| zi57w%#?s+izX5>&UQo=}t3Mt*wQR_-ry$fR52C9?O@XRwdNzj)mMI0tJ14LC>_G*s z@W6@;1hYmV0LN%@gd*E(=N*2{-ZxV)SkJ(@Qv11QL(9OT>H97wwNvv2iT0|g2C&WN zgnTkC>3^;N76H7}zdphy@msp^s&evh>&2i*j~`v!yKv5bXoxHFW}l=@`GTI|(QvuM zPH~u`XZLXaR&_LNEiir&#z|{QIdI?a`2$V=(*=vNruAo!ztjui)DwJ;C)jPPedqR8 zVl0oXbe1f|qrNWhPoDza#SH~pC2o_V+loEtzF_rx+4MpruVlL#_X>IdS5^-DJOLR6 zrsWiJyrJLDYs-+wf6mVImpIavEpY9-)<}Wrr`yj{Gj3(;ix69u$Mm|`4W7EeJpBw2 z66eG;vv9z2n~S8p|DP^s`n-SsWho?)R3N<@pFfX^6@k<{I|%|jFS*Jf^1Mj|36*$onIWn2s7J#mjz%i^V?lsI_^Wwl zk~OQr+ZXIqAd7igI}nJ^$i>Oco%a?b_9%g@Gi-+)SaZ zE}5EM2Ih;k$W|y~x_HihTmNfmQbHWE8X%PUi`4Z;gM^Z5oQBYpp^6+}M}GA5>kKG@ z4>x?C9^cmN8G17qmu9V_-_S#y3BWv3RdbZ$34)_yMFvk;_&6eIJp(*@bI?cPWT@RW z^|4px1fp~I}-_4 zBWiH$L{xcOb9kh1nRq{GFjk0`GuL;Y9a$Bh+ii1vmotdQ=55rKDF^Nja?=5`pn*v6 zsWo4{;3WEIikuoV%W5!hgaAK&vn)X^-ZTFD8yWKb2Qr?Ovg3M)xue?*)5B=mp{WC- zPV8_UF}0%mf6x|M#6I2i13p!ttogBa3*=Fgq$){M(1JsejSPmhS#hbUsZ+E^9o~?! zKpzh7TVX_C-=}Rab4)((3m6zup<`PXN5)c4s2SYvmG@o^^jO0A9vW~2S4kE%w)8oT z*}XZ+`NcKurh}`(QMzIH|98a$k_=4w-|M2FZ4XhR6~N2h8+DV=VDw8^2Y<%_d*#}j zP=aeez*d1fP!}^+bnDgnhfXW1BE5-|6uM-ch}FEU4pptx4x<)zvn|DE)9Q%2~{>B zxPv2RXu4DW%^~6VRZ$%vV9~vwU69}KO_;NaQYXq}i>e&pp*7m&*tK7F2Wnt- z7?9rHxG&r8cePI)8UUiNRF_Znh~l0vIPPvf0-H|Q0bg2y2bL?zsCr6M9_AS^wRFV) zgVg&UjF34mo4W}kJnuHR)<6pVJ0YHD+H~*c@&!bO4kc$Y3EfErd*E9n>Au|~3$%&n zXS>3l2my>89kq~i$(}78d-X6ugqtx8`J^MiBkAi(`}?tW&_DyjNzL|)rnP!bUx(5&ZjF`P|B~A8^1Igq_G01^#HFozQdEG7Q;{=e2=`< z?NEM!V?dnslR62ij>E~3XA0OtFXjC}>1zxOiYl*@9qJclUm#ZbK&h>iM1YAq*g7JI z=!Or5PaPlboB*kmKA1&*(1m>CPZ#F;MP9CG8rxbRhhl9R+d2C|+(%~c@5aUP>?cDy z>PYq5;t7Ogg~O}C`^()ra2$P#R`YIv?;yA3jODTuF8BPy zF0LJdV=u`Ko@Sc{YftJQ4Ap-9CcGP!P~_gotOO5`j}1<6AIv&1aLKl>3XexEB5)bH zu93ht1%aye;|KkdX`=Z*8=sqvsYBI=l@fUzrxQ9i*DX;nE8m^G=z*6BTsQ*wmi{H$ zJNrk#{_&l=P*V;4Rz%H19`_%Lw4k;X2)txeR+Oz9z~0Zl)p0wv$6_uE@}p-c?Cdcj zfD%!1>o2JovD4#$3+6W!eJj6~FyBwBss{pp_c1naOJXRz$;hp0rK{b|#mQia`yM?h z@%6ojvRjA!cWnpoY$gZ+{CV(m#&_T;sz1In=7CI8!?@(%vjq!!Fd25R;80@8s;9rO z<>+9i7Z7FvfeDW{{m9Z`RuSMAJws;nr94;_sYvSYEjrzpDp7!C&VVWhA6PvpBJ>>7^x$qf31;Uu zbyJbq)z5!}&iHfS#unnC!3BMf$Qo{GhO$ESTWimMFd>pRzq2ixfC>kfS-rGwTwsL zX9**N+J<$c1veBqW;;|p=CzuT`S(9bNfW2zg$*}$aHqdnPd$=40l>2fbFc|f13hER zzWoGl90Z@VE2Ga=^jvrh+I!aj(l%V-5E_aMBoM79mG~xo%~`7->w6J|>ybI1)c9bm zE1BXjqp2F+^jqGyxub7{0D51_QBy){@V?^T3DyAg62X)9J|4U;j^b&=7HH&1fQHUv z!vUAZH*VifhKh2m3JFnS+-(66O9#>eZJOq_^DQqS<4x{PTj1PAn+eMYgKngn7<5t)>jEJn&h3Mk}ldZLliQGac!sXK4*={Ue^> z*2ye|YIx{9@Qh+VJ&p2b^;?tQ8HiyKmZ6y#MY63UWv{ugWP{Ra;xSL?{AV0t0vu0I zUT~f`QqY=cRZc#Y`{+@j*Y?%T@8}&ym<3TgVZOn*_1G15!2s=vQB57|5>m1}w1zwx z;8Pv(6T|tb59`l_s-Nz+c?+dRwjbE#96a#j12{L$&da?G)cNb^yVt+n9T$)7=PgAt zQ&eAn;Kiiw8{rmbh#iouIHdd^lhI=JDsr5fwmO9YHvXedoNy;pBjYs^arvgp%Mbwoh9v)t@igh&YkX0b}K@WgC01GeD|4>vZn(iGNp~(sV zINK@K0RJ2!po2<+bzVm-3!DzTdrS={@`Lzy_gx}+fk}#Y?#XQz+uHS%KCX|}FyKih z3*;J*FFP%@J=SsgBAM;N<8Ih7PcVGBHKpj@`3a;%?_^!bql`kugk@eXoQ zPkz4fyC~$RGJ!)5#_y0)aCN*N=!L!LiA>;PWUi!PN}w>vr!>MyoiRV#+_ej><@N+N{N;|f!hJ32mXHKCVv8F9z3zC??cjV@7K^3 zW@mI<7zVo;oad#;{z3OCadwIX0xq2&3y-fZsH%BOL)o^r)B!g9Mal}3{^buKKI2@a zKd5(FxM4tQEwcyQF$_xlHI?M(>>3C{J;`y72MeK?9q>P7Dtb?^_ z^}h8)DJ*~zqqiwA7JPWkm#BZZGatJs7sM8_oWDDWd&LD7*;tgN?Z;q{*!kjZ8%RIJ z_*cuAMY@vI@@|I;*uySlzDf*q#d>R$b`zuVcr=>-_FYD%j95_fiborYrx zHz#lWV~w^tIKD}&7i1LVS0uZlEdBm}kXR!7f(kiklwU8>sth3OiVhoWb|ycAkxwjs zPq=6$V3Qk{2radCC+w|9DVP+!5ZQTy^p>LA#)c~UAAG^@ttdWLq;o!3Oz^t%1Ug?t z3Xjw6S~OO~Z!=hL072mnamYu{xG5GCOS$uAVLA(WAaCpU8KdLQALB^lpa%;V^#h83 zfZukN>XBM6QzhcCZvB=TcmWld0ZUG5eQ*~TR!%SGpb-Cl{WL`9u@cjY6l-8%(rN0OW{+l!SO*63qeF5mmNZx zjf|Ut_kz##($oAx&a)Tx6>4*FhRpwojJ-U>2MZ80NV(*5t#|v#%WAPxqwm%F+q3C~ zw~>QLiFvEHT?vPR6HkB~!~%RcR%COxRxzQ=iTkvziPLs4AOJ?%mZ%`$b5d!4x(}?i zd`$1uIw9D^Pnb-S)vB(?#|gi!F*9qvRL}Pi)XevKAIZ2y_yQ#dQ7D5!6#<2=CKs1Tr!DFzrSt)v70RV{I z)(?C@H%61*_30MMOoY#^-L8Ro%4QK)43mOeUm1-h7J-7T>+I*T^dCgmdKbDOmnfU} zVQTN(J7G9|e!~DzGS?)A7!*2EItDR>f@JxD?Dq4(*A(YqO@mft zNmw^(icR5qSmlb9DYOfEWl+sX8}{*IS~p+%i;@@cJ0CCkfLcUszpn3za~erg@~IF& z!AVvD4dR>7Lz4fEMSu*!U$V#tsf9C!O;ro%v2kcJ^Ev*;fo*H)&2mtA+V>^C+=76I z`4f}>|8iPz5aB4TP+~v{7t@IxgxjYI0TNUmqWi z$~!*%QGNb?HUbEuI(_x`46uevMeC#ht@}F>PpJ&kaji4aqt<_FZ7kfYnwKW7f^a9? zelXhY&uDvd0*H{Wd>B%a$qIMopjWN-Rb+`Drj8)G_2pt?ht-xx=m{c_#Ox+$*#Pg= zkE}kE3PCI&MmPuVe0Bzg{S5wFEF}NKUs)aCBiC02sX<%wPwrZ81W%79wrY!t{x`oc zfnl`0@>FExwGk4iDOIfX#4%=;wwZ&9k{7mL3!&tH_k;0^|8il9l6B@K?3I`!*lJ>o z(H69+t!_eqP)773ts#!FO5`i0dIh0Z5gbG|dK{C z(!g9l!^TdVgXF@mu;FL&zh9Vzg<*H`+RFI7DouC~bYht=MLDtfCZ+gl ziTTs7PpFm{#qDL9HE-2Sza*#?8g?070|NuvId8L~LBYJ) zZ1%TmMbLkpnqw-;02vL*aCQUl&x>u-GeFDtT{M8@gI`OW5Bt)yY5!`Gj!u`25PK_> zw&t1k2Q}=jO=otq_e`Ie&OtnS9F?>g0US+pj#-x`aPQG@0=lW(z}QX#7n_iSy6_+g zkF6I|>WqBoSP!F_(WVQFZ(cJiHQ|e%Kq<4^_jh1hAmh2({mx&qWl&=g9hvUyn0`At zy+g5lNdSSVp|6wp2lC-vym(XZ_g`;#2Qc%05h()Zh@cs2zhH=+UH(m1qS6J{;P{Zd zejdG2#~ZKHY&dT=n!meD)8wxs_$*LxrUmfbzq0*Q@U+EON1ug&we$-xFJ@%mp;<5( z&poSfogAodGn~ZK6O>octt7(6CCVuSta8U?2^fehdpG)|twDiYd95ESEoMYjIrN$( zJ@TXmH3CcX1azvK3{WF3d)j;ZamVF}aSzoMvppG+|x&igfk zJK-!a#Uc`JZ-=&GtbP_P|YeUf^oU{v(%o||gBFswggQdxsjQJRfIt*0$JUx)U5Yu5|jp2Ev}f5h$2t=w~2mb`_~? zU3rVeO7#fDxga@j-_wI_G&kpq&YO|~ma1Ao_3-GgD&Ot>+HUNj13<~yuvEDglWy9@x20y-l=mS8N(qSinT+4oz<9-{8 zT!>Um4KG$jUT=^na}?n9_)=_9$BpBIZ1ctbZ`DLMWE5AxZ0PJJ`daM1o}%OT$?fF_ z6`U>|JroxsIHK3iAGp;)$JK`J9oZJ}bSgFe|1A+LwDFosxDB=CD-weL_vK$4nbjiY z4KueHVTB30@_;N%W%-1~rliwz_!AS}kLZd-LU^Fi5rK_6*u{#;GxMa=f{i&oJp z5j{-(qm6xYKz&Y7c@&eGK*CST(dru55cHOizWSCCG^`aKi^2djufI~Wb?PrTx+x5D z|9=r6>bgGypo!F!o(G`5J-f)>um~+jHdeq1=+zW*=AKD=WINCn57eSc>saZcU4|hnzyfeM*nUI zo2Z0Y6CWQ(4{92L!Da$Sw>U(XJsgGw39aTukaqT(r2+?4R7GN_;R*1GOso{(F(kqX zZe*cIXmZa8Rw3nWnoaiqrbUU~cM*+<&?VL+G`dN0?fa>h9kEhne#P@@!zPdEq==Jq zdo8_wn(*3Bt{I0V3ogo)FHcNBM71`*152h`$-3WI;^;rOmVn_ueUVv;3k)CV6;IFe z?$pO}Aq?&IPb3B^fvphgMiEPjb1F|u7;=nxV6zdP>+uuEdxI|;AvT_Mq5XDgLy!c9 zz?M%^vx8h|RNVo}MWY%9Rgzx_Yv;`i2cPwG)DSp$J5y^LCP>;G1FzG4y3B6`!mxfg zPgN3h0i;b?=rrDvRUO?OFR#V{&?nGNLrek!41%+x456oo2P;J;VjMRU%<5CN;0;?p zQCRl0WoYCg3hSNDu&_t%+^PT)2%s4-E}(H$+AjmF)x=7d1K zYS{jvQz(Vgt*n_~f+8yh8B13DD-Fax;#GvZ(Y9tH!9LyP*MIud40^W|!V=}o`se@; z-r9aTbn#`{#Ko*>jA;d~l528YZP>45+;8Z9&(+0SDu*$6rty^z}t8B?Lr${=W10QrWO#6!3|NOr;}G$C_7i7~UQd z1EFMZHb_6*>%vNU9_??dh#~lB33uwYHRQ||&8w)WeFSGLXKpe8<@3>Sg|ZleOIHVB zRC%HF27G>LIj7%TEiVYZT`qc9Oh2VKEV{AetRiQZztS;YZhQgo_x)u{2TWLmtA*UX+8#PVgU8Q$cI~i;DAtFe zOuxLRB^F)J$SmY_?MRd=hJbJ*1=s%pphU^jK73+Gct1YKzQw^8{Yck({gryz_%L8 zJR4rd?q?|WPk-)WP`?i-EfgET)~>8YkDP?T3a-VuS_DhV4`c1a2i{&bh@fHsn%;C3 z`RG>`ln+eh^jayQD)-t+lk6`#V#OT1)AnS{G!Hv2gfR$$u~ z1vkUyiyS#Tw|Z)uW_H0$8AKF_uCbcCvHDHiXx^@s4MKOW!WJ~P$mZ^E?4Zh_91~ej z2q^)iN&?MjE+1`k!DEOie(3wpxE(-7CI!san+8LXS2qoL z&_4Am`;LDm1thG#aYlZ$ZQ_n2tu+OQe17+|shoZkG);!R02{0q7GeW-rzk!})0G}4Q3@%Xo zKl5qn1F6(qjzU$51DECj_iWXAy+w_h*2a96)+Q7d3!Oq7+b(Eh7bPHQ_|?SV&!gcI z9Bvm>W;QF9a{$aUP>9XLF*Jit1B{qj4z!h0Db4);IlSXPE^s^fvBL$#6I^Gy((=ih zsj}@>2m?^C*u0JlP}-`P)(3Hj+dLjq+V1yh{LtScH(Aou%|9v?IAnF*-1HG|@JFd8 zdjZ~ivNM18jS?Aw>$Ohp-8wz&wPbwL#>D}cZltaj9b-Y$CmH#SVOx~sg zU0S#)Lx?t8^)b!30r+){ex^Wjb>}0}?I2r81QW|S=dw4MQmX&e!ay$%hd5qR~( zBX0F+bbC7;jE-n@DNjFXlh=(q){cQn*#YZyhi|w9SyH-FJa(t1GK{3%VryCLU(2y)kkfqE%@N z8xX_=Fx7YY1jXVp(nE|2)_tK+FNJNlmy`oc!Ri|ir39g==ku27uvb(|1Vmw9?!Pqq zybI{v3q|s8g_cTwmS_Sts+4yH-eNOqY|*Wvnzwhvg7Mt9&K{uDvmh@K`YIh`Z4~98 z*9KnFoG`7!>DTyqP0{kw65;y>*DGsIuRDPTGqySzEa_0-1Rzo>G{U)BluA88Dy4L zw1+=6Lufkjd58Mfyg$M-!QYuh?IK27_6e}UYNO{TA$E_Knt-w}Sh{UmoV_;?J_^>* z=PTZ=ED+hEW(Rx&w1LhJ?xS(ggXMAM+i`bgLIfS{ORXESq*UaHOSYy0x#sJ5Y*J&C zw<&2nadMmUFNEyBUM{8kt#;+Q&(6#93?V(P`fmELjRub63i|X32I7%@2Hnn~6Pglk zv>(k_FFTDe2X8z4U?hiA3?>dkgzuwTmv9{5{nuk$mAPhFlw527NRGudDo~SB+5+V~ zIX6*6Y~CiF0v%sBvRC6Cef=3&G^s4?vG1sm+YEv8h!p}qc4}5r2A+mHJCVVD*BjkSI^Q1K+12I%})sUN>CqAnc zOO;eJ1g7d!a?32HNAe9HM2Pg$4?e6})X_0-`NP&NeSmVjBz_7c+GqZvoZJ45sbCY7 zyS_8w1pQSE3x|j}sP=Fe*`}8etulDVBajabL31P~akjw#KWH9zt(GD7W8o+{N4p7k ztqw39GTU|8x0lAh(bO-ay{C^;i5N)Z8Z9xz3ZYdbGKBM84yQPCMOZ1P7q2UYwHdn{ z8U5IRPW#S(DmJ&4h*fqrK0yvPx%o1(ko8}C6=@

!`2b&x`4wChR1M8O@zV{>F`%cG=dCvn?aL zg~n@|sP3lxgE~L5pJs{QN}9qpLA?kKhlu)E_`qFkQ8fjg%Gn_}&9Evhuk7s`k(FOl z5v`J3JuljbTr$yQ1y-N(xrp*gM`i|1#^Ijyt!un`0{}EC-&m8F<_<=6$ry$2Gu|g+ zwRaar(GE+XHTA|0Qqg3l40ER=n*!zS_*&f+FfMYv^^pmizC&HHzO0#ECa8( zjT93-#BYxyuzLa#g&J$`mt}s90|G@rbb7;J%I&a8n9R4{|G$ql! zCw5vXbT`I_r5lwTL3s#PkwWQ*SkiM`2fh1x%B1Q<;XoBH>ja8btS$4aBiTOv{8Qk& z=-=&GK@35}r>&;a*;{}d=Y4Z(SBzf5tm_--BPe;BO!CmkFx8}-VQW$)QXTVM2yn)yu@?W!7jy4^zY=f+)) z4F?+=EpFR=KrbQG;)n`2^h?RJTC9QuOZtmH1r3bBwtuvu^taHpn_@yI`trsxBp-i?0H;*Qjk(UiVjqwcz`!mX767a0XpXs# zX&ZU+xuC|r;zMrI}3s}`=ne1v3U1_xc4!6_ecY@>!fW?mI6W@P(-`h(i5f< z2q>7!n7PDw`qk-qh%Ps3Kq@UrouQ-AP3s(2yaOh*!kJKr2gIt$J!_difeOy^S(9tw zl>4{dO9@)$-zx+jvDO-J^LgLAb}WET8wvqbJV?kg(yAEt16b}VW_xxQ;B$<(Yc~J- zC;S0E(!aqU8Wh8Ur1`^)3=T(;A*4TA`1x2X3*fpk(TH?(v6{rC{pYG1*aijqM@x>U z(f2SHCFPx8s&or_ZDk$_*^}t?J#}?)>Zora23;{jFxu|*t2f%r8ksK~D3U_XumTKK zpN-AG!(vL$xeXT3jr(&Bk~JI$C}Lh*Fc`+g%SoZL8bJSHKs+q zzAkZUtCvXUpk~aPWt)<9FfD`d18Un>q`R?}s1@#u&_fu}W7=RB`<_aWWrI|-anQT- zDN{d`==`S6e2HiI_&Jow-6mHl$Kb;lPA3k0J()^sSJf*QsM?|X)2^%Z zC@VUvdDTMeFz8fikx9O8fL3@T^abFS#a}UG-7}$cM|}~{qZ|e$B{(%r3Zyx}BV2F) zzCRQUZ|&ofWqKyePh9-PWC>xa;6;iO*72h zShU!I2&X$d!fFT>IXL%E@A$7@VO~I70LC4SjEvARFjSMkW`t`nr!YKhE&L1_6>jkd z`5x^)U_Ac>lz^7m2|@??xkUJFV}+i3!?X&`-Z=Fc zuU;4qckbn^+hB7Y^&1Ng6B8O9sj3cGk=kW6K)Ykz)_mUhXQprbreDjD{_^R zDpGk;(OA(*B1y?Q878S;4E-bu3?DLz9ZTbyEy$+xzWs7{7BaZyPj9(Kj{Q_a)B0%t zH+B|Fec1Ql*Gk*O=r{a5zmGcFs9)>86^a@ht1-Oqz4sb`^#KEB#3s;d#NZ6#!yJ9| z=MrR(PRRnK>2u4utI5F5z^I%mvkAgg_f{nu zkwkr+s1O_h5uPBOh%ALt_lzc!4HItIckCX+0V@|Be4^K*F{jSC8t5Jo6Bi(M34WT{ z6Yhbx>J^+)93mw)Dam^x?8ZP7)bhkBi%;>&PGIV~8UX#G*RTm=X|Dr`&>4CAH|{o~ zTC21|H0aAVca0JflGe9mXRWJ(8Y?Is*90Rg($C<&(vW`ITD85^wNighUwlc$b$BlS zysb|C9D--uRur1p)6+f3rbGAet|h{#Yvk5wdJ&yxw}OqJ9Fn%~h6Y4XG%rpRRc*hm8b`2RNZOlrY9+YHao zuyOewx8CqUJBTn1Yf$yznw`_b?BUX=O7-<+GLu|T9BtkC>)((_vLT zItF(H?JwhZh>eDyhH14ylPQMZBe0sU%XmwnXr6IB5ct7h?cN1E%*Jmm6a}GsaVL~- zLZ2-FYPz5WI#F&E7>i4%Jwum?gUy{=8mg>qQPu?-OV}*u=l|f>79;kPvIQ;8`hjs} za7*Q*+dBn)qmn=YF(8BD0pKXs{5njcbA!ui9WWopk3w3$xnFTlQAjvG<0~xCETkMP zZH26j{P3UzT7>;4(Mzi9H0m1BhoXi;5l&%uUv-Z)SwB|Ic-xJltqUj{#Z`a0&7@S$ zdP6@$eO};5DXbt}3_7OFY7~sd8rO0#j*eY1bz>q)OLgLz9#(M?z9U!Yb74{>?uG1V7_R`);8$}@efpsDKtiqSmM7lE{N6Rc|)}!r55<{xF`BJ`4e6vDZr*lE2cvw%} zi}%^)On#HfMIRV-C?vM04^>aGuM8B49yh)lYqfpI(qJ=Juho9$2nNb0YMgtS>e|mL zTL%YH@}q7K2tx{2xA+)Y z8=a4j$+Du=`UkgBmk!L zcYr@F=pL}ffPqMmz>rdmzBTG(Kq|#s|5e%QxKkhB<+;L%Q5Blunmtt*E$v9Prkpk2 zg8d^8km>yk+ zwZXKn10FG}C!?)N)&1-qvGnlOnjiU^XS`wUx;evO#Hjt9toP-WONq*#&EGu#Vsb|# z3tDM^+hX+T8vo5tE}Ydb_skM#&`Qh55-Nz!L5m^TWA!w^R*$9Wp-Jef?Z?TfNzU)( zwz1rl)vmNP0!D#a`s>zLw;u1Kt}f<2KX^+b#II#|0=36!p|)zrw>sqeyB5Ur2z`xU z=2cPJ&Xd!kOKnC!TEIj$>p-a z&r<{slipvrdJ#aD|2U**I}$2$1N@1lAvrgg*CZP^EPs6EsZno3{BJYVZmX6Hl5{)c z6of*=!*Et+v}}9Qc^zoJXlH`6L@(BUnq~Fe-`%X(H&%LF*lfI`mzAX!rz(v%6lG^% z;(!KBO9W%w@IbI{h}h*KBIHxarIjxxmgZ<}gxVOyg1#UI6GpF8=}#hAGt)ANrG^5< zEXL5CdRW@zDuR)pjVUgn@ilzoNyj$AzWKu+2SA5R1NM_icJ zQ2t{Uzz6!1Z8yLz`Ol)Hp|&3xKHg7>ecb7viQW?@Dv9>E1xXH_RDo6{@opZ)h{_LQ z7RDX=^+BT0W{Ph3G&Gc{s|)uM-3CI&HKB@zo-Mp9+s1h!LleXlHL~gfdzYz>3wvrU z`u7st$3o{zJ}H@G&T+9tid=O)@G0=G2P^I4j+NHUQ)tR;Rr;mA#V6nxUjeDuitWBC zLR@f=s3%v%v!|bYsDwbCMOWI;A872kF=!ui`rRVEZvFo{#R@?DLWg|({KuzR589ev zR#L6XJJj2*r_JXO8?QUTp~K;V9_P!2X*QiENjh8>H%g|L$5B4d+m@ohakAO@`C@Lu zU}9W>4Uo6zu04j-T0od`FMPfTt6g0p@cn-Gd&*W$-_(-@Ic{W5qbHv}!eS?gpsCn| zD`LDfRl3SUH?ldq?@boNjL>D%eH}gkI%6$#D4*$0$za$;Q8~{9GK3bqHm`1f|9U&E zT2v~pnlYzZH#2wWomqI^+;^p25Y2b|rKKb~0-GP|rGA~~Ei=H>F96}1Bu zb_y$eUnrifk$_{%dI-*H1Pd-~SaiiLSC75vte}Es1AJ0FrYM27v0+J14U~ryd=A*N zOwBgq8G4%4_sEm(9d@QXi)t3UV+-Uts-qM4r9QO}k!h{`raar)BaJxwV|=%j=R4P` zr=RFG9Ct%g1HLpezqqmM4y>ju>Occ0@c2KP=+fGj5(pZp)$q8#VELryN~^Tjyz01f zzg-YA652w+N?JqcjrzuB;1@HYj?vZVAG(W48%)UNxVzm#%Ha- z%#AJcZYA(C@QI!@u1I0;l&axdkirW)Q4*Ry9o}N4j8m!mo)&|X3iDI#VJWw1sPbi4 zsubXMCd?hG{V=1F&-BO352%IY{{Qi7Y@Da)178C z2e{nb-PZ#NZK9#kW#ujB?;1B#tz!F3%gwhu&wrW~eEX(_!=lSX$YqU2d?!+mUu8h^ z;;R=phiTct>23tMs$*&HIvwB8&u1FBwRXOi1+PPdl#gn&JtL*~ez0Zc`Dsa8qGj72 z79}53rdw#;FQE`*zMwysB$OPtt0~Jvo5x;W9Z3@@%fFNQ-xYCjoJe#v*@UFA#Hd;x zSxnsXS5@fsUR1A?g_L6K!;Z%;Cf-wm_(=B3Xn^7J9QwnnS?S+z&Z3ROi)Ze+W`H>$JG)hv3 zgEi>$gjL3@(C>&6HO_on2T`xDa%W_pBx;Ad=1kEIn+#BMg{SV2OK=||+kAoX^A3DY z71-6~Tbr#cCX44QqF1MDz{C){y+t>oo^(BplcZONbtf9O2XvOvo33CsPY7 zQLJ>a)i`W+S%x|<0RcKsY+KOV*jS9~{T!9DvT{c)`kz0NLVG1=M^|G$Ldhko99dW4 z=^h7-ZuJlI4rAcBZMiT^FADInlwan>A%XM9Eccax%3Da4Log|+^*a$#A=~!V+}-5E z^63t3b8|tP;}i?+pP`@UVG4<(_38M-@6(QrWt^|cxSppXJPES#m_(!_^b3Mf7B#Aw z)H#yGmD)=9?YEM&wmIa?AuwOIqdaw?R(} z;orHU%4u#bP7Rh`ZzpaCrzh>MEIOWWXMn~9cs%OQQ>InGT0^O~m;PM;K6?Md>xY2K z0`=6U*SjS5)6zZAgp?)t@q+C^NL9$JJ8*KEEP1m1_FGvN$CKif0%kN$o@N7uxwbLN z%_S>Ay_@07(X*ItmC$}Or}iaSF|#Ozy{xDYabY_T3{R*MldqU2E6;-k z%9L5YSLh`3Fh$V5kSPbWoPo?HgOK-!apePi#Uz>WD@D$5>Am6Kvzz0xe1@fvm5n4A zQsn7hoyZ4Ne8=a_yV_ITbWom0d?wei1%{D^!B~h0%fMZw{@OkMe*4TegwYcmfVr$C zo)?2^D9X9JrdIpt~h#xp9}s+@uX zaWI~WlJ5%~609|CHfpfT!8|=UWxgYr8oAl_r!M;{@PLo2@YMJ!y`lp*uBv^l5_#DI*=j^rRBL#|bwLP+CMGc;-;E_umeb+(M zO)vPqJ6ww0%~cBJ;fx-iLs?z+3?=#fRFT5~3d@pckkpJ&xr6hM7l+R`Pa=EFhJR*E z>S~l;f7B4BHE)}~_*Gb#6lj6+`+gn)rhqJZyMB6ly4rC^>H0lA8NZ`~S8~f%jc}Z( zuopXi-FbbHykZJv*ld_kDNR_e{|^c zCzn{A#wey0_R$W1SG|edl26;gRl+6iAgeO>Nr>ON@7VH>7W*hh__ejf^X;^L;S6hT zk#}!#3Aa*N36Q{RP;uZ>+TFrpFE_dzMFX9PXCQTh({!S<;r#Q zo@V&Tc~#ZKGAS+z+UO`lSstHz*UT5K4KSn+bv+Q(bNPuq7}104c*EKg2&VdJR_gn1 zGyeVhw(-~Fub0YH;Y*5n*?;8hS7GWoKHX+vIwNPh<7)D1?-{OJ+P(SWO*eg^wcp0% zjhW2H!(`O9$Y6x}KH<%&`0;mr(+&}nz+I=X{DWInB?swxm67_boc5!m4@cfs9&H&1 zqzMCen>jaM{n(ukaZ+4!)C-4`{-ogC!SJ;<3odKBEJ}y$aAF4X5ADI$3M&x4G20WF z%BoM(<4C<1XxxupSronrU?Kg&ZZEpe+b1;~uA$^GfBVs=tOrq$8gIHI!8;9?M?Unq zuUOkOLiaUqPI);qdt;nA-(xQy;;R7;_OpG-DHP88Mf}O58Re;SRd-QI)#(S8Que9m zMHotM}{Uv-2_(;&Ce1B zew#{4N{aiW7v?2x-@Gp%X>dbMS$|ENeWxZ&D@*wEOJ1IzHzA*ajm9$=BO@abpZD*- z3_IorBNu!wF}e88-E((yF$5;;QY~GLx~z}bde>-s?AIxMR>3Hotu&v=ZCCs8Q1#bT z&JLpoMXo!Ng{(a4f>}g9dZ&?fl*+(j+^HGOTxfYMnS{Xm&gP&O&rPtX-UL$J{}TLP zti5$WRcqQed_)OFT0mM#kQSr`q*FSjk#3Mq0VzR3S_A~?5H=u4i-3|!r+`W~(#>~m z&pa__&O9^c{pPRDX6<#aJFfg)u~f}M8hYLhYIO}hCrWuSazo9g*3@M3Zfr=+#2Cp0 zqhsFMDbLy03*lIivL}5)_>NOHrXlg{u#ai>C2<92C7e$4CMgD^bOURyoReX_DV>ZP zyH49}R9K&GM{mXD0dZ zYmA<1b2vTc zEz3BxYSOQ9Hrt%pl`{<+RldAzW~(Nce75y-KGh+y)OXFM-0#$@2Kr|Q8rxmLq6Gn4 zR3yKnq&p$s@{Euab3y#o%S$^+q5?kIXU^K8K}_Zzsm>qo>-OCQ;Uw@OY&>pf!g3#7 zt9-<;a_P8aPs_K->y3%Ok{E-KGwDRtLV zs0o#Rt_W0Re!gHIQal{$NT9Tl5YFJ%F*{RdTIfKxaO#R}OHs~mPD$CS)f1;}(=ga* zm8Q=6;+P}nsOtvBu%Z9qn+yVvrXC$M#j?l0oel@mXybmE@to6jIiflA@K!E|_ffmPIyN7qnJ4-tn^PL11<-T45RrOk!Xid8!PG@^YGQ@=N9@ z5Mih%KRyf87MWhgq4Q`ealCCQ?0_QRTQA%+AeoWylb_nb<)yYL%DBVs;hU%^wQy8K zCjMa8sJXv?(^s#RlC6YNA9@Z;m+TN|7dTPoV+^%4pN7WY+L*OHB{a(xRl`jt1FQ)9 z(j$}QuPp)$=9#b4x=S?P1m1JHXCrdtLbX=BC~tJ{9gE^p!Xe&4U0UXF-V%RJFf%Fo^Ff5z5UV>uZj@M%3Mq_IhcYo}%ApSk8*mpHxYk z1#v{m<*+O7@2m&KD7hQeM{OyM)5{BbL>Qxr_oy@%k4vt^z=?ewtI}dQIk@SC1{UvW zZ@oq1W`9plOHpoIlH9&DJJy*HW>edbjr{&Mdk`aoP1v5ro}+#>&9D;xk!hC2u<_;o z#FkNB0?lQlV%8-j+d0+rXK6;cFmd_n)vK4_xfjiRn>22PqM}q0C?oEPOlVqnQnjK@ zc+24Zv~qH2I(4goe!wW+^B1WO9>4V68++79Ytr(v&na^TtU7XbBZ(Gi+c3-T6iQ@> z=AS5u96r7W6aEX2IJZoqBkx*B(TqOgduYS0-_!5>{b#fIRPDpsZ2MiY0!~`Nc;|xb zQ4V`Pg*~jUZv=wfEkB7-QjC2Z=2h5!lwhgW<=`AK9qGIR9)Xh12Cz&JB(tt_F z;be60X#Ea;bga5m)oNVd!al0lmv>_sHsQQOsJq1?_L+Bzbr{FbZ4?@2DDLBTZ8FQP zRfl2K*#Be8cTeCBK8?^K7=) zeFSakc&LJN>S0Lc=d#xB(k|;HSGjFwO}S(GJeiiCIo^*AFC&aiw}nZ3W+l0vw@Ia} zQV~t2+gbUy$j)be5h&2<#Xrg7*0jszF+#j}Ke+fLvaxJBjG316U{RA~+FM(SXEnBu z0=*(i@3{%zN>M<0;1uEuyxz*2sg*ce38I;_lhOOu7A} z0RLRPnyQlg8(JPZl`{O>uxEJ8?--V7csx|w!*T*faS?&UvFn42?-X{UUt}KR$b7kv zBXeGw@x_DJ_i-tX^9<{5|6WL3&UWFh>LlL!HHNusWN3a3l6!job&I8%c_AMq_)T?} zwLZs16@89LtmwID$MsxNz!KszEm!i=Ero{A^l|z)bo3xUk=mB-=(JC& z8Wy>E9J?b{($)g0+1u`h)Z#xcj|ugCQorW8Idjb$`Ho+;2xsa3<5Ov~jiyPGcAxMn z!6}bI7en{(R_oQoU0gDJ!kHm@SQ48?gf;!wfbIci%3rwt-!nwOrg8b1xgq9o!*xkb$*@v5C(&gokwarOck4ryN|L-Gq_YdWxBYe5?A z9+$??S7^!ABNPiCEO&F9YDh|IFd>Y+d{pm5-{N9zJ}aH3QsBN_yKVGQ+qE!&Y1?L~ z@Q8G&zQ3K!+%zPrOR>j;P4JYO!~CeFHPe`X^BKD7s6rp7&}|Zhhp{E+dFGT3HZdIp z3$iDzlI62ldGcLZipF%&>SDzcGQT03KNiOG>c4_#W+cLwKpRvHE;fJ0ES$-m)DU6^ml{j+GM!- z)V*`OCg>tMml;a=!jsXiuA)K>**R3F)}`R?$!5zIvODc(!Hv6~Yc__dZU!a~p-N+G z~Xr!CK+&u?xowN%Z|o9ooMl@x0uXfova(NPHDz3<8@CwGLU zVJ?p@eKPyoKMd{&{}M_bY{`WtqgN+H_&J{6X7_zOQF)eZsH5|DxSHWeA z5Y3}XCZ$X|L(jm;rM!O7gBkEG=2oJ|qlQJ7i}&+b<8muBWn0E2o859BjV=bBq_qskSohd`p2w9#%#}gBsE#4&gQ^a=Mv8{oqZCQ;#Ya&vZ@o=tIr@&1;L)2AB zo-T)elRfZ>OIEy?F)%Q|$)m3r*=xs!fn`~YFw_cz1WuxV%kqj1NBq5MOe$_Ja z;=TiqSPK!fnWy(pc~PK4k&-#HHIOZvbCs4SzxO>#z@-4IZOnzA4WT_$31r>V!4_`e zQPx>M#9yoqDARHZP@8-nSz+A9KS3if>7dp^P2kh$IGy zc(IJ7E3yxr_H|{iaWl~Ek2MC48hHnYu@k@|6x2s81c3!7m?eBtfR|0PJ_t|zqN>p; zY4HeXY;!06YkoHj*?INlyclQ#+$&P{s^&liW@wZF^Xk`=SE*tf>v-D(QYm9>)%ayt z%yV<|xu3Vtr1vJdhv-LUCi~xCvy~JUBrbdQWIY;j*E7rIwsN?; zOzw8?l`71ONwSf1ZT;-z%iK5Cri20~TBC+~mV?)rf&O_9huZGbCcHD`H^3w76iwmz zfPCXRkIK3hHcWr+l>(VDFcd~A<32O`FDc}_!^J!fcw^SpOSjRHbpH4wJt4fVUil|J z?t60Bo$d>yN!KKdY!^hAYL=SsxhQ`(_M)kfc!lFWG3EH&Mc+4fkZt9q#tlXZ+g|kU z@mxim-iHzq-?Vl@(YWSui^9>HvPwB$%XNECFlXD()f*K)j%27HKtaV{oUGM#^94U; zvPplo?659IG+!mwyziL(`FNY1V)sMC8z_qIF+iz7lBweDM}95`#7YrQv6xPlveaaH zn&$OX5Y2BvIlX;~SBihn_7&ATq?BnZ#N#r#==)w0)&kOZZ-T!qa+HI?R?qYTtQYg? zJq%Q&M7d5vtbGrD9!MiWcPTA5YeVbHivlL>OKO^nWv3~LFFaZMkCtb0zPH*J=uQz{ z9WvBN5_Z)UHfe#@PMl-={q1<;n$weHT-?N~1BUG*EcQu>k2FzfW$rYY3>BM54wE_k zhEjps>sRM{>{h^khmBQx+6cO%g@^7B)@3fut{;8?E%BHJeC9xn--N%V;2IP zK@&3)FLl0st+uo?U7_OU|DCuwULah?W%+Vipjcj(qn)U&V~I&?5zxRQIvcFo5nF0M^qwQIObOvc$mh{M(SW7BsSLZ z$W2px={yJsh`eN?p`q%opv7S#Hqpi)8R-SX#R>BL~1-iq=QkKaXmVsIrPZV!Z^9%y+)W1%zI>}`sg*!$Acw=X~a9y;u zQZSlt+33&Qfq|oQ6^8PQDD-qdVq)(Szj~EBXjh$=>IUw=mXO&kB5Hpg-Jn7mW%zP* zZjjj|NNtX0GzoaKmrxJx_+~N`MuG?s{zikOT{Pn3wU;JRw#fuMY~`Q~gHR=JJPCv! zUyF>yWV!Kp)JPPMhBQzSFB3=2r_PSzvS&I6%MPAE^+=?B51$Z7l$SNPD4lElJYfn-@U`w(1 zt64Y;MZR3UxN#^$n7kMnfT++``ctwB`@OJDU3{ zXAbT2dwV^$RIT(zSa;1yBdSKylQ(i+qQjip>3Qf#DJGnhu}=>!^B)W_nEDtRnz(Oi z%%xY!EAkBF=Vx@AXTQ_e6QD2p;2V*Fpck$^iHKtb-Do6B zjX}#CuoNzva`!Ha@B9UaFT7Ooh%|IV0rwO*5sPQ%iH{XGS7Jq8kAdP)C8{WD;TNI^$52hr~z((DaU(_I*l@jGcR1{;_ zG>o7u{r%*-cjzY0j=VW{2C-ApUtS3;(!H&ecrujJ@6oCHP_b?GiU)Sz?}|(PC0IOd z^7npw1mb`L-BM5033!H({lbOr0HnZdA+LRwnskHboepmnk*Mt};=j`&+$aJH=bz8t zsU3~HmG60T!OFoQDcHeN@&?ztM9iO35%2X{LoQ`5A!PY?Dj4S8yZh5V6!ek|jm@Z0 zcfZq-WyAhFaa&Wqe(cmx?@M>|S|=RruV}&r%{3HykuefeShnDA^L9^w@=uzCB+PZXZq>x|n<=Z&GBioW{3r|Fb z3X5+%i6aIUspoKdZiZwLW{mH%lJ}@^9VV*o!>jvD$I8w6M@N%KTwh)aki%z`vXyBS zn&P_Q+vIcedb4S(!WT|{z88G)K6bZp7$&AYUffBqm#`+Pyt!Y25>Nw^PR8UGEbZHm zXc(DhYY@3h!)=*8>F52%dsDFDEh)3p@K6VTDL3DF1}ic3PlS10&!Y{jeOH6g06=8C{ka;BI|xhT zMu-?vKgdOH+adLXIAK_(GE5~5D|nB@;)}GcW0gxR4mpB)OaHy>Z(&%9iL6_qz~YUV ze3yZRJIW=67dhI{g87B`>Elkp+n%uN-!|Ou*vu7@z8zJ4M`k;P*>s93odL^+h|={X z1#Y_}#3+fWd82TI8ui`$ju<#=*B*OrCu0ZN3)|$d9FESZay7L*qq`D@kI4K@`=vD& zC^j28(Hp3+?To(>B( zEKIZJXnj6v^qc)!7P;pryhgSQNt!Mban9fBa5_e#f;hQ(V!gBrt&4KNcFx)Me@X8u zFGy!zrE?<&2k6I#am@GgL8I{4ZnQ|^nW2N{tzK;i*12{)hoIfE=I3g`1AeK{Q4VQG zJxvuP8?`CTJOZix5C4FFeR?x9Wd<`FiVQP^@Z~)vq=E7eavxGLWqK(;b2Wlf`uFcjK|cYLOW5s+7z2%nyVU4CRHtwE zx^vu|>J~Y6N~D~T;<>f3$UOZ6&m|NT?+vd9dvEW{oqWq@5&EfZr9XO^Bot%y{CqvG z;oQyQK`Z9@fmn(|Zr)E%QQOU$+Ov7!1_wLe&sP!63(H5TWF|pPZ+H2*kdw?fc7I?W zhGoS!>KFzaC*DRgNC$JGyP`>sj0P(sFt}xbw88@kLowpvtw}5c<^oT{r$m<;=2XVc z-yTQP5JqUp*1r{fp(r9Xf7)ErUuOPYWuc7r#id*{>6``Kro`q@!}wS`A?7uB;yIEZ&Yype?`;3DxL%Y1y z&JkDzbPPQ~%ai9@TuWSP>I%;kudf&j?3SqyX+{Wdt{yc!%C^@Dm0?MAdx_mSZ3talmVCS^Ir)AG&@KLC19~STM4L|EYOytTj7mZNo9=^RS{X7bld46NR zH%`!WMC25SA>yDs)o9CJd&|)IBU%qX{FWt1%*D`uv@)(K{D#OhGSxFeIu77E)Eaw= z0aM~wJh+?zOFmGjdAJ0^BtHE%>k}n73!mJth#ep(xi85?9~hKn5uz0-mm zPh`Y?w{Yudg_^G)il!6boddc?g49>wUL=h66j@x#Ww>c(t6Lhmv5`bQKpc`WPJN1V zS0s;RCDBz5Omgt#F|+O4?iT2eQy<}`l zFj#7!T=Twjl+#3<#CSm666Pc(-tnvx+@s9MGApJ#z`L25vr=ZiPD=UXy5?9e_>^54 z!wZVPA&l2o-<3PVJxrg{6aVBNulk|(!Z|J7WV3prdlBmkTGQu|K5}z5CZ5WAXIGA= z;yr=GDWPL-@$ZqYQe&yo(fare+SL7ygY^~?)TF_kWhH$k+L?#=f*dS}Gl1Jk<%M4A zl@^x99?cNRo_2cnm$@M-^&DTSyYtXTSEvf3u%C%c#|KQ;$JlaA_21|^+q>WI^BK|M z`!@m;K@wQSn?{S^(Qn?tr`4ZpCx#4>^h3Xf1Mj^?w*ysrv2-a7$|D|pI z<|N^&U5b znn4q$r8@7|DQ+fWH?b*%9!Oh1aN>F^aZJx3JCMR`_9Izyti~o!U8&7YSS0hd>c&vqX3^`3yUIJvya74J9aIXgZp&6QF_C*Kb_&-EANi*Ke^O)p@92q;={ z01isk4aP2Y*l|%czYS z&KH|7LOBi&@j0WmQ`ln-la{OY0$9yw=k;1U#HVVn3Fa4F#z_Rcf`qS^7*^aXw6>8* zwz}~wnw4xIPK{GY_*PtKXb?0jq4Y7>`4S&XQqGE4e%ER}qpG1raFW$@+OL*<82Rvd z9+j|zwX`dTS9mfS7n)|HR}+_uz}oFN7FUbnvRen+wD)(O)7 zs0tN+l-KO5ytIyyoiPOyVTX?v7nHt`Ms{9x?GniG@{72WKFW&4z#5zQNXw%BX546u z*N??L1OA=YpUf7Q(l`-Kh9BK1pDz^@45~d};m&QS8YzjB8ZcOUj$I&_+a*Id)G<@s z{p|hDVCB`XO*(G=XBM31xx{^v(t1{>X>ZF9Y9l1i*QLPY4|~vd+D?pUjuwz&#cU19 z*Z3OkgGY9f|8TJU9#BGk=cR$HY6h$XNHc7mPn*P>x<~dr9#!~kD zFW+A4njsmT5lfu2YNfaM0tlBI+}pZA=u-kmQfoil!mD{dd);S4h_Vm&p;S&srtTfD zPq3W2+hIsyB`;F6du(WHBn+0kV8^l{mYwW81X$&*C<}>_xrHJM0eYzEN@yc}3)vps z7dt*b#4m%$flnG-;4qV+$uA^`=2KFs+ZS6y7&RwPU*eH{Ej7Nz_4}fm{;fq5BB+0J z@Vg*v@8>)1zK=Gx@8I4s+iUqi$OMtxs=E>tOI8am#+{5Bnl(PM99qZ4Du$p~JTL!) zWQCO4#Y%#sh|{se&%7CUkx{)~&ErN-o-9X=nS~5`&(ZQ^r;Bh4u*Efqb3)3qSJQbI z`Pm#-^Un2xh3q#g*RFF%j0dWazxb?8ZC5H3P{5>mVGhxF>d?p@y5Yg&X$~+v(P`UC z_UZFD$-LKLKLST_+Y0M4xJ-hMxg$-`pjx81_Lv{cf5R3=bEIpGo8kvvK#Hn&5~E3w zJN+{WF=&#>Sm7=U0TZo$8M-IX{)y8&DO(Mg{m9#)Ud_wFPXd&^nb*-`=pt*W9_#kB zZzG)F^{WBw2oheS3P%(v&1BMy_$55rD?8EcXXmwAM^qjLkjiM@kQDU1}PjCir0sf?gjb0n?)C{nxo7?`_UVrRn>V^~b&~;ghP> z#gD4ww}gp6pjs+OU*JF4&NEvqpx~_9j)}D6%4!*v_~AHilO@8Px}<1L^cD%L=M*A% z)?P9?D3IBscZXd+wy$R%6M?vP6f*=ZMhvc*-4O-TJaz*`5d}0)xos>C=^ydVr`;zF;c&Q<;AZz8ZB;8O4#9TcQFk1DA2 z`G2{YvPi*Ks3>49zCFgAbBe`~YWS6%Z<$Xp>Lv%b0CiAwBO_AjzCTShb9U@5-Vi!! zjTZvOMQOahq9rr1)I~ov#rSq|?Uu!p3ew!sm@rE(XE~|ron$;lo>bvGmPh?kBon@j zP6zoTOn%Ir7-T^W$sh{u!GS+X(mx}&H#*P0L8AFO)7NY=tTYKUuGR`))S(?soZG9mP zKND;_q<>;A*Y~-zF&LtzJitA?Yf-tN2M(P2cHM3Pk-=49zHw zK24C)YiiWih{#d%&^f12a;e!}?+~T>Om)%`$7$+ul`K;{PrJQReMnv2RZH!LQ)R=v zgjLgtdV#%7yG!B7@l70@uBzxMmY`?{`FWk{dimP(Wqd2J|7D6@PN~OU%`t~zB@%~i za)J_W4+~C91eCd~{@#-$@O#9iSXL_mi2#FSPXA8*CN9~BhaG91@?ekE*ITiYgX;S3 zILkJNj5irfRT5!?>T=|BPTIoXl(m(i=RNsqj}%;}I68Sf?H3_NQxKZ-^T1wblc9N)HODLwETIauN|BzMUW+bdoe$6IKI0rEl|OAOQlRQ`u@Ko? zx^}IuX%iFti#}7aA@j;8dVnk*7}N;VurI^XRW>fz&jSi(KAAj2%9%!cg`KPIv+rd%+FCcc>LkvurigShPvAR%?y0ln5R+*MkLbyu z?|1zA!8R|+$G--e+=vOR$xiE&nSRS;8IPurvEmo!BXG4xA)py=-)khi9 zb@o-8Sz9XXovf5*@1F}lSGd2ew;8%1KQdtu`}NaP43mouzak85n$ME)$QCdD4-}7- z(@m)V)>>J3=$_`_ybewOYD_8P5Px>e(+dsE|kPBf8L4` zjv-RG%zM;hd6nS!IA>!W>i5Gl@0ll;)D+f|B5@HA>_(KG?1>M!@I_Bwbaq@VKxoR@ z$~-f+UC4Z8_{@GS_RO2Nqka2W6(_G`#05HJktNDcZYf#`*qHTBn+%y>L&r(H$ScC) z-E_hXNia(0T<2*EogYNJU7b@ClE8}UCHMynp14Q~z*<=Cp)PXa|4KKRUTY1J=(~w7 zZMN7~(p=glT}>ggs@V83bvgTN6cxaAFDvh!%gyna9YIlVEYpLc>ls7+_9`RS7O`?w zl(8#V@2^D1B6CVsSS?;G-6WN)X?3Z^jvyif91c4z#Q4X1u-i|v7&pX4y!38j9i#+ITq~6G>Y} zOmou95F!E?r&i*}_{a$=)h`O+V?b+1%Ba2PuZ3Q)sJL$cwm}fg;R(H+7 zuYM7FgwS4|(|n_(q<}ggyJ)kqPnGD$CDztkQD^>4BTqhTC|SgO=kei&N6sJ#;`Q9u zp$dbCEuyKdAt`sIs`on~&`1_}m2;=nP35OC)CA01b*$Bz>2sb>F37JGMG=I5db#94 zfR1LmAabUCv@!j%JE!@iL1#;Z9rIG=tfJ=g|MC|8IoxmoK|My|V+i0y!-uo^(0>A6 z{PJ5=2+1n5)LLfxx1XVRTcRm}_b1LNlyJ7GLv>XB@=ugkmdV>zb;mN>zR>xZ z`3u{Nm_nQ<-E1U&UKSi}PZXPC?ZSD7<=&={ismI)>QzfDE3NXBPDhP4$fI%37s z`X5t#jFbkz6|F2w&m#o$L|%}alda{;IyPnR-sI1hktEH=^B|jIkSR`fr6RH~6b*&9 z9VM&K{p(mgWP~^0Z#Ro?H#4}k?;NxiBPw53$Q3TUMdJtfn-}LxQjb?-54t`snd;%v zBH9BkiXP)$1s%M|Sq}aLFF9xZW8Ik!dHN|m3%US;?s!7x?7{*b~`sY5Wi}?%% z^82nyGI=QN8~lNX%iXk}d$rt=^510~$Wl#5wol0X)N>7eCXUl-agk4LrjIXm zm>nlOveqmOzo_UkWpZBn%#T_dK!XvA1e{Ke%4k$$q=RqvoBZQca{IO)njgh^c)tdq zU9M%kt$%l~q9pl;=(*qO(a9HyZeFMKe_m!?@8l8#Fjxl2>J2Uhp}b6!sez zzyR&P1?vwZ)%4#yO{mTYvH!ln)`}*wLK<9*OO~ZEX zxj3(M9`psh{AuznmE636JV{wSt;sPu{U+}!m-?UA3as;4b601%r$*WwPX{*q|s z8`P8(lD3=q$$0Xp2x=45eLDSCG^Hyfh&zC#YC0|Vt(=R>a`_WXC#xs3IFd7i^yW{}(FR6pMRw2%4C@#Nt9%-ukf}6bSCG9Gt!ZV&#w`r1 zQFC~s9+&LoO=;;EH6#ulj8{i@Vj{jX(yclo!&$V!G~Gx*42iV<$@iAk;Qm`o>YR&B z?r}Slhc6@7_+PpOU^h&Itx%cXB=vyp?iqbE^;+atGGqcy>)F}2q~nKzhR;cz@89lo z8Fvn^9+uzcFUqmE3cZtFu3%zb8IsQe?`sSnzrXjay_aCZY^RNq#LGJGs_;pc0`{|; zpt7vbBdXz*W=;{=yY9niYB~9fszch4z=R+h-*ETet9liEb3JFPyZ?OmrWL<~s|j%_ zKd7MD-&&2Cfr+(shypuBL0h5Uz}7{cac4QMIx_ns42!=uOe7Sbg95Wb&VPX`w3q$> z@txJ*oS7kfE7Wi6i2^yLT%B?rYQBCy^_qF>lPGzmSPxzC@t+E0kXNbxv1<3lEq0#H zoyB*N`&Uf`SZkE1G^?BJ^*O{OW(uTC^TXv`rPa1P;x&DU$ez328Bodp#_42)OjnA% zIDO>md?jp_{}%Jl`J-1NgC&bsa*HeNMq(~klw z^&RzYP=%3{8-7VmPktVI-&?YDmZ!$qje!YQM7DVQB>T~!-cLBo6qv( zPfu*v=6)q9ky%BAeoB9qSx*57P+aU<~wq&n5>ULsapB5q2JAib=atRg)Du}Q>4hU%q%sPvTIV{ z5^)&3v_s}52`=Be7L3e@i6pj|0D5kY62O%-{w1rftlY-VcsBaY@h1(XfQfjk;N#az zv^;zd-i3ue8^!&J{mt?y3@eI=?T0Zf&$W>Y0BAJc6>5v)EH|5eX|LI2?Y9|ml&JM4 zaXcq&UP*eBfvEJwz-gAF${=_M&+?=`njNI*2k3FxN?k;HQGLQGP|2wNP|7}Er~g(- z=>yu`B2Z|BsOCpcKPu#PZ9b|q8?kV}T+iUkG+v|8*+O*nV9;RV*LkoaQ#3D>@ePpz zWe&#&-n#-(N9Et3Q%nem9&V_XLIF~RfMl=!S7Aa#^V*RlzZZaATGNwX0qiQ#kEaEc z%XxRQhXT<&IWEcX+#2$us8sDz{BKpCfQGGg+|f@r(J3X*|>00*tMB8 zc88?wjZ8!Iy752ZLGEc(z2aVUd+V7W$?eztJ5wiRSI^v5VuQGb3rVfPY9J1~lV(mA zI*d8ec*jnz({ti%*?g(aLflyi?B>MNHI4qtv+Lm+w3Bqr*UAF_aRt|Yi%=^;<^tTm zg^~T+FHwW)LR_sRY3o?$rxHbppza)U=XGyIbGIR$YsVwY((`qH1ku)NHfzp0x7%#|CLh(^l~A_xDp%?A*uX63Uq2YxP(GIINsz8W%UGxng!@zLh+ zr<{dUoX*yO3jOa>rVghWOiwK)TZU{YTJ9CcOC;o;_f{ zYmj*Ig%@g;6;*4LVECReu2rOnHM}483swIb$=4zbGuRt?66Or{6Z~XS{x1JTM01>& z6bDvo2nNw)P&^*lT}G2jY9Ne^`h;Wv$qk~B;z0fHi%*$<3~nYzZY;$igkVRMGU6NS z(&LtXCP`WL3Rdg8WRW+XUfRdr`LA98BvyCS3Lw84fdruOBM3(rwJ=RCrv3l+!v6=3 z@Jl&fpQJlJKeZ>B9H+HuJk8^*Ez!#TbTfPIg!?-$QJ)|Uk3{8WR2~Q(j|;SVABKqbT+S>I`ak0o+fAPk>7I7JUJ`u~mJ{WEZHsq-#G$O;8h=`<8}gfxx6hue)a)*P+z=X1IamMR4vMI>WDhXdtFyu~AUyJ;j-3_z&lYCsoN10&CW1!OV-h!Y# z6e4iC*K~4+jw#E2Y)ui1w`gI$~)ZL^ertdoI*mZP|2&Rp`o!Uv=bN} z9{x>ucXwA+O^s_ssIuhePbU%*5DA3r} zA#*i@neb*5U|w9|Wosy~6yidONG{lgZn_Uum@qB9eEi#=k?9t37!!Xs061->euX9h zZi#srehyGtpg96oL{WH!7>DI{S0vp;|*RHW-sQH%(*$?f23T z8MHJZQKIe4emue+AU zdE8Ck^<&~*Je)q;tDZv_IS{=eRasdX*66vsl~`RbIdwE#-C6Is9cU-yzcJO|JaC`4 zL&GE>4$AYY1~`z3LR)i|#DxC>nMCyAf@JG@^A$QBsjD2v{8XOj%)>HuB;| z6>x0HsMw*!0Mr`USz*``!E!3ze~)7)gW{y{VOp;-9}hR%R^Tn|p<5#K60xemiZWd)#~(I76MZxDJ`bkzS4f1o3G1Mf*<8klok%K_EWQj2zl>jB`zN;}mua}a zG0kpx@`E_2k96v1l>4PEQ5GW1H{RcaiDygCmP<`nhD)3$T}SA1o+$86R#-%zY>S*% ze1+aMTZm|11-81?@^6(;=vT9l?mbIObr@D`*ktgg>Fnsor$QeOw6|-ZkXvCjYy9Ka0O2PiMIGV6Z-c= z+rQ6Bjj5PkcERPH@_5LB8r+G>L_n9Ao=%A@m77h}wZAN-^bUG%;D<(e4dIcT9Ymk= z#Sk_AeCBXzK!;9IRrCk@*$?(W71A?Uq*@}IK9iu^D#k2o!#c7QWiAkt4MQ>G#f#@a z1`6usd{7)U2Q@!$)FB<&K4BLEn9QWu+W(gY@Xx{xL35}y^^tIboF>M3bMVQ+j{6;% z^bv!HDaZ;!9R8|^SiXjz%43I5c*UhN{OIUtmYYe)=WH9QRSFNfEre;|1njp)yAv$Z zJ*9qe3g?IFrqi{`7J-r{iu^x!29(hE2V%+8UmU>3YeCmN-8rZoyvH#op`np*6@n&T z{Prr0216s*Y8m-NZU;(?-?y*@rugkcE*DJc`L8;kK7E?| z9!iynWHL_wGoQo$q(-{6wKb9rULhfh$nwpq>W2FUyMfWBB=r4@F-iPyW~i0mqHB+hgCvUhMWe(BDh=zAuV9o?c%Q4UZ1UaM)> zSr`4C)2f;p{x8h@SSSg!Pz~QyS2DO?b>t^vn{GHVI5^LpdXGmH5zw5uWQ%UGp22}L z`{|=pP*Zk-oR7J}8@rW7EZu7Rgq6}J;TSR(iC_>$0yd;^adBIz(YSqucX1Dp`0nE z6)$*^1C5lty!_@Xz7jp)b{k3gJu6jjikG7miA( zTq7S45fBhGFX;B@icD|$k-eq-X5P4?%ez9fo4-{3E0R?Tb+e-bV`Dn8K%-274ez8} zcTg!zBp55btv?yC`;bgY;t2Hr!xyf}^OTQQ7I zbR_cVP@zG!+0lv=PffM#X%yflW*wTM`jP;eAUbs7jXo-@BC_n zA--FvklnJ9|21xdk-lm{(#MaWms-j74&_?u5p|H?ihF}^2ivb+7>ygh*3WbTeKC-w z?ea};8%$dBV0k+ii7zj@UpewC>VMNX&54p9p%H1&9qw23`pk^9gMON+J|t7*MMD+-F61TCPrNq2MIc{{Y={tAjxjHI(Z! zpE$J6V-aWzmcv8pK%_oz=4R5m3!M+Q7W~f^n$e$>`phvmpNLrTc9+FM$TM{0xCE*W zTT)g7AM0PjCg;QLcFlQog-xgY2|LXy6HcEgHS}L4i>!TpPMb$VV))AJLYsXNM!tH% zYssY`p~in z+O&hK3%PGo$sn;+%Ik;hmpBP}spKr1G^j)UQd97}*Jxi&i>zF0XX5!)Q?7d~Cm})d zYam?;D=oO0)b^LW3VsAzXJLQjH!(hL?cLsrexc8z;b^Q%Rb&;4Sn>(15%zy8DbFq{ zVvfQ7EPT9L&ox?U6<5FCut9j*F9+xmZwIlVkBacgcbo`6l~-ATjp2o}n{`dcbNweU3=*(G`FD9Pq$P1-w}!k7<7-w01)oS1Qf3ZD83!{Z_5(RE zY5G6l4i^LKnzyxx$|KFW2iHHm02834*n`jV78o(|nBv+MV3Z{j;-Ir4%{eP$3>!_P zIrroHs~otKl$3?_nZEH-FXjHEdxp9bwEPKwj}@C7QNxun}h9iN`#<3Jqvr(vMuj zSjoVWvo#%c_;+k^a1kud$;rs#khTU=yLEhgMmAUC;9I2U^RrFXnA+)LnuxyigI+um z7lU+=YHDCZFHUB2+k zqh+loCc~rtq(J^2MS-PkBf5|jVm8p~Q`_nL?LR+JvEv9I7~QyqY^mlv^Nbhm_1#5A zY$j$s|I=M(XsL)!dz;0eo)4GMrAh9J{}zKk4LD7g?MrC+^Ob*stW)8!9lep3me%Yu zBIv#z57way{|{*F*I`3?rGeU{rTZyj4SZcREhfpA1wdY^R%Pq|iSNJzz8dE$jZg+1 zv_xYrsM8jNwE96{-D0ob4D{JfoxVEkDiGf4a!ugblc@R48LD$_5KrXoXwb&F+-ujm zX7;&!c3qud$z=*wG3oGwId=bB;9C%<;lD;8R1#Tri6SpKfoNmcNOLm|0d(i8^5`70 zf}NpLB8Cg;-+(Dc084twNFe$dS!Qz)BUKUU;ASaY;)if*%hemU0!aKeoD*;MDzF&W zTCXF)QFlM@Cj^e6}^w&XLq4fBqO5VzDp zl8|J}g$N`G>5+?dfzbG6XM@9PjE&RI`i3F%znwV<>jw?ZQNLqBKIO4-^$>g(v*9$G z0Y(smdbO?u`4m@A3^q)73X9RG9XC9ss7u@t>HV2rXt6IsK1H80unUQ0U|x6Ah0b*s z5Y5Zjq~?xcSjPRc26JGQD=Usy5)#1>Wirkg&LEo@8SKWdKnO!zo32KQAI6A}N!Shs z%z~Ch`w{fq2oiit(@zV}h$=Jcp@2!O4?dOovjHw3zRNgl`arwFZ4~d9lqNx++IIQ4 ze?{V*AQDeWJTfx~%{JKHd@6uu8+v-tY7mKk_l`0Tst%l-gcRY%nIB#+N&k2w-0Hru z1Gn~jmd!!l6`H%XWVSTSoZPtig&<7qYXM;_CCuFM398LHlB)AGCA=U|*n&q?T!Spm zNqc~|gavjJd;r4mW0r=!Yr_t;9W!`h!_=6$DuM_juQEzgXMc$h3&y~BVZ zB;U{UKZVoFzwSlKhV;nsHs4j6z$BPL(nBJUp@U0z*-HLw2siCwv^?>td~1J= zMnatXb4$u#K2Ic{AIAeCCC+a(ps5Xy{Xo;<+X)fG%$Jo`%q%cLAl6;_m5rcLBj0cg zj=w_g&Cskz4+)p(yf2%uhEL6;&~N3ipY13ilbT2rsP^uqBmi6h{YiZDecJw4oIL^J z>{+T-ONDG-}ox_z=o|lN~DqL4AC) zJvP}^v<*4Z@F#Fl#eCyAJnS!AgnI>CtKeQ_BZdD6RNB0E1<&v%>`_X7*%!B8+g`Qb z*ZV`<76Xf(mkC*$#Cb8VMj3fzfw8Ia$W*12fB`*5I5M&3_WPa4!jGa zI(b*i36EuJNz;r#iU@L%H*c=O1Unt`hW>O27-aWG=o_iuYlVi z2fo8Omxv$K2mGow`V4%(adh;8;{Q$yIC!Abv@K@4S=ZQT4rxs`W~ZMq|9B(NoVSM@ zWE9-u&~|+aqU--CP5(VDZs)?E6*&I2?gal(zW*=kl>XS@0MtyNy>>b-=!%vU%?|+j z;0mj)Ya`QOax4@uzXRuCWZZIR5rQ-8=gJM<1U2*IOT1trR9uKJkqn3(!0CRl=%kQ` z5~s|aNGoa|xb-{NTvQH>rR?>w5?A<_n%VSD$Z?mjEvw{sl7N&UiV`o4sNo7`gqRFc zcg;$)UM2#iR#wLNtFu?GU6cX!w;e^NluC#r!{18 z=xb;s9x~x)#~I?!20gy**AdO3Oj7LVf9WDJk*4(DDKoMqyPml&!$x~|9h?Izo-BIx(NYx$Vd=*Eu?HAtBpy4Lhg{; z@_|#)94m%r{vY@bPEsZI_iZ$jAgD+dT3AI~W;7`Y~vE7VATIEDa^K^xi9=|dK->i^RZ zV2p~m17{SIAohev14`(qQB;FNLd$cY+_pFevu=2C9COKI^^KRaV0s`iH>x zze8F4TbBaiR3wCxfSPCpIG)xt2lRd={BY=z-3+5dmmEbHI65Wr@; zbsdC)Q=ewImJqo8;d>b_edEB>VH3A_jQ$szBK(z;0>mE+Z#i2S%|nX9=*tl*mkYx( z6br0WeSKmXgd9Q7%Aj6}Xbcx7Pha@I^#X(r&bq$}1*3eT()DSqs&F>zBx&F;~m;r%&%RzJK%Psrt7qc~Gs_kqZdkUnP^E-D(+z z>x~wIIl67|(@+^4dPRkUEh<3L%h$2|;K+{HjxV`Sxo?B9)q1hoT7-}Vl|Rll>40HS zmQTG%MKs%B-*DOeryl~Ie7CjeZWrkF6Pu7Q`2D*j9W%fPRP62T_gGDZr7NhZsZ)Lv zN_P||CnuGRjkCK~_RIIz-KXn&^R+_j-+o(d0%4X)^r0OB_%oD`WTgcU`9}^B_!R%H zQ*Ma;ZTcKP;C_CWjdJ~`9|3;#h4m2&83hHQ|7&e}=_k+>EFHnpii$MQebd&?ZpfQQ z+;lgeTBAaboMHdXsm;@;BFYY10_4`r z9T~B6gqjpVxCRFzLdPHJPtX}CzgxH-40z=`hVG<4n%p?Y+k?@x`<*Zai-}e_%i^DX z1Y(Wk>QqT7>W}i|>ab}BNh>OAPD#t7h!->7MT)9U|wEA ziKLGW4JV*cp_6%zl~EqG_AMrgUZdaqNI`p@2hKTv=`a6pJUk)|poMzNKSY}y3@Nn% zeN}04-i>QT#m6=nNlw%lg_{T#wY{{W$=>Ric9P0_fFp`jt2<~pCfp=en&o{!^| zcH~grsS@WOU=Lyp|Gcr{Kb}2*+N;jW8-LeaT7{qn05c@(Z$F{mp=C<#=A%}VE{X^( zPZY}e^~OBp8l@s8ekc!sGO2cby?uQsRnjAmoEeT^b=RG@GW?%jirYyd1;LouA4~P~ zCLS`8%vET0b$81Rj}hybD7iI)mVVt_!QVpZ@++d)?+`KyuiM~8*|HKk+hFrsJZSnq znG7Ku$*{i2_h%;>JBj#=jdt%|oofzEY9$5-8&_?Oay^tB)y&RLU~LlPe4gjiT3LDC za!~ZGJG6z$c>EvMKirW9+j?=SG7jDHL~z3u6Ye1<@-vKe(l&725sreqL8H^sL-WMCAR_(3 z-~W5EjZw@t{@jo~pvG=n?J4v~;)OXa<7p;v@e@Qu~Sq-2D+RIDlxB{+qBVT9`IrUshqxTUY`B z$!-uSkIB%5{T-^W@0b%oYkSwuA%eH>aI)GN)XZ#blstB(Z}X0x8ve2`YzpBYcm~)U z(zla02^$yUpekX)cpT-?2ReGP((=S-1Mbh48=u>Jk&N2u_9wb<7d_UOu;01|I?*#G zh%x+Hbp(Lm9`j>va@h)tAgLmZ(%IL9FRrOauX7x8E8J#lY3(84@&ZI}KJeXWjlJ63 z>!QbEGM`ia_GPcW{tj|~+5c9*jtqYru=R%DZtsMnKO!E^TTnmNlL+#R_*Ox5#C(od0SBmaI5%w3J;P*S} z{xFokZl*-BgjAy6I(}n_;ij3z`1uo8I;80WB273uN1wPzF&omv$&DqAf)Rs;MWK(* zQ_VSBk>j)Qs^awL@rRlk(Zk8z`>wn8yCoWWdbx)c+T4uD7)nwQ!o)x*6Wzc3QK|u7 z6F$lXpDX)!ul)N*#Ub8-&Ybq*@Bi~}{!X4DI-A`($@Wj3zw$(gPYqH4@30AdIr!JU z1bE9R7nE4g4a>vx&cFP+|M19CjNlytG6PTk%?SPHTh>V+n3Nw;n?zFrp&wti^R!(O zJ_mqwZA?`kjp)(FL`m6PigHn{#0X!Y;xp)GSpptE`N6p-!Fks%k8aEh-+IgJkcs>$)t$=RUWNh1DY`+$l^?ES& z7qpTwrwh(;ylV40?(5`(d(R?1b36LaOs+aDVL~hIuQ#VoM)PGSTBI)AhX5>QrBZ(L zMb9)00GKS7fMURm;jHfGJ-nh1@*uM5b};w>Xm90R6SzHK?jl5E!5t2QVmFtEW#ZM{ zgep2lR*flHbU`E-0_F{?nO4-wRiDe}Qne2}^$2Zx9E54UPB=6jd}`cz45g?tOs&e+ zj43!&AP&0BN%6nw`B*c~S1DG7I^9sNZ33v}roGL#6~!W6L8v5TKj+^0c?YQEJ%UWW zPWBO;**ux4%jpdFc;P0hMnLw+A+cEfQ&>|9lAwIz@|@=a4z1sw^clFnWqe;%Y`+`! zaibWEe?-jHxa4ahy=FQ~Y2GYd^xYetRHP`RurZmK&4Pb7YIbJm$GusE#_H!zu9Hi|J_rQfu5z-;I z!fr0tBnN~=dx*6Qt;O8!i;xTmbZR`{h?l;oU&v$*3=D54+Zor8yb`Dj;Haaw;b+vR z+q(eh3EDI@k-88p-RhMsM7{a}Yqr47tDP~2IsenSE-e+V+v=h_wN`iotIV2dQm;K? zYEHqo^{jrdO=*~G7L4(7ZWs(?kmnntMl@WZ#lXp15^GtmolHrpFRjnngX-~ZCLIPn z6`B@z8WwN?6C%^DA)P6V##1}v^7z`}evs8p>gxX6mcwDuvN@?W`b+b+>*EShT(zbh zsiib-hSAcwWweXs7@p2Sp0=z@qMNJT0Bup&v~39vpao4Avw28&1e5k^=Ae~&m(>-` z2g+xF7u2GT)) zW67XVpw|qGJk$3-{?heQh1;O+ZtpPL8`IGR3!kR zOIEhwL8d;;$aW-6A_x79+7`iPo$;bS^;1vPbEK$X_J2;EjHSN?7&>Y@+ zRo7*eHgd6FvqrQ?KCJQ$!5aQ-ySZkDI7K`+1TWIVhLlbp z?lQ=weNo+rvgsfv{>|9axN)a`M$53lwCy>&(vSCO`bgTw`?~tD zFsCtj(%bh}Ju%#(rT9HmMh5RM`0*RLe|faw@7TQGtI{&*gv6+HvEvw!xh)<;iGK;M zX?j1f?+Y$NWA*(x+3~R&dYu5?X^0#5EA8Viz>jN4pg~CP&$($wXy`O?vHsZgAOJ^G znJ32eLC$@r#dfCP!k=^CEa;?oRzBuzGi{HC*;bo@_~Tc~nSmai;B?8ptge0HJ@Puc za^7dF=w$nQl15IvR+If z1x!{KD8I-ot`c)bHGlH#{Rb=kXYcAnpda135jRO_U!>N{Jv>U(J;{YWW(wRPB<-?d zKO3l&#+0FwYg}EvwVn1dO+`{+PcjULk*u-l;s)Aid!VMN^Y-e7`I0557kzI$?BjkX z7LTerw~xf=R&3r3xEX~k$7i!FSsIq>e?ne-U&DFpNKj&{yyoSbm%S)BINL;PygaT) z&spKBUC1;3J9Xd29Z%wj$i9eld{?CP7l`8cYT~;SpCiLI-@oky;JV^un@AdTkqOy& zI>D{-rk@{|5@m58;KeRGpSPUix2ro%(}2A)U>o!93z1+Rk_6Zm2m|2Jud+D8h|RkeiD{#2F2F4A&pOGz`9|{CCZG4}Xrkn!(`t&| zdyF#jrYEwWIV#a@!7RhK;) zOZ~K=WqML!eP9B&NX_SbHLawcC7!o*lq6#ohb!H#B~G9m($;(~LbH znBStKs9e8SEFhy%YtY0cDFls$@~8vrlWhAkClizM6Z?779Q1LO;WWM_-_@Y$^-H;J z_#FcU?{BZKjE2;o`p^92_;Er9cH8)M3#fzWbnc^0^XmZc`}1n;yT?l-%q!Ye#^rNQ zt&ODv$)MTHQCynV4wW6Z*s+c-cH3z8gcnPSaD3PEBZ`-J33hp}3X{~8sYAaSSZ60s zA^R5j5Bh*><}@H^@p>>>J)?UK54N!kzvCgJnDfeEpFs|2Ct|vE?`9uILn43DO>x6y z=`3Vl2ZW#n7)Oi0QQ7YxBG>of2=XFIHV+ZL+Mi_N3&Vt!-d3&}dy$%e+^)4v|M*qF^|=b4{Uq;T zA16T(TOphL7hm=}%M%CJTiH-^yieo6eR{ENS@%Vkqesel16Yywn2c}ORrq8dJa#C_ z_P7JH{?I)yRV02oTSHuvoLq2lU(M>dn|gAAO$j|>?KL|}`br?%@mM`&<7&G+D0|i)Ba&)gMrf^OtKb24IU7P=)`LyJ1mxZy`MFX_AK%ayAleAwf=Ogd9J zQml3=r&jA`lqa*lq9bq4!};IUPFZE96gaWgQ9cK?9BCv5mMiMr%$8OWSuV;$p=$I;QA_NOLY16z>H@+d^Lxa+lSg)Bj~-Fx zLosD9-rt2w%tTEcYR%Eei!g7RC=+9*74%mVXZID2NZvGCc!QjW{kE>D4jM_DfKncL z6W0P${W++q;X22#k?ga>*|=)6iI}z*z8BCCBE{jM(n3Cwl|};ado65Y@Jacb`8ExY z#)MuIL0{F)kfC$SFUdUZGjSPUruMZGtzuNL1qOk~9*jA4tNuWIwyYU4e*%y03{xY`1< z#6f^I5O%_~wdbH9`i&vT3f(NXYQ>)2ie$qp?bFsFje=py=@r$p6<5osHS@#VfPHz} zy>ANw4kavW1bgC=&-cYIV@ptlOMp>YzaW^gokAt#` z$IXD%NTd9vZs27%Qj#ZJDD!PEgRd z`|50e){Z9s^l~Cd(0NEq>k&MVF-?#wc)CC~*1sp8Q@N@{^Jz>sHtwS^9Oyd!0mGf* zchWPz2U)BWXp;@mY8+i+yp(gB_S#hPK)lNSz_n9`O9y zo9m0T*2s>p=J_iYRG(N|LRu~l25m{qjYZ_gdznj(IGY#j!JSNHNi6p0%FHknLG{?+ z#vZPQS@Vbi2D!HTa!4;A&U~R}2F;0j-|z2ZUb}3k+Lk8;tUYO3(T2%-YI^2ULmM${ z^qm0pI(i5B4m@3%Q@9k}gcvIh!>7X$;U{evW~H^Qz{P=^t0j9Hghta37E1m35dTQw ztpiZmno&GvQ@r+=k3-fZ%1nWZknuQqgaLzs*D{IDCT^lp-=-Nq*~DupOa&MT1F(yI zF_1T)}AThMk({H`EflYW=_8Hc8# zc{!n=500Ps1xH)yX2=Za@logtq~z>gNLiy#RvXUupqG!$w1_oPAC@+3Ji0{G4tA02 z(NwmlRyl6pZt8h%+XjY`98$Mk-)Nlm-jalVvA2oRoNBu=r&79!OY6wD34OB0FlWPU zJxW$Rd6`#lzy!p)ON7tw$|avQMfd!wSL)g|me0$X8{ZGcwVJ9>zIq@nrcjy2DeNXV z_#AVm{_T66-Ffa2-(|G3!B1EcXqVQ_Axj*2*c1Y6?$Mkhv3+9Qm}s5GX(Eai%)T9Pvgshr^80xE`4m5>fJpXS-1Kt$^NkAtSRHF&eb3cXQyzZ{uPUq z5Bsys|L6r^@p}66#8I!2G~@}i@*Qw}%W8pHD8|NG<$-xSQ*<05F~IjHwKY};Hj<)W zDL}=OoP!_E$88kZBqk1v+*cZpJ>$XC?uap?(&JE52pe!B!scmjjSgGg5DrbsfN^oQ zBp@xPf{2JG3fAA=u0mO_sR5UgC1u_lSK*oCQ&ldGm?P|F^cRSRVR|S<*BsO~552Vn zVf=^(r+6-sG<5TUPpf0&QaF1PdMswcvgxIicoXxL1a)Yo_T@**l){Ogd*6(b?h5nP?g3jq;T-wfOOHi@ z9EB>H)%Xo<4_y$2o!$+NuT>+MOD3I0<{TU)Is3-e6N@uT)Xf;l%F9IGg~mS3RkFT2 zOxS)CBgKh#`fX~b&etET0G7>@R}6WT`RqpAvruHa7`N9@PNfGP;3mdL#*|@t6#2gT|^qu=eGR z4=S=wg7>z-HWnHexf}(7U}C)8i1?{$XSn&t7VLP118|~b`7rRuB{vBa62Jw_!3^h~ zZds481Wqh|HFV_qEHI%uuoU(hi}{@P=B#hh@a4ys^ShJ2DawD-2$(n3Ly&l$OvZ?9 z5%}*LDXhy|Q9^K2d~_{c{uY*H)aGV0YYy%Yic9%y0LRe1k=_8#vjf|7?QRAIc;9%* zDV4&MFVbL!_$Dv4vKJ%dAv<)MgAQ-)+t!)pfkF^b)kjsS58^Lv{EQ(#?28C$#>+aT zJZhll%uuD9DJch2B!eSmr=D7`&t8X7bHo87Ym7VRNvgs4DEnlE8${+iB4}cMh%4JN zUYw`Dpv$;(!@&ZfRV(a|MO zXI+cl)lc8SpmG&W7D?VX8*xdUTdw+!n?Biu*HJyE%mQz;0XeWUN;8ljQwqLi%yT*?viu3)V;JaMOTOv=!t70rY} zqYQ%VQ)`N-ZxWCdSE}&b)XQ1h?X?pu41S*y6dvfv|A(Fv7W1P}E-;P8XLEEW);Z+d zGW`w&J<_>3bA zL|~R`o%iJzLzqQrTV_FswKguYRh|SD!LE2|pK3ATFykLKYr+hHGw+_GQ8`H+2+z7r zs8MfRWc2YqLyT)_hvwJF(SA9F;-}z=9w6OtKR3y_vZSRni0KS}as27%<%$-3RE7Hp zT|y1wSV;3)4_Eq|`I~0t2pJ8`thz7Z1!kN}%bLV&jJ9lg6w$%Tx4$rLFx!f0E1}6R zew@}M)-&;b;v+4#63c&wI%mR%EeMDvEcX> zv;jub*17&~aOI^}>V=3=JsLmVd8x_VZ`6@(Hef_P$~`A-oA>CV066Y%ax3UoIr=WYUKpFu;WQEr3@ z^`kxV^SwS_Q;AiKRf<#fUfqeKs?4KSjb51L&UdI+$}@&Oo#NWT4HwH#LS4=T6|urC zYy0o}-26~2T`h&AQj2UFc0S&9C?N@Y>@nz5n^1P&mM5vKOn*IDde`id6ilM^tD-p) zjPsVa2(v7Lw)DE)gWMz$#wKyL7%~phL@#}6_(Lfq?r+V!|0v_okcG5sJeh&bS~cr3 zn6h`e!ZR3l&eM8kPAK>tR&gjMv5!SmQYt`a+-6*+;7Oj?6<2Ar?kPcLiqscg8vG9ueW*X4))5;c_3(DilI;3`2 z%=4FGfE2+KZ}bU-y18;hg=TBMRo~}_j%pi$M>tWQHJp%l_s+68>>Oz4l0|N9N8>6n zHJ9@t7-o7xjW&@lX?8(vxC#TBP6MLF0EI*tF(Mk~F$VXnmvtD%hxBr=(>rsHQkLq& zueCMAQ?$w%pXZrH2t2oi-bb9orn7AezTC)AVM{x@7RIvdn6O(s9&tP4{MF+^c7Iy~ zJsiuYs)6mu#~i^_o1e|4%VjA8RT;lP!~bLY!SBKJubs)45Bf`)6*P}>B$bN_Ehxo( zx|jlrq2a6Nzcy4{hlQ;_>D2d9JS&8~vw3!Lt*bt|@D6cAdN-ZnkTE3MSzB_hP`_Dc z8XmljVsE)R<#+fw$t>L?rXWjoU}TuE^hv`9dp=P9%L+r;J+GBLd%e;F;*V^I6REM= zLqb%Gv}t?mipZ_|GI^NIK=)P3R6Ht`uQ&rNM+Q}E&uy=%!D!X5uM=OsvM7+&&Vc~cU6IDy5>q{CdSqM_J^lFNWEexvs%@hke2gn9m?*Jh;$44nua;+?3 z7bt$?b~T@M&8e1@i~jJz>j~XU>SXeAR_^2Z%21Msa*@a-OwOS@J7?|ZgjWW|;?zFE zR~;A4iN!6HkY|FukP=1g(GzLPkom3%Dih_gVwUSUwIWpmekdp+C1Y5WqE) zwWHD}9@7J>uup3Cc6HbLlEr4omu3eS;tXOM!wnz@uHy^ZG`usy`DU9e#ntqL(ruRB zDg5T`Y_39H-27gldK>6BGS2ZT&%qXE23;6M-Q98QaNsdNLU^GBjQsG|c!#748pA z7`NBiy9L?ce7F`A%yt~s0Es>7G_}bO8{Z=qhWE(^KXGD6IE#Kp0@)NLQxy`0S{kr6pJ6vxINOA22ZA*cmpWmH@Epoy%Op8kw)mhVvKNW&q47BAj(bnQ;xNJZ z_ADH+I7H7V*Pq1hZf@6%sN0(rPk0o2@)?lp8aej}FyT+s8M5hrd<9-IsXT z$gldt=yNlg;JcTkh1VrE4H=2KJ1p(K;!~Mb^5N<2U*Fu8OKjF4aP-=)>JuU9(o<0l zX@t!5PYqO8!od1ealdDevrLSQ0_Ng^rOmld;gTVU1zj`2>LJRB!kj@xC2qi+!8Eho z`r*!vpB^SoYQE$ZaX9ulU##Ty87#}W*Vc)Pbjuc6uoU5iQaC~3e>CPhz2Pmvm?-79 z9s&2l`DWy*7HOd$2DiZ(+qnJfXd@{eV)!~1tZ&$xvSQ4tU|c-OOQ*<=XnbF_P!OB* zcGI~Efs%O{JGM|E6067!dX%l6WDR;NS9EPvK6#;E>#w^IX@IW@Et4{;ShsGDn zg#%6&D2y^LiZ z*^yf7C!&9I{221kiOBqE-w`P)7X2+ELabuNHNEPZ^d5RdQj`8pKo_`*nxbHj-b~Ng zE6}oGKa5LmUo9P|{IaU=Xo|C3K3PUMU!GY;T>Uy$8T?i)`!mTZb+?YRoHWPF&5QXW zTHL+!>DzZC1?}`-!r-tWZbI`#j&75u^tXs*tanJ7!CGzEJ2?KN9nWJ+FM=gQrFBP@ zw*v|T0U-K>D!&?;eeIZxn+K|$un4^jv0y0t8H0GP#qJS4btAn9vka9<5xeN5gKtqr z95JhLdG6`-vkH89%q}WY`!DJFOmfaLk@-#K5) z#WS?4l8+R>FRPl7Q#__Cu~C>Lv(215-Ojlsdc~W5HO3U=<1y$r-8=S>%KX*~eBa6< z&zfb2(h={HOxxlNttKH%FNj2VdJ(It;FKjjW=IlFU#u1VtyLK}CcK8Y<&#V9ARiEX z+SeUh^o8r(w@8CqyqC09eAe)#RJg?$5td3I`GKvnq|wFlx}EQQc+c3_XQOCffutt+ zmLKM{ebf8cnKFc5-HYTL$#8g?5#2P_#(<)9%JJCf%o_L5L|lhD2A1RQiO zd1Ls#hlRa7>U?%d;V9CHLJp4zr7htCG>CpQPHWkoEUb>W_(Xv*7PiH`P}~(G^q`4xJ^4x zMR%w425SbP<{G)gWHGk=6Nj|c5NRGNLm?}IjicQKNeq09z#dfZy@^-doO*)Lny}9r;MbXR3AWzXt zr?XI7p1#=82eYzH(+mKw8U4y65td7OLl!WlYmmDO=goDS@A;u0@v@|Zc5Y7tCVOAX zGmU-@nt^gZTpp55FD0+Q7-*74d}r!1E#NRp3VD?qHS8R2kwA?v^xcH8bfUjI@O2sz z%1jh4qlyA6f|6HK4X(e3uG2vzSV$F)l#YWKJWG)bPPyUJg82@Z|rigJzKUj z^QU0$8UMY8oK2taet_B>9YDe+wm@1}ZPSDbee#KA9=)cjZQ7hiUH0iL$7;qC$Yh+D zueSq2d)WB!Spd=mLW}?(k_p*h!id8MJ1CpFRF>!DbHBCtWBb;}&+C-~1^I>r?6S6D zDx6i_PMMtqK}2!CDEqj#5+4w(QDzL`t7|n%I^`{~$<`VUUE@zk6#GWiV~W36f1oq3 zPUFz0o+x!2#N??b3aU{iPj+syV_$ROhNZ~02!#wpMyS0!$G#}0`?TcGhkKFQEs7rK z>bn}yupEFPpbnOV?hK(lwol6;cLO+6nQ0+|PaD2O3b$lKDe4SUSI9LKw1Vx#EUw7dLJ|54uX8OAmNkzIfPI%j4ve@ zLKho5D*Uy_`r2Z~ug^4-e}3O2T)cNlFA0gXsc25wS_!&_PVJJ<&C*3P0|(jsm;U^C zFuYwixCHh6Z;zk7!wqmY`KE!P!x)I86>wnS@bi0m^e=oOYCBR808S0 zH#omZ6hkTR0=;{GKB+Pg#;UnB{}H<+m1bivzeR0?Y3|uqn}PCrcrfP1|WmttY|k z()#!v1%j~{~WX(Y-M(sBuAidM%B^=(%@K!P-)h@rce*$77OGNEAE5=z7T6dbTXS$0=Tvqaf9^ zW2;x|nT3ao>Ql<6zNKque_L?Iu4dkOIj87P7(4|R0G=rvV0~YDAO+cB4s=O7vMh9Z z3{L0Glc$cm^K-ADj~S&zs)268T*PMR`uSen%vauzU(s3e=Hks3f4=8}_HQ47ba)}G zSak(XQs^M}CnTOdJmESj!a?(~KZ2AFRMPGGnuqXF`}Mi1zc{OA z-v$j;GYRSRydv)PfI+jTR({QH*t z*Zay1E9iyVBoZ_XXU-h+FC)3UE(B!c@;LZfa9)?GD8;|Z7n3bP-$YTQUjv3S9F^3_ zDU?=wKEE?uwNvA-^2t0*nktjLX6=Cpj5J&pPrP=fAJD{tM?dxd2|HX$mB7`8QS27`|Mc#s2>MhepKf4Pm^F=qB9RK121wx+^ zYQi;%68M?_ag6@e%3W7S(9=n+7pZ^oDV6eAz_EXCUr75ePBdN6PbOf#{@>U53*+^F z%WSN>KqPN2UC#i2)|Dq6GTzz?=r3NNdPhXDyeb_$-NF6sa^(49_yJK2w1PH42S}F+ z8Q`(IKukspAfS)j;%q0!0B^nzpi4Yh^#BXj>jVj(w4Wn!DtY%27MJq93u)F00vQK9 z0X`s=18^FSCH#Mdw)$U$@Ap#j{MJJdiV<*_e>x2cOaIvjt{X#tBkce*f(EM+UxptJ z@oTJqt1y+vhlYX6YpF#9J|i-FT{2i)$q?<^dZl+?w2!YONYbbSz`PbJG3b9X#X1DO z@}sX#)lda)h~(9g_8HKeIzj%51PJ19iFv7?l~?^nv4e}@0utUaPz5W<+-*ajt#=Kb zfLWq$Okl}!>Z}}K$qbFVKi@d0S(S_6X%U^UgVyE&FGr*M*tSUOatJskwd%k<+uHUy zeied-_nSs;JehtLu_D!j0GuO02TCDT09C_U&wpTH@iaT7gS#Qn z5m3RPgh*K@X-LpolF0gjVhZ-WXkK*WPdcg1?yL$CkUGyZK?cs~))mAV6t zIYL`qn3-BsRvIJ^1Yv+ep#Evpdi`t5>GQpBp-Ji@?4Mt!?Et`s99L|weJSutz;L3S1+7b$01=H90I)+*T%rZvG9Bf%a|uv35c>K4f|wlwAN?%BPxd_QVh6Y< z>ISJmw3)A*^Oz}ezhn*iNjGX-Rpfz1+0Sy+j)E5os#1hta%OT0=pt8ILL(rHAk;H< z)&e(=VSa%^uFFZ7D76K=;o9N%dKCuOd-OfG;}|BbVkwWn9}5MFDzD$KjS}Qlice#Y zbPbrPyuZI#(rf$mg{>nVZ35&+C_#!fBRMq^EkXXz_K{6Ck^vb=2z{so*>5PMKGwZ; zQcEQatiVb}y5xpo%|=BkXE`rG!HzRkbJ9iXn@2GpFyI8-v9lpd!HSoIt{Y#2?a#`G zXF$>aSSIy5X!SNglOYQ{4m%dgHz$DQD;o@Ntg9@|BLM17P(t$UD7#$FGPV+g z6WNTw#<*Vk>`brXI5^K7;{2R}F{l&hc7qfCT#ka{h#1a~v2-9BVhOz(+Rk zbxf_}?=sPxu}LzUG@%5|--*9jh*<26r8Cw*p6-fC5N_Jn&pcg%I{?-E$ZXk2Y32B0 z;YOlt^YL8q7Kr)&xr;(ir!RCgD%%Ct;VY*hvBRMyP~Y+(D1RtoSo=tQ0f5IS*TQ}w zu@9dEFSQW6%3Ii{{07|6FmOLI0Y88~y8_%{q{0Q09b1FPR#Z1$`B4?pu6cnzw4V-E z*xs*rg49Ex_}EtFMM$tyP&qcGeeFXEElBPe*nt^7!EFF+8(>urObOgV!2aRy+F5Tv zHi&6y6qBPoG2E=9L>;MqUDk0j{Ln@6a!*!-B|Nuyh0z01m&8mf-ldb~D9N?mW}=Z- zQ&r>=7`fW0QMytkUyTGyke{*2RtUop!&`4uiH)>B+WN+GU=Y9hao*(`d3O60y{mt3K)l15pljJcp1rqqJoGH#iimQBTxlt*TAx8J@`gfu zt+_duaQy*T*++$Mg&`wFmrGrPXcnmX#Gk{WZ-Ca1-ACoB`evKF!3Jy}pgYzZo`F<} z7A7YRMCYJ6nG;Z_wvX=oJY5Ko1Zi`wjr(}=emt3HiA5i#3;_Ndc_N*4%U1* zbu1GV_~@I8yVOdTilfmvA3(X93ti(->z?s)30Nn>T?jtMFVEW>l+myu-(|=J^uiL* z_SN1lzLo%HP5b0~GmhV}o{_UW8LyetD*Jx24SA0TxchOS|CqU30}tryYsy|20f+#dU! z<8~&TIBZpL)~PG`Vc2j>WMU3D$(Mc*xqbu&d|U*bx?f*T@W{xE=EZ8z%95W~czN@| zCtHu2wXBAX5~vR$@GHePp2oMk+J0Y>WxnVp(g+2|rbJ7g=8MDZv;;j<|9e0#7J1;W z5OxF5Nbb)o#QM9S=xX=ubJucZ7cLf}*yC}6n>c6JN$Dto^j@G8lXQOdnsg<=62fom z8S$w-J2EA?X9P-tY-!OMy#nd;7U`p=HJoK%3mB{a$$su4%CjKqpbLN9J`#Cm_T`Zt zP5`)5LEA(3D_v!GB@v?9Wf}>23Y7PVyW12r++jZfT+EM2;b`Kksr&t%XGiNw@}Tvj zDzxM&2Ib@9N7Xz&WkjXlm6P?~QBdv4;0%o<7La9DG^ z%pYTnb<$q%MOc12>tmnAbNj{Es3Kk7E-sZC!RevIk9ku$2x@!ikL4V{hHp@7Xt;;h zF3vBflNh^=36%5C6pS|c1J6yBT<@!~=iHJ{7AbB|MN6!dwS~%J=Z^(ozmh+PE_Hv< zaOpVdr-^s6zB>M7XMk5 zYMuykH76BFS}t8*>}s^MZp`@)FsBvL5i^8=9WIW@Tb!@qqjR<#jV}TJSAzlFFR5^i zwNaTag%iipml;XE7b-_!2{Q(HZY?RVR0FFGLKsRAYxoj#yNV`f30Dc4Xpo>^3NgeO z!O0%P=&~+TVPI1Z1gW_78aGF!1E*&S0#f#qD zc*o57?MU3l8@Nf0pb@cPrN%Z*fEc;55bjUO;Cj`yvq%hq8RRN2?&rG(P!UQ)21C4T zP48Ry0n0SfLm_8@yXbRgOjdG=hIb<^*1m+|a1K@~TB6BNh2roZiOj6%FM#r(qb!&& zs72I<1JA4raGpf{oWre^o|w*T)f`T=+rX`1PkQhH@P2Evl)y3Z^0C^>dTf>Pq!al1 z=_iA(D9MxpF zqIhFVgQ?SrtsUf&@x<_vbdAX{yuB;(${u)08`6isgAM};Y0;g8q=NCap!U1R${$LR zd#Kn0HRtH<=u}Xm#l{!d9ZD4^&i}dDB`j47q^Ul`#qeRUq$3Lrc)?i_c^gp#GpT*s zlo6R(4)^lmdYLbxwn_u?2JAQePtk#GdjL6Cl28hMWEsuu%PDT )EK-bGG=6hMEi z0f8OiD_Bb7W41A5u%mx`SJ{cVR6C)rl3nsN$@x7roH;Nr$Yh~m-sc&GN7x6b97gNi zX8r*55T&$){)kTT(6+3}U%&5M`9f65TmxzSNfp{NvQUoW>bM${FS1$cc6t7x1GI_ZSj+NQ zl*BmV1v-Hb9F_WEi8k;(yO43%swoncapl=I1)(TnQc4*$pbRGr5l4w5P1D<%nc5xZ zh2Sbb-9Er35ZJci&HwTK(VZv?FJ#zdbwMR;kgEZETh>{Y>(y}5KUW^4{2@@!b#`?$ z#!Dxt;{E_ZIRl+C_*yYN4N7P%;bnO%#0=6+Q1k_o98kobQ0|ibe5XY9# zd_SvYpQVjNBYTwr%hUtm+T>WmMsh$;Y&Rg|0w_L_6X46%jG~C8>DrOC`96dP+Pel2 z9u@I&-s;@_snEg%-1ILI;=$#}?yPT$|(UK7~)F(nj7tOmb%f1B|F zL-s==S1@if6{7)tKlCA0XUi8?b+qOGIyYtYrsHjDz$~#3LM)~qKF|iU!4}WcE%@N- zq@OXN5Dn56BWt?X^jKOv4l_Oi6_3ayj}818)sf|JMBD@er=cp2Y_Z{=#7=ZUy%_lA zFtQoZpN)*}Mhxp3NXce+EuVmPwK_Qn@`oT~_P`Waea9F}XMc*Q%nrqcmb8LulcVfK z!i^4$v-fphAG2%AG=WN)NKh|1m5dX-58t?_+N*aNMjSi_P1(RnSa=yr96{q_Lm@+# z8-jq`kpzb-i9try4KwiN2d0ua#s0uVG~d2HwOvPLE*MG%>EkNcU24)W9CT6;df;bt zIup9hg9`fbU2+QzkLr{}u1tT4!MscP`p!}IaD(c--O&+2&21?C) zFj~)4Ah4L;mYhp$LAfg2b?96pdq8+5z-*(3<;PiIRf)B*S_KOxy-!5omknnDdhMTP zPCgT~3#+w9<`Ie8Z(ykVIQT4$KTB!Xau$kec863$-3?BGO!;78|JxOiW4vD+O{H{( zBi!UCaf@(vx4liEmWZ4P&|j});hWNRvYoVkKD2 zaEUENS`d?-eo9W+c+>AUUXGKLQJ7kZEO`Y&5b}C(8oBwzc}UrXg>A+9Kt+fQJA13V zyD=wEvGJtEHH_|%rou|me-c$Q@~WbCJ)9&I+`G)1;0&T(6cp!$;pV0YWDeork=K_?vMjfB zTp}rL%99AUKt=kF((S%Pe1u5R_b?t*1H^~ixzJL9Yu1mz_5hC1(Ef_{fg<%GM>IE>m=4zees;1*7Opz zpvZ;f9chl74gmv8hTxW7b>6D?@4~sy#797BHyN&H?Ea-V6vvVy|Go&a6*yyv3}raO z^Uz=@1t>qG5DdxHBZe5Y3BKDGZ&~c5q$2W`fCH^+K!g9_cORI3nGj`NKaiL?VcOzgpi8}(cFJzaqv06f-1Np5}d~zaJ4>DaOT(D-sCf( z^THoZXQ-q2#ILoH7#=q%B8JX}PN`H8;^R90fE5Qge$U_lDHsyVnLn2Wu`w-v_W;6tU?wD&>j@Fr~k0Dm!b<43PK{1l}N-%jYTI8 z#X7_FoUw1%13FYOXB8s$CB$A->@HE1LYFk>?+mx{PdO^~-|IjV9S={MN@?E#0ioUt zTx3DN(wd>WDm~}w$x_0|kcN<3IGBjA@>N)F_600SO_1E`pa^fJUR*7t$U)ziuU=F6497Hr$zE z)j71cBa{Mwd-9Ua5x_YbBjR`E>B{Nmd#$a9KV(I;F^>|V*Q&trTd#6=mE*2)XfEB^ z<6X6*Wp8);j^DjQ7-{RFT;;429anhN(22>BygVq=SJqt>sVur(Df^0)lDi0=$J0X^ zAAi2Tr2PtnDC3uG;cW6fM(l!fT#F^-569{ zP|w8j+$#b{a_O=UHP*d-=2o2)T#@xWEkEAWWipsr`9{H?IA)2Ur^_AzFwdWh$c{9g zw~z_l=?T>O)e1DbRK!JqFhL^Uj5Ka=c?mAi?r>@r&R_@)bm%Q`~}Gcnp|onGz*~E zRdjPN{k()Y%-xn|3%14gTITUC*o{2$(sF&x=>vtt@(R=!D`e)@3-RZ(XgzC4BVdf0 z%Dcro4|yb28b>S?7vm3MA@6cr`#R3#A>xrz#5c(j34EI!rU4U`-1#q`=i?DkKJvki z6F-Om6zB2x5%Duh<8P}9xR5#>cPe@V%7T76Ihd4kT9}9rs5MskbienN1vanNR^m0< zmst>UztZ_yD*o^dUp?u{%EKXeBJY{Tp9IU|@lr>15r{Yn$5mzbz-ZlHC1DCteRnK1 zGBJSf3y{sC?Ml_x_qj7nH+GaYzOm_|BkD>bgQJqgJ>u?tRce_!S4)^h_>uN^ZY6aN zu;x;km%K^!PXPkHGl&X_w07NwLxyUx@AK_89~P?;#&)P<9?TqQgU_5;xlly*|{wCbTK{0UL7qc3_+xR zZPPKU!$&EzmpFd_G3I*{2&+GgM3YnY>jv-U+7Vi^&>Jda!^6qW)cX3x`!&A|)hGd- zjQQ?8BhZ9o3DlTjTT;`{*DgnM{15itGaAl^{TfY_gd}Q+9uY*85F&bwgdjSRf>9!( z*CEQN36kiE-XlS@QKFkcl)-4x27}St=#0T&&ON{9U*7Yc^PKnF`Sf4QT3Ks8gf;hl zwY~SXukgDb+TPI+MxW22m2bWRS|R72NPXwKpKCFJ@*Vl_X|3|Eij=GLoIC(tI?82W zItSaorz+!AQ;&o2ecl>!`=@XBgWr$!|D4NTNPHRNA8++d6n0?vwqS(~FP>_8*MMSV z*J<`;{8cq^01maV0(gl2uZG^Afl17UJTP8`?)G3=6b;a{#~Da^2;pujIpwANiNKIV zvwm}y*F=RJEVRBi$o(9ed-BnJx$x9}=RTZ8Oe=x{X^K7SPjGQtP~>krPa0Xucg9I(}W6=8JZOP^dXYG?n2$W7GF9qFRq49R7qq zha?_}$=(RR0)Sph)Ngep|32;>==Pn{;8*F?hf$=+ihZGA=MRqNVFF;itADQRDa1Pk zM2UA&eSIdOS^40DQY3e|;@0(Wchyz3;6#6N!k3q^Z2-)i?%myRZ8z#DB=<)FV8xN| zoI)tR>txwB8lif%-v&^pkiPksJ3*TJ>lu>C}*Y;q-Pe7 zqIkp;{@u;BO*n1rbHCd>Q|QdH2@ty}q3~4S0S@fhtaXI8*>Nh~lz8Jd zn|t4DbmxMXbm*tWj`eRTio7 zXuZtM`41*^tNmd17mG9JtK02N(}Jvt5*@g8ZrK zk`uux{n{PCm&dm!D}lBv2$n@ct*8(rYcFF22#9mR!0Qng*57;6+dMjII}1MJiOUl$ zyW`Y_3*z*+-9GoxP~^*(_6Ig3ccVA5q=SCLO5OC2?|6K;8c9Px#8W;C=M$#?XvZR6 z0W47FzwNw~#NS9UK>OI$DVB9osJJAd(%{xLp(G%j>fpcvg%RnI37UFPWM2;72i_Wi zC!CGyOPb`AAMGk~yz^Q=I(&Q7(cE$4H6QK{v$(@Bkl3aBRxXA%_yK63wu97I-dsnR zLuvKcmAljPDNi-cdyp6BslH_?+`9YBABbJUG~qzt@{v!~$AyK~57y}sEDp+F{>>xh z8?+#!@Y*5#t>TO)rwrl^d>s0C>EjYmOR+mZcVCcmL($8V-r*B}($kPvetU*H8VN(` zDq`@bh9O2&0SpI!wl(ZKOjDmn5j|#8y@(_~FFC2uzK!(FVMo^1k0f)4mSu*%Yn(Fo zn}E4(El}06#p`tOjkcxCjmMHpZ9xl2f%{)BTvny{69!=0fY)ugfz|`5UU#OCGzTuq z>xFqku%|VbYM#iQeZ2ANZX7H%r}7C|s*1d|w$m4pD{tOfK0N>|n0RSuLN*O*fuZg| zu(Lgm;pMZ=2jL>~B5H3L9;=)hM50!>R}ft+$HROFuayY{bRkJaQP}u@(&*g(%mB!}Z)u192hBH_NRrU;o7dj^|I&c~tg;^! zX=+`3TQL$=^yt8%w4`6WXI^YzddGvqO>**(6 zQ2mLx38IYX`%fImntYe*ih<|qWh*J3_9`m=^O#p%NWvGwmAasR|J47thPH?#u3?UT z@b7?;|Ht?HKfmLDzN`OtaP$B6>HoY}|9`|F(L!|=0K=*Ok*`Qk0q%fIcTy}v?@zwV z`v{b4%5N6l2_!WRFsSOV0#>03G^C9+^G7+xLj11@z-oWnhwHj2PR<8^HMSEV^?&0d zS|Tf<@hee^jDXa9n!;~*KWt?e7=5^aZItw^Zkyljeb=e5KYu=BXb|+szH4v?8LOsdzdFEkYIuTKQpa9Di4yl?buJNq#C@*L zT{?fFyOH^i=85Ft_mN~F=LEDFR)A{d8nCfsQ78rTF)Ruk`$xtEkmY|!Dl2FpuJ}HH zDw=}c^JRJhGebCuZ~+`hZ(9K{(>2nDOCK-4zE0kyU2Wcv>U597fe}fTbSrueScTIf zyabxA;RE#=14p))6~ICAnYVm5-`5XijBiKeF0Ggd==Z@3ZIl3>JGjo4B=-V11#eum zr1%QJA5o1{kg;#+eR$>M59~_n;su(86Q84y=-WFz=i{_N0f@t2diSEQzpFL>?e8C* zt$D^@!SLbZ$j@vhwxMD;i5M@b(A*FbPiqe8)eoqkyUC%M@TMtPvkP)%ah% z(&|qNQkNLxqaC2sq|G>Puc1LRbI~n;2%P-Y$y8cDPI%wkLDxp0fis`p1lSc9zu3Y< z(LSXYS&(w2?ULgo0uGxz(}6Xgd>dOebC`j7K>gbfdJPbfO+1wP%*~InIWnvhNL#zs zaWcg_WjX~dLCQ;QPfiI#P7Z!i1wDqE?%v9fF}&Uar{1tp^P(!fF<#)(pQ+)PNw1{8 zKKtsY3C4fxuIlvk({3e#1KhC29hk{JlF;;CfQYjKo*ylMH{0;0d$Um=yaK8ed(#0G zd*E^4)1$R2*FSOi=4;ADGWZUM}|Sz&6Q!#mGX&@u3L&gat41bT>BW8p&ws1#kep?WX?*L*N+}gfiPCIWhu+ zt-r&_sM!wjEondPA_F;1GoTBW1k?aDb3V-k^Kn4=(gyIvlU;WlL_GmNl#irdNCbd= ztN=u>RJgaWzT$@6-tBfUQcSn6DWJLznPiIWry={;G-YLH{<3HSMZ%Z+fx^yogmJE& z{sXYHn6ffXHbF(3BOhT-o{w7|QaLG^`h`oj z0H@Po3}8N($*B?9k@U*pSJU`i`~;Wxv`gbV{Nbc8Lq#(1ZYww^_<8ipt~|puF31k{ z@2`J$o=YLUgr=O1fL8$E8G45S0cgf(|(KQ^0>=D%2l(MVAVm(^K! z@b;!6Njr--0SBi=|J!aK#cW9;sLZc7fbGpigzzPom)mW0%GXF%-+*e^@;yM655s-A zEdVBFcy|EcmYK=DN;&O1!hEcuw+7c?e_$l7wHsiJ^8p}fqym6c)@>RVz3hJtq+5N< zgF#fusm2{p@cB$Zsd(Q4da-gq`53X3^jZ|GE{q?IUd4SzY~bV5F9a%4SdB}=Io5|r zz1k^2X3+rp2S`VlMwA;MFv+(S7TpQDz6*oaOgdEGpe{X`S-bo9^8h@lZVSjpw8>^3 znlAuMu?=8-r{@}re98L1sfgVH`rD8%pwAO*;ivv?N9WnxAA#4C@AMRhw_26PM1TTW z<$5RcV&&hmIyY7PVZ9#9(wXJ_`Tmx|a5&sZhxO!u%O+*%^e=(R`1*b0^ce!xsL}#S znUVo4GR|caPwlR#47SYK){g!`t^@EUBgu%beCN;A*tl!-qhtf4{d2&pL`&jxbpS$N z)^FkZW96L|@;2b>jt@OIXGr-T0tTWp{0WP>Pj_(ly7 zPNw-5f@b8E{DUbiH!8Xxj=kU|b-?uZdv&vcJb8H^AP5SQImt$*nq@-X^N{?76Z9il zRt>7XBFt;iqLVO~x4HJlB7h2LioqgX3LD>R=6fMLbIeypP0+w1gvp!fvj57EaVoMQ z^YXX1zXAc}+cTuHa)E-HqJjQ8ea0Wx;Yd0vaWd;8l1=j(g!AGdunL=2;}1CQeE_4( zYftYFxmFtO%{+_yqQbF#%}aN-?i{cY76>8dq8tMj(5y~1oYG?`n&@~oRyf~ZJ_dBJ zhEQ+0njM!YneXGd`J)RZ?{{=Jb}3%LOv^if|62~s)OK0x(4=y9S(kU5XFeCe!X6$U zRTA?V_2t<9rgQqQ+(hDqhk@Pmg*W&kzO`rfM~#iZPS%g1s0y3rs1b@CMg46Oy{IV? z7PL$P_>Rtjjz4e2fUfn~&qpf=61S}Yc=ig031@Z-B?8~1GD$|1_!j)kaTFg2B!t?< zDrllAuy}y5-38z$l6w(tJ}59s5So`VhqjxcR#Rf<3=h}?9|@k*aGjd7CwvdUiQnN; zj#6BBV|DNeL1Qy;|0QcsccI8RZt*imk1DPbBl)OreDLH+0Z-JQo6IW!`kl-^Sn7K3 zgBqEe*vF=#A$+0DhogpZ^}}A8F9l#3T;3d>Eknn*&@}u9A=*5=@$~`eRJn z+Ag*}4Z>4?%T0HI&E;;bp`!M_EnO>K26QOYGX4Lw8Mv{ZkU^Qzl&3O8P zY?M+lBpX@Drn{ye7T(;P_b2O&EMzdM(hK$CysX5Ozu(Qc2$gPVu(44`7>4MsMD$JXI^`(E2!c z=gaMME#&WYe^)R}3lUfFuT2mY8(@YVnKfpYkPN^N1Vy-J`w5@!9NNtx=-^sv2KQk= zk9xl!T0phXD{i}5+v_A4rj{+P0j-_9E6!~z5F}}+|7^JHyK1aj2IdFf01c~dc>Z^$0EsLR}*#9Hmd?=^mws@_ieaFrk%1|9IMa1Ws_sSU+G1dZXU_LtwDa#GH^Cu zxtgnx@f@BNJ4PeK5`pS;|bnRhuR;@ zw$L*s2$x9=q1^qe|r)V&bbn@*I;P*xk9l1QU(X2 z3-r^b1boeHiSF=tvNGWBRtjx_8F#r8+^5)C9?gZw+=kBYwkDocaz@W zm+5V;0E%HCERNTejR*4U_S#MOdAHV7P}h8#UM9$*{bzg{Ym;k{0^na11h5 z`JV9+9^i2)wrDaUuh~4FZuK?N8}XxMk`jN$5q`F0bGJEmUwoI^zm5f}QB5eSgElE- zr%GcJ_eIOTVyEUkYbNEEP@il*-b2q$?lqdRqI&kj&dSZpxI`>F?MnA&Gh}EtqCLAy zd~be(CcmswSh*s6h-j#Xi%IlIwfptjWg@L(o34u1-J-^Ee*XRxx&7bKjHMp*&Qt8C zPb0PAv(uF+-WlFAfY<2P8RFWDkeuKRz;ujc8cb$OHQW9Csh-6Qh^@065G?sL0Ijf=tvAgFE*cv_gQ}M) z+mGwb(LL_V3%gZ546D7cP~|JH)uucF;RJt>R*kCDx8AxRaJv^uGs(LZCPE@##A0Gh zred#Em~r;dWlO`qlLRiBtRI4J+ljHoV@>3|3!*b9&Pie%G%xz{ABu2l>!-KFcPxO- zKgTN^UF~7Y8F->ut@r55=ld`e8n`c;2B+0{$sV)X=6@d;YaSk zYF?t4RL_}u-=|92QH~H}!;;-}Ca1d_tVCK9E)2ETZrtrI4L=@sVz0(SM1yfO+#Q#)45ld8jRAb62A+(wXThjmQv71>M-f`xgSgu4+wH5_6a8oY5ZPQS1Q!Nq-Rqhi>!L+ zM^18#@4MDuEB15dVR83>Ua{@gs6EbV`I6Q8K5bq^KY1+EUQ|`*$wn;LQCv*9`7}Li za{Mxi`B0%t)}>65&L}3;CQP2t@pfDQ>9%R5#hY4kp~F#~H^XM-Yq-=(%GHav`adOW z)wKDPT}5Skw&+gW7@oLv_{2|@{gH6HIEN-p*CEw6YvFM+*q^x`uQD$CYqAy=Q4@EX zDx96da)fJI-Q&vQ_7|prnN2#%+4!o$%Kp2v8- zC<@kjyA9x3qADmkFSuz$e}PDh@hfuIiAs9Sk*;$Hyk3@br*q4!$PLgdpXN_`aU0+P z71ru{20}cM{3xE|y831h``<31lq&-@R6E0JnUK;;t#%cf7;3ogbJ)f{O`?KTj+b}vdwqhPPWfyBtnZsEb8KH zcn3n7qTfzlsHY8O8@L`5y~5$GHhuNMx>{D)(!h#(w#{+fp1=Ul`2N(-m_=D_7?nU) z-V!(fw!#5+w9Dc)<6WOMOO_hCxnTA_X~${#s8CH-U?XjR*0;r-8JpJI@EkI5GETpF zZ9@P}kE4?MF#KT#JYEG9RfE9)+Aj}bPz&}x2SDAdc5LL0G2j@;$hq)V<{VeocNKKD zljD)=I&--APp+r_v4;z!kx*B5>D%G}$^8VSn)KwuRg%rLR#?B3M{a3Ma0vcN<&E0` zv*v5@DVH)mXCbNA(vBFveDt|E))vv~{Zg8s@>;fkzUFMkpUP|5Vh+MO_lLB`1;}Nf zi`Rt`c&m8ed^6}Si6)?VV7BuKKnz4;7gJndv~f$Aex&7gt+}hWGG&UnHnb;rKTz|2r~@51-WWvdV5 zsVY*fPYrAT-ufQ@m_AjVQS4H0fdp`spxTITpEGB+$vtv-#}57DN8ZK9DruV%bl?Yy z8`R%y-N}b1q!(br~FlIuQA}CvRZ!$G)-1q%BWt;UkSE; z`14-djh&$a{qPRw`H^~(=J_9wI@>=f2cZ3 zvQ?Up3Sq5jm8}d4y-zXtE|Qt!1Q;dP9;cmN7Pi=d0tm!!?oKDZ3y-ZVXrGA(@;TLO z%nh9V5odhC%RCQ^n_2Bt(HrogT^4+``8CczAy}1i>YGzxRDyMv%&KnFZM_~6+p})R z_r`(W1vmpcYJ$F!J(P!*%36R5t%4N%x zYhoBvqUfd&9e)&!Pj4r+P?$&iGvP(|cSbeD5z#w^=947<4O|FOQsBrbz z0>4shW&~rp;}>TGQ%iIPi&NFn_ZfH!X1#uel!jk8bGF)beO)Y( zoMPUskXpb7@6s0jbYt!FraR=VkqBfHM}L@d?F8nhU|Y3@^_%0e0hD zxeD6szO?1az!4ErCbH%p0Zr*0#Lc~IdpJ*Lz?1sSt;cZ1Ak8;s3H&0~n^Mr-H|93Q ztWp|HDdHOYlj;$t8|?*+=MD#{@1{x9a~zh3Tbu@zj;H#pCdR*DPDy`33zOV~O(@?uv>9Mgs7LU)C zVAJ+1jMwhy5CKdiGr+OQ&QvIx)9xoOV{D%q$Udza-W|-Qd>HXSwSc3JgeQten!Y~$ z@UeM`07OBMG1JQZACzpGyUeyh&K6Ra(|uhL_}{?pUi1!bui{A7ff2Z{qmuC{*I`XR zG>8(z6vz^AVZwAODz=>cUce(D^v7x}?~M=2F?M&9Xb%%@ zE*y347R)046ohkgg065q5n){YL{X04UVteB_k1G!?>zK`L;fXAE4wc_W2P2 z#fiK{yBkuL3We)bd-NB0Qo6rT_uS7IGR~lQL{Gc({ZXtI*|c#lMUp$;Fl$dJIoS(^ z6gQ_Tb21m>5sJ%hT2)&B1ei^hu<`Xu288i3?=&-=*ci{Nv5 z)Y~~_-P+uDhtz^uI_?#@`n9zs-aMtc^xk*AZZ9ljb8bpZ2)wp*g}0sdz;a2tRm9@= zyQxaW&_fC-k^yu~i*L2(8^0S;Id>oLv`Wc(TddA@aeKGn7QU)bnN={yaJW-yJ-!L= zfIPo0Wy!&*tST1@}s4jchG{2gyzcJvjPyc#wc0ta)l;H}) z;!h@yhfVsHM9Ijy8qEr5u>Pf6^r^w^?BrtS8?1@pWa_!C*T|oD_}& z&y0U%>J&?0o4bt4J_)nCkN5)%8K!)l2T?sU*BNh-oey5DkINP)!E||EKs(jY#9?ny zu#8`3%P;itg}XJ4M7;T}I{P7x7X4FlpEIxQNB`n+tkEZNgI|umflJu-Go-8s?Lq#W z(vN7BU-*E$n_XfZeM-u&oMU%;3re5pNv%7TJp}U)xLb5F+*n>YB06sO+c-~bku{^z z&o(Y!LSK`L`#sFaM{`^C68}ZE$ea#7UX@_RM{Z1l@$656Z?m1}s;YjH$hp-XcSG}) zm%t-;Cu)YcV5*QXwh`4^X@hqzvOB506SrAd>yyX6E~EuZ`)5}T%-CT(xBasdX5?44 zsvJ*7ii)bowoQvCiTNETl~@QqO0JO5Zy5e6u$v_@x9o3Xu3Q1XRBy?-M(~A;WE52N zb#@e23`cL?4L0%c#~q*8mdTrlxya%&H#jPCy$L=+c%Oky4C8mK<9L?eY~c^=iChw2 z_{j&Yrq#Km^{=Cd+5&Z_yX9uwsIr#5y)(5M&*QReiZL=mf8yj0jwLC$DymXn8SPip zH|o1MA#C+EkBiJf&}w@1B;L{QQgVF3pl7d>Q~SM-f3KMs>amw{TC&N(EAnlo)& zU->B!+_yQ`W-XdnZ5@T9OMdfQ`5A=0;8`*b2CA8|S$TcJiFJf3m-){O4OBsuU8cn_ z=iV_WJBz3vN=MdKrZ`nljo7*Rti_GGU+v+*j~dZ`z<;KZP4!dXsy^r00)FP;)N{b~ ze4coO>uWr(l6F1M7CR~HgA&oi4o%|OSTli6su_#6fjBo0u*W7dd_zZF+w7~WaM|Nvh8=Sdl(?mr`ntlW zqjLK4ow+6|ShSrm8vO1LvV<^EBCtLxcyEwns;l2{`tl}wbftU@l1iqRFsvY3dwMDt zm{%#gJ@9VTydJ>j!Hl4z;s?e^KI#K*10OYk)IB=i8b?Xxxi_%one?LTvV16v zr4}>~CY<@Qoz!{ftW|u{rx4rgC(0c%f#9Dz|I7bTg=V{di@w zi#4-U*e7v1zaJX^P%cPGqHBfFdK$6Y+%vcOdpzp%>f5H3y_f|fU*~4jQeZ07v71%U z#^No)_1bG+1TJSgdX^Y)tp<7-DWRS)63}0r(PK6&j>=y~f}@PZxg&bsu8i;uS76cL zfLEXuWMQw%%ucwplX~2d_4Oo2OVnP^Fjbkwsk5YuG%thguI=$8If7%;q4u z{xrICGa!>awJlwIzQ?RA$&qkEKyIT_p$kVo#4SuF-imLh7tt(npX_8`vC#48-H!o; zBV^#kST%N~f&lOJ8OBE&_29(upP!Wpv1~59XS7i=Wocswo{8$OJ3bjxbmO~sFR0<~ z5*Wrs)6n;nJsnzQV;tH?wFoLHCUmZ{WFu6WjSsh;Q%em_G@A>@*auY=FuR+sS_Yp_uVvEY97 z4~${FM$eofI?erO3*S!0S(ohK%|6f2P+YZ#mxL>PQ5*h7X)I6eXkMnQ)W>{VSdI6W ztyH1zP#|=B)OqQS(KfuwAX}sp%X_$tc&rlV) zIQXM9jFnf*?D>8UK+176W71PzwJYdqbEU>0igofbv7)-a@Aa*=!^O_BJ7i7*&u-(Y zqS~wKp^Zwwb?HSe$OV6vniV=z9Bez)8E$GmX4mWrR|4T>Z%%U8ZndpVaZtbaBn~!! zmMXy~93h04Iqy-~Q7LQwGlb;JEBqThp!cf$QVN6_I_L9^h#=eje%GViKG%TOARj|R zHBE-?qb8y^@Zo80Up8@Ed0Jt=V7oCJiEP@`wpLSZ6cWV#(lWU?EwuGe<%-rlu5~27 zp~a`|<>lThO)O5FZ2|+Ce<@EmkI#^1G!5x{3B?DN5O*uR`$s5I)JCI)lj!#GTGhWf)v_$HHaf7BWvf;kzhJ^Y(UsOOUs5#4`QOk_^Ed#iq1hPbGAzSyKknKu@a4f#Za z#9z&SJ4ypdt>ohMb?Ur-==r@@E`Gf-c#QE#@RyD0@x40JTlwvudglC!N>~b^)+XiP zOs{g|CM`F?id-#2O;zmP&^8wVe7nQ%_{?Id%oEBNOSX)))CmHaIygLL$w6nyEiZi9 zJ#OyV=0R7&&d^vPdbsYJwQWkIJe0T;zbaYTAlCmkfo0~XQ@Ve}MYW#&1yXiwfYH8a zaLkfj4`tzW&U0~=v8Z#Ek42M-RDQeV3IVn#-R$s()8DAKP(w5XKw-)6qjyf@^`%Z}KoS2Dv+xRS*QSgD<5S zs5Sa)t(c%NJ%?lbYd+GYPSS+DWlCeZE9+JA9zs2FKJuJ9%GVb4DvSdANg*Sa& zpX!Vd4q5Jxa&AmFn@Kn__Fb3Lw?iI|$j(5lLDp*BwEP3iO*i$Fod=YNgd#!TgX zH|iQ6nj-Of_{sskMiABG$d#u(z1o1U=s8NDkeS(TWXq$dK9bMhy1tMZwcjw-{mpb5 z6H_1bdtS~twlp^SZ{4!)vfQ5UYfy7zXOmM|^KsklVGfXgLCP zdnfXIqWKb;{a@LDrLYW5&a2q|w~bbDr>a|>ggg50l|fy*|qt`?4^=-1W7HRy7r z4<6So8nkUkck94yLN1;NG(0mmn=wIT5qG3UV$DErDk4r0Sngk-(g?htOqEh{SzpH! zIlY+FvUPtBaYX<$` z^+S{X+JeebguYC=xB`Jw0lTez!;Y6J3h&`OAVF+YSK-2d=S|&`V>Wlsmt-%%9P>k! zC?_4BmSnSnTy7{rUKN}i{-)5m^7*$S<#!a3c}Mg?WB7BzuvGOayE&HVaLdGC$~1Qh ze_W|ZnQ@i2*)vMXAg+nxLrH^`54o6(d9Z70=gzKYPJv3$#VB@@>GrFYOiWH6KDkt4 ze{GMf3f&zaKHb#DvZ_5FNE{+dqO?jYn;hV;$Q8JY;uRIa`lHb^QUYn;BnJ|JfBh4)y$J@z?8 z-k7F6b-tYCc#xD*E-H35Q!y?5`4P>cIK_MhXQdcxRoMvBhbiVk`_~9ZOVP?wx{0LC z&4HyjaJ3$PrQ}}s5Rbv~`-uXsmHs?AY9WlX2F)-2*;?nIrx)tL(Q`tzc9}%TGi$Zy zlzLE4X9rsXBZpkLB10u)O1r=4l(P|Tjwsk1M`?PFK6Ha@azb}fpjP)>6`AAirA)}8 zjYj!H!RD@uU(_E>_;5lWfUquJnT)=ES0G%AF30@lBdsI5YiUCd_a!hj#BoTo0~|Xe zeFJHP1S9LWVDsZ16?|!4K_G!^Q;U1ATxjZ-&KqP(gOWIh#W9KF`t_?>C`kFfwIX}B zA6xSDCD*`WTYlw^>vE$H9WgL3l!6j48U1TMk;zs8|Cx;Ezi13SorIoE%1?44N{-_` zgg~5AaVJUb;W+YFigAW8c!W=p>7j7|#I&?ZYGyA-1>#Vxoy0ed^GL?fP4q7{1~SSl zn$;II%XX+;=jg=J`ow&1RAL1F+7s34Mn#OD|7jrcpP2)rRS|b`bu2GxB-?=7N|P>n zX0R#y_3445p3BVzG*5u6<6QJxF%|1NQ${5c| z3hvJRM8j2_WaL)tEJ{DN6B5mCm@2epX*#$96hV`a?3(&Z_BSZ8jwR1;7Bz`MrGJ^3 zC5{7Y7nC1SOFuE`u3Y!cIG7Iva|;(<-_}{O^OyT!XvB!z6N(g4kIr!$t=YJe!ChI_ zAJSv6Tx=X{qBbnCWZFxF!VMqKE4Z@LAe-$iBrx5H+`XBl}#Q z`tG6epVxY&EJq?K)3_}O%(AjKMl5|hh0tjT*Sm*8ObN%_V?koG*=ch7Yi%34m%z%K zGc)?oOuEMWpIH^c6}gMa7AzyDnMQb5GlxNTejm| zzy7jcU@yofb$CL4T3JIgmxm4p9`yUHU<4sY*HCK`?k53lu!OQ=HM6&VDPHuMZV$Kh zFKWgS$~3LpM47W0NPQ|Wu)FBxL`Z1$m(sp&)&=a=S|B}MPye(GX}sk4fwJF9yOJ~t zAKzLjQ6ywyjx#+gA&?KFpk%AR{?+!74BT`nme`mS)aAXEHee^F8=?lZrPS*y#rH<9 zZ#JIT`MoGs>P&DQ>&BytkZNAIz7p<7uY+8DnL8Sq@&9X;GOMvu{88ORc7|>~Lre3k zn)%56*oL4aVl!P|j@d$DAa|q z0vCwCkMn|#4*R|`ax53DC=vISgSMJP2d95W&6l6uJmQL2na`Vvt&KV@hwd6g1W8wB ztqad$bn@3K(>ogWOc8#q=W>(yGgJ+p&o8kqd=Nx5&4Dj%YFm!ueWmls=oQb*+E$V9 z4dgP1rz*U%rFo76ZWZWr;BBo&cy3w87pZ>#u1Zsrp)~;#GQo6L#PY=>6~asXT+>U& zYnz|;l0cL`nof7?d?x~DfVerV{S|B&a-Q(uN?h_A{k%%6K_LN3dDJjG_&KItXlFPM zP3e5~S@JtW%TkI6_A36b-;NAyiAC+(q-e5l8J_>I$K{u#ly%WEP48AmYR_p5~X3l&Z}>I)$shB~BxmeHrl=0pMk4rz9R z+iY~9M{KzbYkB||aE)I97E z_z3N}3W!GfbCV`km7-t${p!M>@X#C_B)^yjhY;_o0)yKNd`Qd_aHiasMxW=zRljM) z>INkEI!Dr(h?Ns+)!^k*r>Cut{}ZQ@rnu5DHeeX)cox9VQIg%R=q*gA$y(TFkhx!Z zh%oe-8tvcwg5p#&wJ;n_zTR7Z|}<#LmU!2Tx^Y=-_Aq47eO4s0ZKGr~U{D9`A zd)t-iPZFE?b4(>3^NJcj$DjE;y>twx2!~|jC4=-{-dNmRTgcL~Fui~EbJ9UMve>uS zZS0oF-<%S3CjR!zg z>ZVOxiZO+zjE~;6p8;Nd*x?)&zJ)DwYAUI)^#)z!tp3SeRLrI@JV<$cEbEa?54apC0{=rMo$>W0<(exN|OxeP) zUkr_m5A1a%L3`5vTS6I(pVq5HYiHu&J=9WyQ4(sAQyzo_g}-{u{T1gYERQ)U#mGsQ zYxhk0Sd^^PZ=7K8-kbi}ASn>qeFL)a%>%OWPy1BNlh~&t>dN-V10($96$mwB4*kBG zW?Z1t?VLG~3=ZFFq~r7#iPdp>2RZXug*dVO{NbmAZ!ibPC7Ce3(fjg{Q5Th$U%cfS zVK(eDt(>%f-hiv9@$@x$p4mO>oD?rIIdeYae;b-UzH_yC=W4nbaZfB8t3N#T7N`d;$CM7Q$F_=TCf=nep6UmU>O70!yLJ$#k@sWlyim^(Wjx(rpyLCZ z0iT_LVS&C~@3>aYgM7{iJm~s!I0lqKhEgo5RWbL1Ozg)?DXKcgT*{lpa%G=PU-@O^o5+x}-q zwsH}NFjujK@#8UMeaS$@_U3x5{0}-i7;W?Q_dL{3oyOfIvmx6NQbC7p&6UT~&3I+k zncZWIuZMM;GGT-0u-{KS2%0!Ojy)Uhdl1wh&-&Zu!PZ1<|E|#I(7#|&lzt&YRdF61 zb>*9;c}FqF%S>#vls>;R#{X-Qikj*WnHbV*w9oq3Keafzq1;jEMJ;X8_DTQV#6K^; z^r`o!tBvd`-~%ygibh5bho>gK&e>OYBoDxeBhDxb)xTf2VRv%cKQ6!4=os7@wL)|Vlrjh85T)&;xCYI;l{zM zC`O|(TRYIKFKi3aTnj~dZ^|uWkH-?zAW-_d{tbT(*w1p7Zc1!Up5q#XJfly`6@gd& z-C```Wx3>9jp0=cWu;JZ^x~i|{F(NJy8ZJM29Bg2=Y7}av0Dr3pSs>~t;zud{UO}* z&2Q67SGr7&osJ`(A)M$_!z|_IrfjpOD2a}MZ%woDZ)2(zxMy^U4K6n$lSr`C1y}LduTgs3v@#< zONN+L`{b=!w9!iIVe#F`Wb6z=Xu+IUUK(${Q2aVzUpi<%fXm!Md-PBQ9}oHDk99nO z+io`malYm*zBvm2_QOCxU9Yhe_}()SC>^J}?xolOXm!!o08_R}V2+2PxBp{kzeu*i zQs{y%{0OVeZC^S$tRYb8PJC8lyyVb`3NJqUTWE&{BSP&InMS&<@((9(1dWa;jc}vx zmg7E&KPBKJ>Fm7qYrOf&F^yQ?l&rm zZ|y;)8c-N|zq&M^U^*Mu9^T__TM-yqmX3yez=<2HH)cs&cb9$0f;_?vh(PWDkR0@% zN@06I7@`Abp4Wl7`{9Ado46zC5SGuu?crP+d&hQJ=%y8cAjm#}cua&J_d!q2zk)P& z(?K|lYvA@aac3FkO6=?NQUr3qXbh`t(?CjK6TvJ=@^YjLzB- z%-$39jBBub-c1wzc2M}TAHpwW2>j@AY6)d7M`?}#eoPGy0$1G)9vcp$Ec7peZerD| zpHb5Ez(w7rz0Dr}tAk3dgq7u8dABWwRUb9}SuT*!_;OUm57-t_zD%e=fy#>+#83QH zk2dkS__G?o;ITD6#=djYhFEP*Blg_-$;GzNm&o053)Bfl`_Gf;A@eq6=|>uQP9>Q7 znyhALq1Sbm?I$p2E_C?8Yk|VwnTnbgsdwpN=DGGkNq^;PFvX+yQI!3~iwErSxZlRs zhm?Wr?9HY=2WeljiO1Q^R9nw|i>8t~%))#KU}^%{02aJWfGzy3SswEBc-i4({;Ruk zcE&L9RiJHs=X*jq7t>Vjxe})O3=7^YRv{G`upniTvRxCgN#g6YM0IZzM^Zqh@N+N4mr#6)qY<`f;d{@!6}{+gocQWC zA>Rg7i#_@sQ%u%xJIqpO<_e-&F7uZyz0M>Qp?Eg^Fc0g#Y%A#8IQ)8m=tmvm zy-qxI!7x|W(->2urF<0af#^tCgz`zAvRqK$C@~4Z(+**_qCg0~;q=GI{50eCht?ID zZJ-g2@#BMv)s9}M#+tlvtcNQ`$rObuKOb`h`SoDs;S2KT>O+&>^>rIcZoL&v>+HY9 zD&|rMYaR{_mnhQB-^5 z#XZ4#12K-XN;teY|D?k~8jmqVprr?RsL4k|u)M!q#<3ekj-s1Wfn#{n%-@$QW&j9w z%iIoa^11;YypF}KG2eILPChvjMAy4oWH;|n#weOP=s`M613Cf%M-h#Ie_&EZQ28=5 z*I0+iFsta1iGd#QFx}Z8UEi4+F0X0R*|l*MQS5{o+id-D@`yta47rKfJhnLcIrp(K zB7#rq;Pd3o+t`qVo>z1lA}DN+gZsyuLnR`Iet9RHX|(mO2h(bV8I;Bxsg$ka|26KH z65I}PNLMJLnYu}zf=_BgHi7%78iP6gOLv+r5Ib2OHMSFhAr7b0oj3a^vBB*|@u!HD zr%vw*idmUOL{&_diYz6xyXhfBL=WsPlzo47UNOQb)5_YWw7RC~ov z|A=K|w`_s0Q8py66{SDPJea0uU(31R^AfEXKVC_gXnyU@wXA3b86D9n<2m$ScRM>+ zgb(<0d#ip#!4IoFEejwnu!lWYwjj%Pfy<{KY+5nlgvqQc=O({41iO|ryEb_y8;@Cj zh_rwzRTHNUe+tZ9ZLbOHLrXzH3l*?3T=Vu@Yc*_X$aqcij{dD;9tsYR#@VORr!!FV zoQ4Gxa?jKgpH@^jhsdDON7xsuVPN>@k9*PbqeT<(!)922>ft3V)~?ddHi^?$$Z3}M zTdt-$nH)%}t6bYZhiV+*;Gb{Va5=r$XMUfn1u~HeeLrt`+3v*IqYj~{YA9K?a0L&P zED3p79h6xW-D4OB<4Y3CrM!vy1V~UL>y1zg3 zyf|v~#l5zAlyJ#)dxvjHoQ58Ts7m$e)l++TKXJL(DKVom@Z@Ctcrqd(qk*=zfBQIV zJ5?j)9fV(NG{+vIm}zk=`{q>{5jBmsg+o+{HrpHp->^2@%%+K7AcQ)gZ9LWygJN3@ zleA>kPuJKDAUOirLRT}3vF+hyl+(jR1^B;+!LM&(A^U0nBEW#)?HATpEI)ZkH}_2j z^hgze14O*fOw+yAh6Rsd=fMfP9A_JqYe1=6hAK{x8{jG)YFV{btpCL^8`Ir8LPrKc z;EK#%Wje^}u+|F~go`vEs~S(8kBAnDm0gZnuXP;hQd55~<0W5<+Qf(jgct$xfNES~ z(r+W>*PE@A#<S01-yxP{uLIziK?3dv0>RP;Yk5I*D-z8$*~7U#&b{alE)Z zY0AGpk_6K8@rUCY&H@3Q0MLxeZjPz|c)eO;hgcF8d}agCXDpS|0zqAnJVHw@WZQ89 zkP3+9VW8`tQ==q0Yyu_L47*9Zv-607Bn-GbLkyvF$tN$c%l3Wi2Y9gOldOCFfE84_ z4D43m=0x*R%K z_7UNIkvBUFc8%ms*r`7_OqI|V#4TO0cF)xpDJf(+TsQZO`#7Fk7B0uhu?AP*sK!mDS`^(Da3>9CJqBd2}gr!bUM6s;?e2*BoqCT zW{GIat^g5@@^|bLUxteV0yaWXkrB`|g zZ-w|EgmmE6N}vln^UVL%-qpu5-M#;wR7j#k^Nk9(D2Y51-Kin+aBCDXk>q(bk7Jg) z<5zhox2FoLl;t^zdE9gpg)9w|hepyc4BKd%ZQqYdeedpX?f&EU`{(zvzxMjC_I<6wD3c@27ez?zB>f_5fYkLnoVgoCoQScpr1f(J-3T z0%g!CtoS%qWnWy(d&cWDToAh=kyF9?=onzY%XZ%%fA`hoPE2;YUc6hg7AT8u>=@IF zZ)SwCgWtxshoysaK4>K^o9u3EZTe#sac#!Y?29JyO+YYr&>n@9mLlImPQCGX{h7ID zGkcCpI4IlAN@2M>=;}@o-Hy->d zQqB zfM9Q(TlJp8gyLgVm+Z#tG@`l4=+HH)X?aR)!c#L8 z!;C=TjcDDfS{>h5rvNnZ+FP+5!UQ zNSkhLO|yMd4ZCjbI|9X<^r-pZT-u~>W`3u7o>$!O&%NXB?1Oq$uMT#R5l-Y>t(#{7 z-U|-aZ%}96UBv)(oRhoaO5dLYRDVbyWq8m$`teayYfvy;A|y3mAy_U{Y=YNWTkJc} zM!-Mw5+=Dr-2l9Gp}9k5*c+D^GT)O}bSb5DGP;vufNvUEPY-6PQWKz_XB8`4J5=)F z$Of~>hRE9D`5>`T#OvTFWNo19H$OjID9^Af4T3Dp#U;0}WAjVnZ#YqO#>^Ygy``PQ zdC3+ ztYnN|NUjZO4-=bV$R`+&O|P04az#EbYx1?3g(3sX+Q(4E^2PMrSqPVwDW9oaR<`f} ziyQW=>ZPkunRMR9opQFUJgY8wqolVXlZTLU&oUd&cEVZtiH|gU_MbAU(Vg28fEKK0sc&6*fxCO** zAko;G+3W#YmHhV(q*=+8GAsHWp>R$UhLF@W;)r)Dn?SD-Kohif&?alIW3y`159)p!R^TUX7ttxiOdq+T zFYLL_Qb+mn?q9ox4Yk*bc5HgGbFHi70iq;8EEsP?OY6DhtcVag)c9)9p03`6zY}b+ z?}W5g;O*30u&=O$P`E*(l4)Eh+GglVd*yt}A=^q-z3R+#qNd2h&E5XBLL{Gcn=0hC z+=Yjaq)Yt|i(SlT>p$-`f*df=eZ0TnariCpYKEU?mU|`^bNafZ%IwkbI0>B(n$9cl zH-dZrI3<+@>a4^TqI{`+{n*yej$Delt@6{PpeMV1<2Gm*l9ulUTUHF6EHDaMXAo$( z{-z^OQ$75FubJ?gmJYII$M3;uzl&fdiRew)qj4C`+xjIJ@B=V<2xnYP}_WEl@ z8AbqbBW6qRCs*%zk$!k^<`|;4 zEZaMGFZO6Ac}RFv|J=hP;pDA0`h=|-Lb{^2iZDgaE={`}X$VsFhx48KwnJq@C%34* zITc|Lf=ji(UJ9?7d0kR89_&vA*;kGo_Yb7ZveQuV)ig>~V1`psVx{BwE@{UJ51vwb zP(lH17huVT(?iizxjxogeY;l?HNW`qLYZ+FXN4Z@Dt4J0i_FbP(}F?PyaO!QgMyQo z^{*;nZ?qqB-pVL>;hKW{_gIu7Y>66kB_DPP=SJ8GyZHNpCql2NWJ*)pa&b`Z*kevF z`&LvdH~Vp9Kl^yEHrOql_9D1SO*lhoDj2GwQJ`sScDSkBw;R%>R+>u<@I0GqR)%!J zsG7P^^2iKY^NgqPB~+OfUwQEaw?ZX)tgx)bXK1S?gj}rFPw1I1L-|6xyzKFPxLm}U z=*MG;OjhJ%ShpMbSpPY1xC5bxe_t_XtYiN}J80W)D5!ci+eaMUaV9tM0q@z^5Yu#P z54g51q=+;zCylF9p)}vKhx%~cJd$hm+pD7<^>pnwf}os)MOuzks!QP41UH&v;JsD2 zw45G$kNfW{O=_ch+n%;vVVhFL$LI6J&qn~($-kJ4r-?_SwB%M7Ot_+hk;|E#wA*97 zg{Zu+qrzgD%FNLOJ2(Y*eNvPgakHo@@FbRoJTX~9SECoBs11*9V!XTZlp=7=M|VBZR2?YOwv2YbwkM6Q&NQr~$S@^mVJ?E!5WE@oneZxcz|Bm^LjH1L?z(t z80A?7Pw2h>mtF{NHapyKR=b>T^uZ}|BL~+K#3>!FLX04XI?f|?C#8BhVQna^iKW>9 zTdvzH-ktamth6`f`7w%xjYUdbr#%l;1=jietj83B_I*FGOX{bLDHv!tg0`>{hT@@lUM{7jTE(h17&v9L+Ois&=ZDTDQ znpBoPkVr~hl`6Rou_JH0h^*2*OcCNRM6AQW^+?Fr6<(IH#^Sm1q8}J5oZLZtzCK>B zkLrlv24#nR_7Ll8f7w2)!kQdO`P5uT zo8(thQ!Vmnx$u`^V_Ycz_tAEnuxE_AdQyZ)336eaKp|x9n+c>Yty?mPL>%Xb=%>1dPG7xRpE~0F6-58b7nq5txnBi3G0Xq6 zEjL>W+=R7{xNu93IE_D&ooEdtsu+;4tTgco*PwB@;SD*Xrt<7p4B*newrm}v+4S~D zlcb@u)xw`xwu=biXI{@X*aGlv&uF#$a=`nA;P6*jeJYy)0VH!GTrJ~E&BcY~?=|!e z10_`uAC!Jv3D;Tz zD0ww2{J#q3Kl>hCDq`-Zx2yQ_#vlIIqQEd~fs%&_HQ!oU{N=kRbti-rbuv;&e+q8h z9soHr?s);3N`LTQi!(A%0K8*u&Ax+z3SOvF0ZLl8?^FLy9sYCFUtVknfK=r*Tm}=| z`X-_mXS0w}Qoo0yzPF6g&JxFyKv7g| zlrMtS0yRdC0>n&)H+t3^`&dm1=m)B6)S1qS11{LtEx~Q^Ru;mK@@#g>uS?GHvHZYqPo!h%S4 z2o_0z%|H`sz6k>m$kAD@b5*3H&?zD4-rAXz;tbs!0*dK~3TD3pLMfb)+y*DGd%aWr zH$v$5`twCQ2M+JBm8{PPjyfWKi9RJ?-4KJ+G!_im1h@duQlb0`*}nr7-#-~&h1At&%;?ZJ1H8!6{HoHLcRvv;2(%J(9T>uW%$KL*ExnMyV17qTxjued6^;H1? zkF&`too`b9ksP{}x0Rei#}Nd*_RFH3fzVnQHMVZ`f5S2VI{TajTHqhk!2N^*1_h%6 z6B@wX(7jAh$pda^U`$I&Dp2+QkCZe(yJXN)J>ty*If&6E@ekfBSfEG^kS-pb6*=<) zU~|deUdaBQtveGvhZ?wa+msRoT+xZapbzCAmZcNA-B>u zRyH!lVRe&6V~f9QJx*vs3S%ZvnC r)`3RAT=&U55E7);|6fJ=k-uC!^;F5N%e58DfRDvt>qA8c-7fwc5fFRR literal 0 HcmV?d00001 diff --git a/examples/multimodal/clip_converter.py b/examples/multimodal/clip_converter.py new file mode 100644 index 0000000..35c8b23 --- /dev/null +++ b/examples/multimodal/clip_converter.py @@ -0,0 +1,155 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import argparse +import os + +import clip +import torch + + +def convert(download_root, output_path, tensor_parallel_size, use_te_layernorm_linear): + device = "cuda" + + model, _ = clip.load("ViT-L/14@336px", device=device, download_root=download_root) + + state_dict = model.state_dict() + new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)] + + # Indices from mapping pytorch multihead attention to megatron. + kv_channels = 64 + hidden_dim = 1024 + num_heads = 16 + indices = [] + for i in range(num_heads): + lb = i * kv_channels + ub = (i + 1) * kv_channels + indices.append(torch.arange(lb, ub, dtype=torch.int)) + indices.append(torch.arange(hidden_dim + lb, hidden_dim + ub, dtype=torch.int)) + indices.append(torch.arange(2 * hidden_dim + lb, 2 * hidden_dim + ub, dtype=torch.int)) + + indices = torch.cat(indices) + + for name, tensor in state_dict.items(): + # Skip text model. + if "visual" not in name: + continue + + # Skip final layers not used in our model. + if name == "visual.proj" or "ln_post" in name: + continue + + # Map parameter names to ones used in megatron. + new_name = "" + new_tensor = tensor + if new_tensor.dtype == torch.float16: + new_tensor = new_tensor.to(torch.float32) + + # This is used for chunking some tensors to target tensor parallel size. + chunk_dim = None + + if "class_embedding" in name: + new_name = "class_token" + # Our model uses class token that is expanded to input dimensions already. + new_tensor = new_tensor.expand(1, 1, -1) + elif "positional_embedding" in name: + new_name = "position_embeddings.weight" + elif "conv1" in name: + new_name = "conv1.weight" + elif "ln_pre.weight" in name: + new_name = "ln_pre.weight" + elif "ln_pre.bias" in name: + new_name = "ln_pre.bias" + elif "transformer.resblocks" in name: + layer_idx = name.split(".")[3] + base = f"decoder.layers.{layer_idx}" + + if "attn.in_proj_weight" in name: + new_name = f"{base}.self_attention.linear_qkv.weight" + new_tensor = new_tensor[indices] + chunk_dim = 0 + elif "attn.in_proj_bias" in name: + new_name = f"{base}.self_attention.linear_qkv.bias" + new_tensor = new_tensor[indices] + chunk_dim = 0 + elif "attn.out_proj.weight" in name: + new_name = f"{base}.self_attention.linear_proj.weight" + chunk_dim = 1 + elif "attn.out_proj.bias" in name: + new_name = f"{base}.self_attention.linear_proj.bias" + elif "ln_1.weight" in name: + new_name = f"{base}.input_layernorm.weight" + if use_te_layernorm_linear: + new_name = f"{base}.self_attention.linear_qkv.layer_norm_weight" + elif "ln_1.bias" in name: + new_name = f"{base}.input_layernorm.bias" + if use_te_layernorm_linear: + new_name = f"{base}.self_attention.linear_qkv.layer_norm_bias" + elif "mlp.c_fc.weight" in name: + new_name = f"{base}.mlp.linear_fc1.weight" + chunk_dim = 0 + elif "mlp.c_fc.bias" in name: + new_name = f"{base}.mlp.linear_fc1.bias" + chunk_dim = 0 + elif "mlp.c_proj.weight" in name: + new_name = f"{base}.mlp.linear_fc2.weight" + chunk_dim = 1 + elif "mlp.c_proj.bias" in name: + new_name = f"{base}.mlp.linear_fc2.bias" + elif "ln_2.weight" in name: + new_name = f"{base}.pre_mlp_layernorm.weight" + if use_te_layernorm_linear: + new_name = f"{base}.mlp.linear_fc1.layer_norm_weight" + elif "ln_2.bias" in name: + new_name = f"{base}.pre_mlp_layernorm.bias" + if use_te_layernorm_linear: + new_name = f"{base}.mlp.linear_fc1.layer_norm_bias" + + assert new_name != "", f"unexpected layer name {name}" + + if chunk_dim is None: + new_tensors = [new_tensor for _ in range(tensor_parallel_size)] + else: + new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim) + + for i in range(tensor_parallel_size): + # chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage. + new_state_dicts[i]["model"][new_name] = new_tensors[i].clone() + + for i in range(tensor_parallel_size): + output_path_tp = os.path.join(output_path, f"state_dict_tp_{i}.pt") + torch.save(new_state_dicts[i], output_path_tp) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=""" +Convert OpenAI CLIP VIT weights to megatron format. + + +Example usage: +python clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 4 +""", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + parser.add_argument( + "--download-root", type=str, required=True, help="Download folder for OpenAI CLIP weights", + ) + parser.add_argument( + "--output", type=str, required=True, help="output directory for megatron state dict file(s)" + ) + parser.add_argument( + "--tensor-parallel-size", type=int, default=1, help="model tensor parallel size", + ) + parser.add_argument( + "--use-te-layernorm-linear", + action="store_true", + help="Use Transformer Engine's LayerNormLinear", + ) + + args = parser.parse_args() + + convert( + args.download_root, args.output, args.tensor_parallel_size, args.use_te_layernorm_linear + ) + + print("done.") diff --git a/examples/multimodal/combine_mistral_clip.sh b/examples/multimodal/combine_mistral_clip.sh new file mode 100644 index 0000000..3527341 --- /dev/null +++ b/examples/multimodal/combine_mistral_clip.sh @@ -0,0 +1,21 @@ + +MCORE_MISTRAL= +MCORE_CLIP= +OUTPUT_DIR= + +python examples/multimodal/combine_state_dicts.py \ + --input \ + ${MCORE_MISTRAL}/iter_0000001/mp_rank_00/model_optim_rng.pt \ + ${MCORE_CLIP}/iter_0000001/mp_rank_00/model_optim_rng.pt \ + ${MCORE_MISTRAL}/iter_0000001/mp_rank_01/model_optim_rng.pt \ + ${MCORE_CLIP}/iter_0000001/mp_rank_01/model_optim_rng.pt \ + ${MCORE_MISTRAL}/iter_0000001/mp_rank_02/model_optim_rng.pt \ + ${MCORE_CLIP}/vit-mcore-336px-tp4/iter_0000001/mp_rank_02/model_optim_rng.pt \ + ${MCORE_MISTRAL}/iter_0000001/mp_rank_03/model_optim_rng.pt \ + ${MCORE_CLIP}/iter_0000001/mp_rank_03/model_optim_rng.pt \ + --prefixes language_model vision_model language_model vision_model language_model vision_model language_model vision_model \ + --output \ + ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_00/model_optim_rng.pt \ + ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_01/model_optim_rng.pt \ + ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_02/model_optim_rng.pt \ + ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_03/model_optim_rng.pt \ No newline at end of file diff --git a/examples/multimodal/combine_state_dicts.py b/examples/multimodal/combine_state_dicts.py new file mode 100644 index 0000000..2f70284 --- /dev/null +++ b/examples/multimodal/combine_state_dicts.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import argparse +import os +import sys + +import torch + +# Add megatron to the path. +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) +) + + +def combine(input_files, module_prefixes, output_files): + num_inputs_per_output = int(len(input_files) / len(output_files)) + + for output_idx, output_file in enumerate(output_files): + combined_state_dict = None + + lb = output_idx * num_inputs_per_output + ub = (output_idx + 1) * num_inputs_per_output + current_input_files = input_files[lb:ub] + current_module_prefixes = module_prefixes[lb:ub] + + for i, (input_file, module_prefix) in enumerate( + zip(current_input_files, current_module_prefixes) + ): + # initialize the combined state dict using the first provided input file + current_state_dict = torch.load(input_file) + if i == 0: + combined_state_dict = current_state_dict.copy() + combined_state_dict["model"] = dict() + + # copy model state dict and prefix names with the given module keys. + for k, v in current_state_dict["model"].items(): + combined_state_dict["model"]["%s.%s" % (module_prefix, k)] = v + + output_dir = os.path.dirname(output_file) + if not os.path.exists(output_dir): + os.makedirs(output_dir, exist_ok=True) + torch.save(combined_state_dict, output_file) + print("saved:", output_file) + + print("done.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=""" + Combine multiple state dicts into a single state dict. + The combined state dict is first initialized by taking a copy of the first provided input state dict. + To avoid conflicts in model parameter names, a prefix must be provided for each input file. + Model parameter names will be renamed from to .. + + + Example usage: + python combine_state_dicts.py --input language_model.pt vision_model.pt --prefixes language_model vision_model --output multimodal.pt + """, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--input", nargs="*", required=True, help="paths to input state dict files") + parser.add_argument( + "--prefixes", + nargs="*", + required=True, + help="prefixes to use with each input model's parameters", + ) + parser.add_argument( + "--output", nargs="*", required=True, help="path(s) to output state dict file" + ) + + args = parser.parse_args() + + assert len(args.input) > 1, "must provide more than 1 input model to combine" + assert len(args.input) == len(args.prefixes), "each input model must have a corresponding key" + assert ( + len(args.input) % len(args.output) == 0 + ), "each output file must use the same number of input files" + + combine(args.input, args.prefixes, args.output) diff --git a/examples/multimodal/config.py b/examples/multimodal/config.py new file mode 100644 index 0000000..482c605 --- /dev/null +++ b/examples/multimodal/config.py @@ -0,0 +1,107 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import torch + +from megatron.training.activations import quick_gelu, squared_relu + + +def get_language_model_config(config): + if config.language_model_type == "2b": + config.add_bias_linear = False + config.bias_activation_fusion = False + config.gated_linear_unit = True + config.apply_query_key_layer_scaling = True + config.layernorm_zero_centered_gamma = True + config.bias_dropout_fusion = False + config.rotary_percent = 0.5 + config.apply_rope_fusion = False + config.attention_softmax_in_fp32 = True + elif config.language_model_type == "8b": + config.add_bias_linear = False + config.bias_activation_fusion = False + config.gated_linear_unit = False + config.apply_query_key_layer_scaling = True + config.layernorm_zero_centered_gamma = True + config.bias_dropout_fusion = False + config.rotary_percent = 0.5 + config.attention_dropout = 0.0 + config.apply_rope_fusion = False + config.activation_func = squared_relu + config.ffn_hidden_size = 16384 + config.masked_softmax_fusion = True + config.attention_softmax_in_fp32 = True + config.num_query_groups = 32 + config.kv_channels = 128 + config.rotary_interleaved = False + elif config.language_model_type == "llama3_8b": + config.activation_func = torch.nn.functional.silu + config.add_bias_linear = False + config.bias_activation_fusion = False + config.gated_linear_unit = True + config.apply_query_key_layer_scaling = True + config.layernorm_zero_centered_gamma = ( + False # Zero centered gamma not supported for RMSNorm + ) + config.bias_dropout_fusion = False + config.apply_rope_fusion = False + config.attention_softmax_in_fp32 = True + config.ffn_hidden_size = 14336 + elif config.language_model_type == "mistral_7b": + config.activation_func = torch.nn.functional.silu + config.add_bias_linear = False + config.bias_activation_fusion = False + config.gated_linear_unit = True + config.apply_query_key_layer_scaling = False + config.layernorm_zero_centered_gamma = ( + False # Zero centered gamma not supported for RMSNorm + ) + config.bias_dropout_fusion = False + config.apply_rope_fusion = False + config.attention_softmax_in_fp32 = True + config.ffn_hidden_size = 14336 + + return config + + +def get_vision_model_config(config, apply_query_key_layer_scaling=False): + config.num_layers = 24 + config.num_attention_heads = 16 + config.add_bias_linear = True + config.add_qkv_bias = True + config.hidden_size = 1024 + config.hidden_dropout = 0.0 + config.attention_dropout = 0.0 + config.ffn_hidden_size = 4096 + config.gated_linear_unit = False + config.activation_func = quick_gelu + config.kv_channels = 64 + config.num_attention_heads = 16 + config.num_query_groups = 16 + config.layernorm_zero_centered_gamma = False + config.apply_query_key_layer_scaling = apply_query_key_layer_scaling + config.bias_activation_fusion = False + config.bias_dropout_fusion = False + config.attention_softmax_in_fp32 = True + config.normalization = 'LayerNorm' + + return config + + +def get_vision_projection_config(config, hidden_size): + config.gated_linear_unit = False + config.bias_activation_fusion = False + config.add_bias_linear = False + config.hidden_size = hidden_size + if config.language_model_type == "2b": + config.ffn_hidden_size = 5440 + config.activation_func = torch.nn.functional.gelu + if config.language_model_type == "8b": + config.ffn_hidden_size = 16384 + config.activation_func = squared_relu + elif config.language_model_type == "llama3_8b": + config.ffn_hidden_size = 14336 + config.activation_func = torch.nn.functional.silu + elif config.language_model_type == "mistral_7b": + config.ffn_hidden_size = 14336 + config.activation_func = torch.nn.functional.silu + + return config diff --git a/examples/multimodal/convert_llava_pretrain_to_wds.py b/examples/multimodal/convert_llava_pretrain_to_wds.py new file mode 100644 index 0000000..0092aef --- /dev/null +++ b/examples/multimodal/convert_llava_pretrain_to_wds.py @@ -0,0 +1,31 @@ +import json +import os +import webdataset as wds + +from tqdm import tqdm + +llava_pretrain_dir = '' + +# Paths to the dataset files +json_file = os.path.join(llava_pretrain_dir, 'blip_laion_cc_sbu_558k.json') +output = os.path.join(llava_pretrain_dir, 'wds') + +if not os.path.exists(output): + os.mkdir(output) + +# Load data +with open(json_file, 'r') as f: + data = json.load(f) + +with wds.ShardWriter(os.path.join(output, 'pretrain-%d.tar'), maxcount=10000) as shard_writer: + for entry in tqdm(data): + with open(os.path.join(llava_pretrain_dir, entry['image']), "rb") as img_file: + image_data = img_file.read() + sample = { + "__key__": entry['id'], + "jpg": image_data, + "json": json.dumps(entry['conversations']).encode("utf-8"), + } + shard_writer.write(sample) + +print(f"Dataset successfully converted to wds") diff --git a/examples/multimodal/dataloader_provider.py b/examples/multimodal/dataloader_provider.py new file mode 100644 index 0000000..5fcdb45 --- /dev/null +++ b/examples/multimodal/dataloader_provider.py @@ -0,0 +1,131 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import torch +from dataset_helpers import TaskEncoder, print_error_handler + +from megatron.core import mpu +from megatron.energon import ( + LimitDataset, + RepeatDataset, + WorkerConfig, + get_loader, + get_savable_loader, + get_train_dataset, + get_val_datasets, +) +from megatron.training import get_args, get_num_microbatches, print_rank_0 +from megatron.training.checkpointing import get_checkpoint_name + + +def datasets_provider(worker_config=None): + """Create multimodal train, validation and test datasets.""" + args = get_args() + dname = args.data_path[0] if type(args.data_path) is list else args.data_path + train_dataset = get_train_dataset( + dname, + batch_size=args.micro_batch_size, + task_encoder=TaskEncoder(), + worker_config=worker_config, + virtual_epoch_length=1000, + max_samples_per_sequence=100, + shuffle_buffer_size=100, + handler=print_error_handler, + image_decode="pil", + ) + + val_datasets = get_val_datasets( + dname, + batch_size=args.micro_batch_size, + # This is the total number over all workers + # limit=args.eval_iters * get_num_microbatches(), + task_encoder=TaskEncoder(), + worker_config=worker_config, + handler=print_error_handler, + image_decode="pil", + ) + val_datasets_without_source_datasets = [ + # Limit the dataset to eval_iters * num_microbatches + LimitDataset( + # Repeat the inner dataset in case it's too short + RepeatDataset(val_ds, worker_config=worker_config), + length=args.eval_iters * get_num_microbatches(), + worker_config=worker_config, + reset_after_epoch=True, + ) + for val_ds, _src_ds in val_datasets + ] + + return train_dataset, val_datasets_without_source_datasets, None + + +def train_valid_test_dataloaders_provider(train_val_test_num_samples): + """Build multimodal train, validation and test dataloaders.""" + args = get_args() + + worker_debug_path = None + worker_log_level = 0 + + rank = mpu.get_data_parallel_rank() + world_size = mpu.get_data_parallel_world_size() + data_parallel_group = mpu.get_data_parallel_group() + + worker_config = WorkerConfig( + rank=rank, + world_size=world_size, + num_workers=args.num_workers, + data_parallel_group=data_parallel_group, + worker_debug_path=worker_debug_path, + worker_log_level=worker_log_level, + ) + train_ds, valid_ds1, test_ds = datasets_provider(worker_config) + + train_dataloader = get_savable_loader(train_ds, worker_config=worker_config) + if args.load is not None: + if hasattr(args, "dataloader_path"): + dp_rank = ( + mpu.get_data_parallel_rank() + if torch.distributed.is_initialized() + else 0 + ) + data_save_name = get_checkpoint_name( + args.dataloader_path, + args.iteration, + save_basename=f"train_dataloader_dprank{dp_rank:03d}.pt", + ) + try: + dataset_state_dict = torch.load( + data_save_name, map_location="cpu" + ) + if ( + "dataset_state_dict" in dataset_state_dict.keys() + and dataset_state_dict["train_data_path"] + != args.train_data_path + ): + print_rank_0( + f"Not restoring dataset state from {data_save_name}, path to dataset changed from {dataset_state_dict['train_data_path']} to {args.train_data_path}" + ) + else: + train_dataloader.restore_state_rank( + dataset_state_dict["dataloader_state_dict"] + ) + print_rank_0( + f"restoring dataset state from {data_save_name}" + ) + except Exception as e: + print_rank_0( + "loading dataloader checkpoint failed. Skipping. " + str(e) + ) + + valid_dataloader = [ + iter(cyclic_iter(get_loader(valid_ds, worker_config=worker_config))) + for valid_ds in valid_ds1 + ] + test_dataloader = None + + return iter(cyclic_iter(train_dataloader)), valid_dataloader, iter(cyclic_iter(test_dataloader)) + + + +def cyclic_iter(iter): + while True: + for x in iter: + yield x diff --git a/examples/multimodal/dataset_helpers.py b/examples/multimodal/dataset_helpers.py new file mode 100644 index 0000000..8354841 --- /dev/null +++ b/examples/multimodal/dataset_helpers.py @@ -0,0 +1,521 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import dataclasses +import json +import random +import re +import sys +import traceback +from dataclasses import dataclass +from typing import Any, List, Dict, Optional, Tuple, Union + +import numpy as np +import torch +from PIL import Image, ImageDraw +from torchvision import transforms as T +from torchvision.transforms import Compose, RandAugment, RandomResizedCrop, Resize, ToPILImage + +from megatron.core import mpu +from megatron.energon import Batch, CaptioningSample, DefaultTaskEncoder, OCRSample, VQASample +from megatron.energon.transforms import CustomTransform, MergeTransform +from megatron.training import get_args +from megatron.training.tokenizer import build_tokenizer + +try: + from torchvision.transforms import InterpolationMode + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + + +# Imagenet's mean and std. +pixel_mean = [123.675, 116.28, 103.53] +pixel_std = [58.395, 57.12, 57.375] + + +def convert_to_rgb(image): + return image.convert("RGB") + +def _transform_train(img_h, img_w): + return Compose([ + ToPILImage(), + RandomResizedCrop((img_h, img_w), scale=(0.5, 1.0)), + convert_to_rgb, + ]) + +def _transform_train_aug(img_h, img_w): + return Compose([ + ToPILImage(), + RandomResizedCrop((img_h, img_w), scale=(0.5, 1.0)), + convert_to_rgb, + RandAugment(2, 5, isPIL=True, augs=['Identity', 'AutoContrast', 'Brightness', 'Sharpness', 'Equalize', + 'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate']), + ]) + +def _transform_test(img_h, img_w): + return Compose([ + ToPILImage(), + Resize((img_h, img_w)), + convert_to_rgb, + ]) + +class RandomResize(CustomTransform): + """Resizes the image by a random scale factor in the given interval, but at most max_size""" + + def __init__(self, min_scale: float, max_scale: float, max_size: int): + self._min_scale = min_scale + self._max_scale = max_scale + self._max_size = max_size + + def apply_transform(self, matrix: np.ndarray, dst_size: np.ndarray) -> Tuple[Any, Any, Any]: + scale = random.uniform(self._min_scale, self._max_scale) + new_size = tuple(int(x * scale) for x in dst_size) + + if max(new_size) > self._max_size: + scale = self._max_size / max(new_size) + new_size = tuple(int(x * scale) for x in dst_size) + + matrix = self.scale(scale, scale) @ matrix + dst_size = np.array(new_size, dtype=dst_size.dtype) + + return matrix, dst_size, (self.__class__.__name__, scale) + + +class RandomResizeLongEdge(CustomTransform): + """Resizes the image's longer edge to a random length between min_size and max_size pixels.""" + + def __init__(self, min_size: int, max_size: int): + self._min_size = min_size + self._max_size = max_size + + def apply_transform(self, matrix: np.ndarray, dst_size: np.ndarray) -> Tuple[Any, Any, Any]: + new_long = random.randint(self._min_size, self._max_size) + if dst_size[0] > dst_size[1]: # h > w + new_w, new_h = int(new_long * dst_size[1] / dst_size[0]), new_long + else: # w > h + new_w, new_h = new_long, int(new_long * dst_size[0] / dst_size[1]) + + new_size = (new_h, new_w) + matrix = self.scale(new_w / dst_size[1], new_h / dst_size[0]) @ matrix + dst_size = np.array(new_size, dtype=dst_size.dtype) + + return matrix, dst_size, (self.__class__.__name__, new_size) + + +class RandomPad(CustomTransform): + """Pads the image to the given size, randomly choosing the position of the image within the new larger image. + If the image is already larger than the given size, it will not be padded in that direction(s).""" + + def __init__(self, size: Tuple[int, int]): + self._new_size = size # h, w + + def apply_transform(self, matrix: np.ndarray, dst_size: np.ndarray) -> Tuple[Any, Any, Any]: + h_pad = max(self._new_size[0] - dst_size[0], 0) + w_pad = max(self._new_size[1] - dst_size[1], 0) + + if h_pad == 0 and w_pad == 0: + return matrix, dst_size, (self.__class__.__name__, None) + else: + # TODO: fix me + # top = random.randint(0, h_pad) + # left = random.randint(0, w_pad) + top = 0 + left = 0 + + matrix = self.translate(left, top) @ matrix + dst_size = np.array(self._new_size, dtype=dst_size.dtype) + return matrix, dst_size, (self.__class__.__name__, (top, left)) + + +def _get_ocr_document_visual_transform(IMG_H=1024, IMG_W=1024): + document_visual_transform = T.Compose( + [ + MergeTransform( + [ + # T.RandomResizedCrop(size=FINAL_SIZE, scale=(0.5, 1.0), ratio=(0.8, 1.2)), + RandomResizeLongEdge(960, 1008), # Note: 1008 comes from list(range(960, 1024, 16))[-1] + T.RandomRotation(5, interpolation=T.InterpolationMode.BILINEAR), + T.RandomPerspective(distortion_scale=0.1, p=0.1), + RandomPad((IMG_H, IMG_W)), + ] + ), + T.ColorJitter(brightness=(0.8, 1.2), contrast=(0.7, 1.0)), + T.RandomGrayscale(p=0.5), + T.RandomInvert(p=0.5), + T.RandomAdjustSharpness(sharpness_factor=0.0, p=0.5), + T.RandomAdjustSharpness(sharpness_factor=2.0, p=0.5), + # LogImage(), + # T.ToTensor(), + # T.Normalize(IMAGE_MEAN, IMAGE_STD), + ] + ) + return document_visual_transform + +def _get_ocr_document_identity_transform(IMG_H=1024, IMG_W=1024): + long_edge = max(IMG_H, IMG_W) + document_identity_transform = T.Compose( + [ + MergeTransform( + [ + RandomResizeLongEdge(long_edge, long_edge), + RandomPad((long_edge, long_edge)), + ] + ) + ] + ) + return document_identity_transform + +def _get_ocr_paragraph_visual_transform(IMG_H=1024, IMG_W=1024): + paragraph_visual_transform = T.Compose( + [ + MergeTransform( + [ + # T.RandomResizedCrop(size=FINAL_SIZE, scale=(0.5, 1.0), ratio=(0.8, 1.2)), + RandomResize(0.5, 2.0, min(IMG_H, IMG_W)), #FINAL_SIZE), + T.RandomRotation(1, interpolation=T.InterpolationMode.BILINEAR), + T.RandomPerspective(distortion_scale=0.1, p=0.1), + RandomPad((IMG_H, IMG_W)), + ] + ), + T.ColorJitter(brightness=(0.8, 1.2), contrast=(0.7, 1.0)), + T.RandomGrayscale(p=0.5), + T.RandomInvert(p=0.5), + # T.RandomAdjustSharpness(sharpness_factor=0.0, p=0.5), + # T.RandomAdjustSharpness(sharpness_factor=2.0, p=0.5), + # LogImage(), + # T.ToTensor(), + # T.Normalize(IMAGE_MEAN, IMAGE_STD), + ] + ) + return paragraph_visual_transform + +# Type for intermediate batch, after batch() +@dataclass +class ImageTaskSample: + __key__: str + __subflavors__: Dict + # (c, h, w) + img: torch.Tensor + text: np.ndarray + prompt_len: np.int64 + img_clip: Optional[torch.Tensor] = None + + +# Typing for the resulting batch data after encode_batch() +@dataclass +class ImageTaskBatch(Batch): + __keys__: List[str] + __subflavors__: List[Dict] + # (n, c, h, w) + img: torch.Tensor + # (n, seq_len) + text: torch.Tensor + # (n, 1) + prompt_len: torch.Tensor + # (n, c, h, w) + img_clip: Optional[torch.Tensor] = None + + +class IdentitySplitter(object): + def tokenize(self, *text): + return text + + +class Tokenizer: + def __init__(self): + + args = get_args() + self.args = args + + self.IMAGE_TOKEN_INDEX = -200 + self.initializer() + + def initializer(self): + # Use Encoder class as a container for global data + Tokenizer.tokenizer = build_tokenizer(self.args) + if hasattr(Tokenizer.tokenizer, 'eod'): + self.eod_token = Tokenizer.tokenizer.eod + elif hasattr(Tokenizer.tokenizer, 'eos_id'): + self.eod_token = Tokenizer.tokenizer.eos_id + else: + raise AttributeError('No eod token found in Tokenizer') + self.split_token = 313131 + + if ( + hasattr(self.args, "split_sentences") and self.args.split_sentences + ): # default false + if not nltk_available: + print("NLTK is not available to split sentences.") + exit() + library = "tokenizers/punkt/{}.pickle".format("english") + # print("loading: " + library) + splitter = nltk.load(library) + if self.args.keep_newlines: + # this prevents punkt from eating newlines after sentences + Tokenizer.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer( + train_text=splitter._params, lang_vars=CustomLanguageVars() + ) + else: + Tokenizer.splitter = splitter + else: + Tokenizer.splitter = IdentitySplitter() + + def __call__(self, text: str, padded: bool = True): # -> torch.Tensor: + sentence = Tokenizer.splitter.tokenize(text)[0] + sentence = Tokenizer.tokenizer.tokenize(sentence) + return sentence + + def pad(self, content, seq_len=1024): + out = np.pad(content, pad_width=(0,max(0,seq_len-len(content))), mode='constant', constant_values=self.eod_token) + + return out + + +class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatch, dict]): + """A simple task encoder for captioning.""" + + def __init__( + self + ): + # Specify the batch_type for default batching (batching is performed here "manually" by + # overwriting the `batch` method) + super().__init__() + + self.args = get_args() + + self.tokenizer = Tokenizer() + self.manual_prompts = json.load(open(self.args.prompt_path)) + self.seq_len = self.args.seq_length + + self.txt_to_token_dict = {} + + self.img_h, self.img_w = self.args.img_h, self.args.img_w + + self.pixel_mean = torch.Tensor(pixel_mean).view(-1, 1, 1) + self.pixel_std = torch.Tensor(pixel_std).view(-1, 1, 1) + + self.ocr_document_visual_transform = _get_ocr_document_visual_transform(self.img_h, self.img_w) + self.ocr_document_identity_transform = _get_ocr_document_identity_transform(self.img_h, self.img_w) + self.ocr_paragraph_visual_transform = _get_ocr_paragraph_visual_transform(self.img_h, self.img_w) + + + def get_visual_transform(self, img_sample, sample_augmentation=False): + raw_h, raw_w = img_sample.shape[0], img_sample.shape[1] + ratio = float(max(self.img_h, self.img_w)) / max(raw_h, raw_w) + scaled_h, scaled_w = int(raw_h * ratio + 0.5), int(raw_w * ratio + 0.5) + + # if the sample needs augmentation or not + if sample_augmentation: + # further check if augmentation is a global flag in args + if self.args.aug: + visual_transform = _transform_train_aug(scaled_h, scaled_w) + else: + visual_transform = _transform_train(scaled_h, scaled_w) + else: + visual_transform = _transform_test(scaled_h, scaled_w) + + img = visual_transform(img_sample) + + # Normalize pixel values. + img = (torch.Tensor(np.array(img)).permute(2, 0, 1) - self.pixel_mean) / self.pixel_std + + # Pad to target image size. + delta_h, delta_w = self.img_h - scaled_h, self.img_w - scaled_w + img = torch.nn.functional.pad(img, (0, delta_w, 0, delta_h)) + + return img + + def encode_sample(self, sample: Union[ + CaptioningSample, OCRSample, VQASample] + ): + + if isinstance(sample, OCRSample): + yield self.encode_ocr(sample) + + elif isinstance(sample, CaptioningSample): + yield self.encode_captioning(sample) + + elif isinstance(sample, VQASample): + yield self.encode_vqa(sample) + + else: + raise NotImplementedError('Sample format not supported') + yield None + + def encode_captioning(self, sample: CaptioningSample): + sample_augmentation = sample.__subflavors__["augmentation"] == True + + img = self.get_visual_transform(np.array(sample.image), sample_augmentation=sample_augmentation) + + # randomly select a prompt + if 'CaptioningDetailed' in sample.__subflavors__["type"]: + prompt_idx = np.random.randint(len(self.manual_prompts["CaptioningDetailed"]["raw"])) + cur_prompt = self.manual_prompts["CaptioningDetailed"]["raw"][prompt_idx] + else: + prompt_idx = np.random.randint(len(self.manual_prompts["Captioning"]["raw"])) + cur_prompt = self.manual_prompts["Captioning"]["raw"][prompt_idx] + + if cur_prompt not in self.txt_to_token_dict: + self.txt_to_token_dict[cur_prompt] = self.tokenizer(cur_prompt) + cur_prompt = self.txt_to_token_dict[cur_prompt] + + prompt_len = len(cur_prompt) + + caption = sample.caption + if 'SplitByLine' in sample.__subflavors__["type"]: + # caption = re.sub(r"\n+", "\n", caption) + caption_list = caption.split('\n') + caption_list = [caption for caption in caption_list if caption.strip() != ''] + caption = np.random.choice(caption_list) + caption_token = self.tokenizer(caption.strip()) + + if len(caption.strip()) == 0: + raise RuntimeError('Empty string in caption!') + + seq_len = self.seq_len + 4 + text_sample = np.concatenate([[self.tokenizer.IMAGE_TOKEN_INDEX], cur_prompt, caption_token]) + text_sample = self.tokenizer.pad(text_sample, seq_len) + text_sample = text_sample[:seq_len] + + return ImageTaskSample( + __key__=sample.__key__, + __subflavors__=sample.__subflavors__, + img=img, + text=text_sample, + prompt_len=prompt_len + ) + + def encode_vqa(self, sample: VQASample): + task_name = None + + no_image_flag = True if '-noimage' in sample.__key__ else False + + if 'pretrain' in sample.__key__: + task_name = 'pretrain' + else: + task_name = sample.__key__.split("/")[0] + + sample_augmentation = sample.__subflavors__["augmentation"] == True + + if no_image_flag: + img = torch.from_numpy(np.array([0]).astype(np.float32)) + else: + img = self.get_visual_transform(np.array(sample.image), sample_augmentation=sample_augmentation) + + if "" in sample.context: + sample.context = sample.context.replace("","") + + if task_name != 'pretrain' and sample.context[-1:] != "\n": + sample.context = sample.context + "\n" + + question = sample.context + + if isinstance(sample.answers, list): + answer_list = sample.answers + weight_list = np.array(sample.answer_weights).astype(np.float32) + weight_list = weight_list / np.sum(weight_list) + answer_idx = np.random.choice(weight_list.shape[0], 1, p=weight_list)[0] + answer = answer_list[answer_idx] + else: + answer = sample.answers + + question_token = self.tokenizer.tokenizer.instruct_tokenize(question) + answer_token = self.tokenizer(answer) + + prompt_len = len(question_token) + + seq_len = self.seq_len + 4 + + text_sample = np.concatenate([[self.tokenizer.IMAGE_TOKEN_INDEX], question_token, answer_token]) + text_sample = self.tokenizer.pad(text_sample, seq_len) + + return ImageTaskSample( + __key__=sample.__key__, + __subflavors__=sample.__subflavors__, + img=img, + text=text_sample, + prompt_len=prompt_len + ) + + def encode_ocr(self, sample: OCRSample) -> ImageTaskSample: + if sample.__subflavors__["type"] == "document": + visual_transform = self.ocr_document_visual_transform + elif sample.__subflavors__["type"] == "paragraph": + visual_transform = self.ocr_paragraph_visual_transform + elif sample.__subflavors__["augmentation"] == False: + visual_transform = self.ocr_document_identity_transform + else: + raise ValueError(f"Unknown subflavor {sample.__subflavors__}") + + if sample.words_boxes is not None and sample.words_boxes.shape[1] >= 5: + # Boxes with conf below 0.9 are skipped + filter_words_mask = sample.words_boxes[:, 4] < 0.9 + filter_boxes = sample.words_boxes[filter_words_mask, :4] + for x, y, x2, y2 in filter_boxes: + if isinstance(sample.image, Image.Image): + draw = ImageDraw.Draw(sample.image) + draw.rectangle([int(x), int(y), (int(x2), int(y2))], fill=0) + else: + sample.image[:, int(y) : int(y2) + 1, int(x) : int(x2) + 1] = 0 + + text = " ".join( + text for skip, text in zip(filter_words_mask, sample.words_text) if not skip + ) + else: + text = " ".join(sample.text.splitlines()) + + match = re.search(r'"text_sequence": "(.*?)"', text) + if match: + text = match.group(1) + + img = visual_transform(sample.image) + img_clip = None + img = (torch.Tensor(np.array(img)).permute(2, 0, 1) - self.pixel_mean) / self.pixel_std + img = torch.nn.functional.pad(img, (0, self.img_w - img.shape[2], 0, self.img_h - img.shape[1])) + + # randomly select a prompt + prompt_idx = np.random.randint(len(self.manual_prompts["OCR"]["raw"])) + cur_prompt = self.manual_prompts["OCR"]["raw"][prompt_idx] + + if cur_prompt not in self.txt_to_token_dict: + self.txt_to_token_dict[cur_prompt] = self.tokenizer(cur_prompt) + cur_prompt = self.txt_to_token_dict[cur_prompt] + + text_sample = self.tokenizer(text) + prompt_len = len(cur_prompt) + seq_len = self.seq_len + 4 + text_sample = np.concatenate([cur_prompt, text_sample]) + text_sample = self.tokenizer.pad(text_sample, seq_len=seq_len) + text_sample = text_sample[:seq_len] + + return ImageTaskSample( + __key__=sample.__key__, + __subflavors__=sample.__subflavors__, + img=img, + img_clip=img_clip, + text=text_sample, + prompt_len=prompt_len + ) + + def batch(self, samples: List[ImageTaskSample]) -> ImageTaskBatch: + batch = ImageTaskBatch( + __keys__=[s.__key__ for s in samples], + __subflavors__=[s.__subflavors__ for s in samples], + img=torch.stack([s.img for s in samples]), + text=torch.from_numpy(np.stack([s.text for s in samples], axis=0).astype(np.int64)), + prompt_len=torch.from_numpy(np.array([s.prompt_len for s in samples], dtype=np.int64)) + ) + + return batch + + def encode_batch(self, batch: ImageTaskBatch) -> dict: + raw = dataclasses.asdict(batch) + del raw["__subflavors__"] + return raw + + +def print_error_handler(exc: Exception, key: Optional[str]): + print( + f"The following exception occurred in the dataloader for sample {key} and is skipped", + file=sys.stderr, + ) + traceback.print_exc() diff --git a/examples/multimodal/evaluate_coco.py b/examples/multimodal/evaluate_coco.py new file mode 100644 index 0000000..501a5df --- /dev/null +++ b/examples/multimodal/evaluate_coco.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import argparse +import glob +import json + +from pycocoevalcap.eval import COCOEvalCap +from pycocotools.coco import COCO + + +def convert_to_coco_format(input_path): + """Convert input files to COCO compatible format.""" + output_file_path = input_path + "-captioning-merged.json" + + pattern = input_path + "-captioning-[0-9].*jsonl" + input_file_paths = glob.glob(pattern) + + captions = [] + + for input_file_path in input_file_paths: + with open(input_file_path, "r") as input_file: + for line in input_file: + res = json.loads(line) + + question_id = res['sample_id'] + caption = res['caption'].rstrip('.').lower() + + captions.append({"image_id": question_id, "caption": caption}) + + with open(output_file_path, "w") as output_file: + json.dump(captions, output_file) + + return output_file_path + + +def coco_captioning_eval(input_path, groundtruth_file): + """Run COCO captioning evaluation.""" + coco = COCO(groundtruth_file) + input_file = convert_to_coco_format(input_path) + coco_result = coco.loadRes(input_file) + + coco_eval = COCOEvalCap(coco, coco_result) + + # Evaluate on the input subset of images. + coco_eval.params['image_id'] = coco_result.getImgIds() + + coco_eval.evaluate() + + for metric, score in coco_eval.eval.items(): + print(metric, score) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input-path", type=str, required=True, help="Path to input file(s)") + parser.add_argument( + "--groundtruth-path", type=str, required=True, help="Path to groundtruth file" + ) + args = parser.parse_args() + + coco_captioning_eval(args.input_path, args.groundtruth_path) diff --git a/examples/multimodal/evaluate_mmmu.py b/examples/multimodal/evaluate_mmmu.py new file mode 100644 index 0000000..1f609fc --- /dev/null +++ b/examples/multimodal/evaluate_mmmu.py @@ -0,0 +1,66 @@ +import argparse +import glob +import json +import subprocess + + +def convert_to_mmmu_format(input_path): + """Convert input files to MMMU compatible format.""" + output_file_path = input_path + "-MMMU-merged.json" + + pattern = input_path + "-MMMU-[0-9].*jsonl" + input_file_paths = glob.glob(pattern) + + output = dict() + + for input_file_path in input_file_paths: + with open(input_file_path, "r") as input_file: + for line in input_file: + res = json.loads(line) + + sample_id = res["sample_id"] + prediction = res["prediction"] + + output[sample_id] = prediction + + with open(output_file_path, "w") as output_file: + json.dump(output, output_file) + + return output_file_path + + +def main(): + # Using the validation groundtruth file from the MMMU repo by default. This assumes you have cloned the MMMU github repo here. + default_groundtruth_path = "examples/multimodal/MMMU/eval/answer_dict_val.json" + + parser = argparse.ArgumentParser() + parser.add_argument("--input-path", type=str, required=True, help="Path to input file(s)") + parser.add_argument( + "--groundtruth-path", + type=str, + default=default_groundtruth_path, + help="Path to groundtruth file. Defaults to the validation file in the MMMU repo.", + ) + args = parser.parse_args() + + result_file = convert_to_mmmu_format(args.input_path) + + # The MMMU repo has a script for running the actual evaluation but no API. So launching the script here. + output = subprocess.run( + [ + "python", + "examples/multimodal/MMMU/eval/main_eval_only.py", + "--output_path", + result_file, + "--answer_path", + default_groundtruth_path, + ], + capture_output=True, + text=True, + ) + + print(output.stdout) + + +if __name__ == "__main__": + main() diff --git a/examples/multimodal/evaluate_textvqa.py b/examples/multimodal/evaluate_textvqa.py new file mode 100644 index 0000000..f8de860 --- /dev/null +++ b/examples/multimodal/evaluate_textvqa.py @@ -0,0 +1,86 @@ +import argparse +import glob +import json +import re + +# This can help resolve an import error of an mmf dependency that is not needed. +try: + from mmf.utils.m4c_evaluators import TextVQAAccuracyEvaluator +except ModuleNotFoundError: + from mmf.utils.m4c_evaluators import TextVQAAccuracyEvaluator + + +def merge_input_files(input_path): + """Merge input files to a format compatible with the evaluator.""" + output_file_path = input_path + "-TextVQA-merged.json" + + pattern = input_path + "-TextVQA-[0-9].*jsonl" + input_file_paths = glob.glob(pattern) + + results = [] + + for input_file_path in input_file_paths: + with open(input_file_path, "r") as input_file: + for line in input_file: + res = json.loads(line) + results.append(res) + + with open(output_file_path, "w") as output_file: + json.dump(results, output_file) + + return output_file_path + + +# Note: This is based on https://github.com/haotian-liu/LLaVA/blob/c121f0432da27facab705978f83c4ada465e46fd/llava/eval/eval_textvqa.py#L17 +# and slightly modified. +def prompt_processor(prompt): + if prompt.startswith('OCR tokens: '): + pattern = r"Question: (.*?) Short answer:" + match = re.search(pattern, prompt, re.DOTALL) + question = match.group(1) + elif "Reference OCR token: " in prompt and len(prompt.split("\n")) == 3: + if prompt.startswith("Reference OCR token:"): + question = prompt.split("\n")[1] + else: + question = prompt.split("\n")[0] + elif len(prompt.split("\n")) == 2: + question = prompt.split("\n")[0] + else: + raise RuntimeError("unexpected prompt format") + + return question.lower() + + +# Note: This is based on https://github.com/haotian-liu/LLaVA/blob/c121f0432da27facab705978f83c4ada465e46fd/llava/eval/eval_textvqa.py#L35 +# and slightly modified. +def evaluate(result_file_path, groundtruth_path): + with open(groundtruth_path) as groundtruth_file: + groundtruth = json.load(groundtruth_file)["data"] + + groundtruth = {(gt["image_id"]): gt["answers"] for gt in groundtruth} + + with open(result_file_path, "r") as result_file: + results = json.load(result_file) + + predictions = [] + for result in results: + gt_answers = groundtruth[(result["sample_id"])] + predictions.append({"pred_answer": result["text"], "gt_answers": gt_answers}) + + evaluator = TextVQAAccuracyEvaluator() + print( + 'Samples: {}\nAccuracy: {:.2f}%\n'.format( + len(predictions), 100.0 * evaluator.eval_pred_list(predictions) + ) + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--input-path', type=str, help="Path to input file(s)") + parser.add_argument('--groundtruth-path', type=str, help="Path to groundtruth file") + args = parser.parse_args() + + result_file_path = merge_input_files(args.input_path) + + evaluate(result_file_path, args.groundtruth_path) diff --git a/examples/multimodal/evaluate_vqav2.py b/examples/multimodal/evaluate_vqav2.py new file mode 100644 index 0000000..6c76782 --- /dev/null +++ b/examples/multimodal/evaluate_vqav2.py @@ -0,0 +1,41 @@ +import argparse +import glob +import json + +from open_flamingo.eval.vqa_metric import compute_vqa_accuracy + + +def merge_input_files(input_path): + """Merge input files to a format compatible with the evaluator.""" + output_file_path = input_path + "-VQAv2-merged.json" + + pattern = input_path + "-VQAv2-[0-9].*jsonl" + input_file_paths = glob.glob(pattern) + + results = [] + + for input_file_path in input_file_paths: + with open(input_file_path, "r") as input_file: + for line in input_file: + res = json.loads(line) + res["question_id"] = res["sample_id"] + + results.append(res) + + with open(output_file_path, "w") as output_file: + json.dump(results, output_file) + + return output_file_path + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--input-path', type=str, help="Path to input file(s)") + parser.add_argument('--groundtruth-path', type=str, help="Path to groundtruth file") + parser.add_argument('--question-path', type=str, help="Path to questions file") + args = parser.parse_args() + + result_file = merge_input_files(args.input_path) + + accuracy = compute_vqa_accuracy(result_file, args.question_path, args.groundtruth_path) + print(accuracy) diff --git a/examples/multimodal/layer_specs.py b/examples/multimodal/layer_specs.py new file mode 100644 index 0000000..ff3754d --- /dev/null +++ b/examples/multimodal/layer_specs.py @@ -0,0 +1,115 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import torch + +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +try: + from megatron.core.transformer.custom_layers.transformer_engine import ( + TEDotProductAttention, + TEColumnParallelLinear, + TELayerNormColumnParallelLinear, + TEColumnParallelLinear, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +try: + import apex + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + import warnings + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + + +class TorchLayerNormWrapper(torch.nn.LayerNorm): + def __init__(self, config, hidden_size, eps): + super().__init__(hidden_size, eps) + + +def get_layer_spec(is_vit=False) -> ModuleSpec: + mlp = get_mlp_module_spec(use_te=False) + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl if not is_vit else TorchLayerNormWrapper, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl if not is_vit else TorchLayerNormWrapper, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + + +def get_layer_spec_te(is_vit=False) -> ModuleSpec: + attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal + + mlp = get_mlp_module_spec_te() + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": attn_mask_type}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + +def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec: + # Dense MLP w/ or w/o TE modules. + return ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, + ), + ) + + +def get_mlp_module_spec_te() -> ModuleSpec: + return ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ) diff --git a/examples/multimodal/manual_prompts.json b/examples/multimodal/manual_prompts.json new file mode 100644 index 0000000..e4bf3e4 --- /dev/null +++ b/examples/multimodal/manual_prompts.json @@ -0,0 +1,29 @@ +{ + "Captioning": { + "raw": [ + "Can you briefly explain what you see in the image?", + "Describe what's happening in this image in one short sentence.", + "Write a short caption that accurately represents the content of this image.", + "Please generate a descriptive caption for the image provided.", + "How would you summarize the scene depicted in the picture in short?" + ] + }, + "OCR": { + "raw": [ + "Can you read the text from image and output here?", + "Extract and document the text from the provided image.", + "Converting the text embedded in this image into a readable document.", + "Transcribe all the text you find.", + "Can you extract all visible text from the image here?" + ] + }, + "VQA": { + "raw": [ + "Given the image, answer the following question with few words.", + "Answer the following question: ", + "What is the answer to this question?", + "Write the answer: ", + "Please answer this question: " + ] + } +} diff --git a/examples/multimodal/pretrain_dataset.yaml b/examples/multimodal/pretrain_dataset.yaml new file mode 100644 index 0000000..f27bccb --- /dev/null +++ b/examples/multimodal/pretrain_dataset.yaml @@ -0,0 +1,15 @@ +__module__: megatron.energon +__class__: Metadataset +splits: + train: + datasets: + - weight: 1. + path: + subflavors: + augmentation: false + val: + datasets: + - weight: 1. + path: + subflavors: + augmentation: false diff --git a/examples/multimodal/pretrain_mistral_clip.sh b/examples/multimodal/pretrain_mistral_clip.sh new file mode 100644 index 0000000..f6dfb60 --- /dev/null +++ b/examples/multimodal/pretrain_mistral_clip.sh @@ -0,0 +1,132 @@ +#!/bin/bash +# Pretrain a multimodal model. + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +MODEL_NAME="mcore-llava-mistral-7b-instruct-clip336-pretraining" + +# Check that the user has set an output path for model checkpoints. +if [[ -z $WORKSPACE ]]; then + echo "Please set WORKSPACE for storing your model checkpoints." + exit 1 +fi + +SOURCE=`pwd` +OUTPUT_BASE="${WORKSPACE}/output" +OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}" + +FINETUNE_DIR=${OUTPUT}/checkpoints +LOGS_DIR="${OUTPUT}/logs" +TENSORBOARD_DIR="${OUTPUT}/tensorboard" + +if [[ -z $LOAD_NAME ]]; then + echo "Please set LOAD_NAME for input model name." + exit 1 +fi + +if [[ -z $TOKENIZER_MODEL ]]; then + echo "Please set TOKENIZER_MODEL for tokenizer model name." + exit 1 +fi + +CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" + +DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" +DATA_VALID="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" + +DEBUG=0 +if [[ $DEBUG -eq 1 ]]; then + BZ=32 + NW=2 + HD=0.0 + LI=1 + EXTRA_ARGS="" + NONDETERMINISTIC_ATTN=1 +else + BZ=256 + NW=2 + HD=0.1 + LI=10 + EXTRA_ARGS="" + NONDETERMINISTIC_ATTN=1 +fi + +OPTIONS=" \ + --img-embedding-idx 1 \ + --apply-layernorm-1p \ + --attention-softmax-in-fp32 \ + --use-checkpoint-args \ + --use-distributed-optimizer \ + --transformer-impl transformer_engine \ + --use-te \ + --normalization RMSNorm \ + --group-query-attention \ + --num-query-groups 8 \ + --no-masked-softmax-fusion \ + --num-workers ${NW} \ + --exit-duration-in-mins 230 \ + --use-flash-attn \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 1000000 \ + --swiglu \ + --attention-dropout 0.0 \ + --hidden-dropout ${HD} \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --seq-length 2048 \ + --max-position-embeddings 4096 \ + --ffn-hidden-size 14336 \ + --train-iters 20000 \ + --micro-batch-size 1 \ + --global-batch-size ${BZ} \ + --lr-decay-iters 20000 \ + --lr-warmup-fraction .01 \ + --lr 0.00015 \ + --min-lr 1.0e-5 \ + --lr-decay-style cosine \ + --log-interval ${LI} \ + --eval-iters 10 \ + --eval-interval 1000 \ + --tokenizer-type MistralTokenizer \ + --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ + --data-path ${DATA_TRAIN} \ + --valid-path ${DATA_VALID} \ + --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ + --save-interval 1000 \ + --save ${FINETUNE_DIR} \ + --load ${FINETUNE_DIR} \ + --pretrained-checkpoint ${CHECKPOINT_DIR} \ + --split 100,0,0 \ + --clip-grad 1.0 \ + --weight-decay 1e-2 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --init-method-std 0.014 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --bf16 \ + --eod-mask-loss \ + --freeze-LM \ + --freeze-ViT \ + --patch-dim 14 \ + --img-h 336 \ + --img-w 336 \ + --dataloader-type external \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --language-model-type=mistral_7b \ + --disable-vision-class-token \ + ${EXTRA_ARGS} \ + --distributed-timeout-minutes 60 \ + --allow-missing-vision-projection-checkpoint \ +" + +export NVTE_APPLY_QK_LAYER_SCALING=0 +export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN} + +torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS} \ No newline at end of file diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py new file mode 100644 index 0000000..24a2e19 --- /dev/null +++ b/examples/multimodal/run_text_generation.py @@ -0,0 +1,378 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Generate text using a vision language model.""" +import glob +import json +import logging +import os +import sys +from collections import defaultdict +from functools import partial + +# Add megatron to the path. +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) +) + +import numpy as np +import torch +from PIL import Image +from torchvision.transforms import Compose, Resize, ToPILImage +from train import add_multimodal_extra_args, get_image_token_count, model_provider + +from megatron.inference.text_generation.api import generate_and_post_process +from megatron.inference.text_generation.forward_step import ForwardStep +from megatron.training import get_args, get_model, print_rank_0 +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron + +def add_text_generation_args(parser): + """Text generation arguments.""" + group = parser.add_argument_group(title='Vision language model text generation') + + group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') + group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') + group.add_argument("--top_k", type=int, default=0, help='Top k sampling.') + group.add_argument( + "--out-seq-length", type=int, default=1024, help='Length of the output generated text.' + ) + group.add_argument("--output-path", type=str, required=True, help='Output file path') + group.add_argument('--input-image-path', type=str, required=True, help="Input image directory") + group.add_argument('--input-metadata-path', type=str, help="Input metadata path") + group.add_argument( + '--num-partitions', type=int, default=0, help="Number of partitions for inputs." + ) + group.add_argument('--partition-id', type=int, default=0, help="Partition index") + group.add_argument("--drop-vision-class-token", action="store_true", default=False) + group.add_argument("--gt-path", type=str, help="Optional ground truth file") + group.add_argument("--task", type=str, help="Generation task to run") + + # Add common multimodal arguments needed for e.g. building the model. + parser = add_multimodal_extra_args(parser) + + return parser + + +def preprocess_image(target_h, target_w, img): + """Example image preprocessing. Resizes input image to target size. + + Args: + target_h (int): Target height in pixels. + target_w (int): Target width in pixels + img (np.array [h, w, c]): Input image in a numpy array. + + Returns: + output_img (torch.Tensor [c, h, w]): Input image resized to target size. + """ + # Imagenet's mean and std for normalization. + pixel_mean = [123.675, 116.28, 103.53] + pixel_std = [58.395, 57.12, 57.375] + pixel_mean = torch.Tensor(pixel_mean).view(-1, 1, 1) + pixel_std = torch.Tensor(pixel_std).view(-1, 1, 1) + + # Resize image considering ratio between input and target image sizes. + img_h, img_w = img.shape[0], img.shape[1] + ratio = float(max(target_h, target_w)) / max(img_h, img_w) + + scaled_h, scaled_w = int(img_h * ratio + 0.5), int(img_w * ratio + 0.5) + + image_transform = Compose( + [ToPILImage(), Resize((scaled_h, scaled_w)), lambda x: x.convert("RGB")] + ) + img = image_transform(img) + + # Normalize pixel values. + img = (torch.Tensor(np.array(img)).permute(2, 0, 1) - pixel_mean) / pixel_std + + # Pad to target size. + delta_h, delta_w = target_h - scaled_h, target_w - scaled_w + output_img = torch.nn.functional.pad(img, (0, delta_w, 0, delta_h)) + + return output_img + + +def _get_partition_bounds(total_num_samples, num_partitions, partition_id): + samples_per_partition = total_num_samples // num_partitions + return samples_per_partition * partition_id, samples_per_partition * (partition_id + 1) + + +def generate_samples(model): + """Text generation using a trained vision language model.""" + args = get_args() + + images = [] + questions, answers = [], [] + samples, sample_ids = [], [] + + if args.task in ("TextVQA", "VQAv2"): + input_metadata_path = args.input_metadata_path + + if input_metadata_path.endswith(".json"): + samples = json.load(open(input_metadata_path)) + elif input_metadata_path.endswith(".jsonl"): + with open(input_metadata_path, 'r') as jsonl_file: + json_list = list(jsonl_file) + samples = [json.loads(json_str) for json_str in json_list] + else: + return NotImplementedError + + # Optionally, process only a subset of the input files. + if args.num_partitions > 0: + lb, ub = _get_partition_bounds(len(samples), args.num_partitions, args.partition_id) + samples = samples[lb:ub] + + num_samples = len(samples) + + for i in range(len(samples)): + sample = samples[i] + + img_file = "{}/{}".format(args.input_image_path, sample["image"]) + + img_sample = np.array(Image.open(img_file)) + processed_img = preprocess_image(args.img_h, args.img_w, img_sample) + images.append(processed_img.reshape(-1, 3, args.img_h, args.img_w)) + + if args.task == "VQAv2": + questions.append(sample["question"]) + answers.append(sample["answer"]) + elif args.task == 'TextVQA': + questions.append(sample["text"]) + + sample_ids.append(sample["question_id"]) + + if len(images) == num_samples: + break + elif args.task == "captioning": + image_files = sorted(glob.glob(args.input_image_path + "/*")) + # Optionally, process only a subset of the input files. + if args.num_partitions > 0: + lb, ub = _get_partition_bounds(len(image_files), args.num_partitions, args.partition_id) + image_files = image_files[lb:ub] + + num_samples = len(image_files) + images = [] + + # Run image preprocessing. + for image_file in image_files: + img = np.array(Image.open(image_file)) + img = preprocess_image(args.img_h, args.img_w, img) + + images.append(img.reshape(-1, 3, args.img_h, args.img_w)) + + image_id = int(image_file.split("_")[-1].split(".")[0]) + sample_ids.append(image_id) + + # Load optional ground truth. + gt_sample_id_to_captions = defaultdict(list) + if args.gt_path: + gts = json.load(open(args.gt_path)) + for gt in gts["annotations"]: + gt_sample_id_to_captions[gt["image_id"]].append(gt['caption']) + elif args.task == 'MMMU': + # The following downloads the MMMU dataset from HuggingFace and uses the API from the MMMU github repo to run MMMU evaluation. + import datasets + + from evaluation.MMMU.eval.utils.data_utils import ( + CAT_SHORT2LONG, + construct_prompt, + load_yaml, + process_single_sample, + ) + + all_mmmu_datasets = [] + + hf_datasets_cache = os.environ["HF_DATASETS_CACHE"] + assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE." + + for subject in CAT_SHORT2LONG.values(): + subject_dataset = datasets.load_dataset( + "MMMU/MMMU", subject, split=datasets.Split.VALIDATION, cache_dir=hf_datasets_cache + ) + all_mmmu_datasets.append(subject_dataset) + + dataset = datasets.concatenate_datasets(all_mmmu_datasets) + + # Optionally, process only a subset of the input files. + start_idx = 0 + end_idx = len(dataset) + if args.num_partitions > 0: + start_idx, end_idx = _get_partition_bounds( + len(dataset), args.num_partitions, args.partition_id + ) + + # Using the LLaVA config from the MMMU repo. + config = load_yaml("evaluation/MMMU/eval/configs/llava1.5.yaml") + for k, v in config.items(): + if isinstance(v, list): + assert len(v) == 1, "only one value supported." + config[k] = v[0] + + for idx in range(start_idx, end_idx): + sample = dataset[idx] + sample = process_single_sample(sample) + sample = construct_prompt(sample, config) + + # Skip samples with no images or multiple images. Not supported yet. + if "image" not in sample or "" in sample['final_input_prompt']: + continue + + img = np.array(sample['image'].convert("RGB")) + img = preprocess_image(args.img_h, args.img_w, img) + images.append(img.reshape(-1, 3, args.img_h, args.img_w)) + + sample_ids.append(sample['id']) + + # TODO: Support different image positions. + prompt = sample['final_input_prompt'] + prompt = prompt.replace("", "") + questions.append(prompt.strip()) + + answers.append(sample['answer']) + + samples.append(sample) + + num_samples = len(samples) + else: + raise NotImplementedError("unsupported task") + + idx = 0 + while idx < num_samples: + image = images[idx].cuda() + sample_id = sample_ids[idx] + + if args.task == "captioning": + prompt = "Give a short and clear explanation of the subsequent image.\n" + elif args.task == "TextVQA": + prompt = questions[idx] + elif args.task == "VQAv2": + prompt = questions[idx] + prompt = "Given the image, answer the following question with a single word or phrase. " + prompt + elif args.task == "MMMU": + prompt = questions[idx] + + prompt = prompt.replace("", "") + prompt = prompt + "\n" + + forward_step = partial(VLMForwardStep, image, get_image_token_count()) + + if torch.distributed.get_rank() == 0: + resp_sentences, _, _, _ = generate_and_post_process( + model, + forward_step=forward_step, + prompts=[prompt], + tokens_to_generate=args.out_seq_length, + return_output_log_probs=False, + top_k_sampling=args.top_k, + top_p_sampling=args.top_p, + add_BOS=False, + temperature=args.temperature, + random_seed=123, + ) + + for prompt, generation in zip([prompt], resp_sentences): + output = { + "sample_id": sample_id, + "prompt": prompt, + } + + output_name = "" + if args.task == "captioning": + output_name = "caption" + elif args.task == "VQAv2": + output_name = "answer" + elif args.task in ("TextVQA", "MMMU"): + output_name = "text" + + generated = generation[len(prompt) + 1 :] + output[output_name] = generated + + if args.task == "captioning": + output["ground_truth"] = gt_sample_id_to_captions[sample_id] + elif args.task == "VQAv2": + output["ground_truth"] = answers[idx] + elif args.task == "MMMU": + sample = samples[idx] + + prediction = generated + if sample["question_type"] == "multiple-choice": + from evaluation.MMMU.eval.utils.eval_utils import ( + parse_multi_choice_response, + ) + + prediction = parse_multi_choice_response( + generated, sample["all_choices"], sample["index2ans"] + ) + + output["prediction"] = prediction + + print_rank_0(output) + + yield output + idx += 1 + else: + generate_and_post_process(model, forward_step=forward_step) + + idx += 1 + + +def generate_and_write_samples(model): + args = get_args() + + for output in generate_samples(model): + if torch.distributed.get_rank() == 0: + with open(args.output_path, 'a') as f: + f.write(json.dumps(output) + "\n") + + +class VLMForwardStep(ForwardStep): + def __init__(self, images, num_image_tokens, model, max_batch_size, max_sequence_length): + super().__init__(model, max_batch_size, max_sequence_length + num_image_tokens) + self._images = images + + def _forward(self, tokens, position_ids, attention_mask): + return self.model( + self._images, + tokens, + position_ids, + attention_mask=None, + inference_params=self.inference_params, + ) + + def __call__(self, tokens, position_ids, attention_mask): + logits = super().__call__(tokens, position_ids, attention_mask) + + # On the first inference iteration, we compute image tokens. + # Update the sequence length offset by the number of image tokens. + num_tokens = tokens.size(1) + if num_tokens > 1: + self.inference_params.sequence_len_offset += self.inference_params.key_value_memory_dict[ + "image_tokens_count" + ] + + return logits + + +def main(): + """Vision language model text generation.""" + + logging.getLogger(__name__).warning("Models using pipeline parallelism are not supported yet.") + + initialize_megatron(extra_args_provider=add_text_generation_args) + + def wrapped_model_provider(pre_process, post_process): + return model_provider(pre_process, post_process, parallel_output=False) + + # Set up model and load checkpoint. + model = get_model(wrapped_model_provider, wrap_with_ddp=False) + + args = get_args() + if args.load is not None: + _ = load_checkpoint(model, None, None) + + model = model[0] + model.eval() + + generate_and_write_samples(model) + + +if __name__ == "__main__": + main() diff --git a/examples/multimodal/sft_dataset.yaml b/examples/multimodal/sft_dataset.yaml new file mode 100644 index 0000000..c9f0257 --- /dev/null +++ b/examples/multimodal/sft_dataset.yaml @@ -0,0 +1,15 @@ +__module__: megatron.energon +__class__: Metadataset +splits: + train: + datasets: + - weight: 1. + path: + subflavors: + augmentation: false + val: + datasets: + - weight: 1. + path: + subflavors: + augmentation: false diff --git a/examples/multimodal/sft_mistral_clip.sh b/examples/multimodal/sft_mistral_clip.sh new file mode 100644 index 0000000..df21877 --- /dev/null +++ b/examples/multimodal/sft_mistral_clip.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# Run SFT on a pretrained multimodal model + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +MODEL_NAME="mcore-llava-mistral-7b-instruct-clip336-sft" + +# Check that the user has set an output path for model checkpoints. +if [[ -z $WORKSPACE ]]; then + echo "Please set WORKSPACE for storing your model checkpoints." + exit 1 +fi + +SOURCE=`pwd` +OUTPUT_BASE="${WORKSPACE}/output" +OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}" + +FINETUNE_DIR=${OUTPUT}/checkpoints +LOGS_DIR="${OUTPUT}/logs" +TENSORBOARD_DIR="${OUTPUT}/tensorboard" + +if [[ -z $LOAD_NAME ]]; then + echo "Please set LOAD_NAME for input model name." + exit 1 +fi + +if [[ -z $LOAD_ITER ]]; then + echo "Please set LOAD_ITER for pre-trained input model iteration." + exit 1 +fi + +if [[ -z $TOKENIZER_MODEL ]]; then + echo "Please set TOKENIZER_MODEL for tokenizer model name." + exit 1 +fi + +CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" + +DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml" +DATA_VALID="${SOURCE}/examples/multimodal/sft_dataset.yaml" + +DEBUG=0 +if [[ $DEBUG -eq 1 ]]; then + BZ=8 + NW=1 + HD=0.0 + LI=1 + EXTRA_ARGS="" + NONDETERMINISTIC_ATTN=1 +else + BZ=128 + NW=2 + HD=0.1 + LI=10 + EXTRA_ARGS="" + NONDETERMINISTIC_ATTN=1 +fi + +OPTIONS=" \ + --img-embedding-idx 1 \ + --apply-layernorm-1p \ + --attention-softmax-in-fp32 \ + --use-checkpoint-args \ + --use-distributed-optimizer \ + --transformer-impl transformer_engine \ + --use-te \ + --normalization RMSNorm \ + --group-query-attention \ + --num-query-groups 8 \ + --no-masked-softmax-fusion \ + --num-workers ${NW} \ + --exit-duration-in-mins 230 \ + --use-flash-attn \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 1000000 \ + --swiglu \ + --attention-dropout 0.0 \ + --hidden-dropout ${HD} \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --seq-length 2048 \ + --max-position-embeddings 4096 \ + --ffn-hidden-size 14336 \ + --train-iters 20000 \ + --micro-batch-size 1 \ + --global-batch-size ${BZ} \ + --lr-decay-iters 20000 \ + --lr-warmup-fraction .01 \ + --lr 1e-6 \ + --min-lr 1e-7 \ + --lr-decay-style cosine \ + --log-interval ${LI} \ + --eval-iters 10 \ + --eval-interval 500 \ + --tokenizer-type MistralTokenizer \ + --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ + --data-path ${DATA_TRAIN} \ + --valid-path ${DATA_VALID} \ + --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ + --save-interval 500 \ + --save ${FINETUNE_DIR} \ + --load ${FINETUNE_DIR} \ + --pretrained-checkpoint ${CHECKPOINT_DIR} \ + --split 100,0,0 \ + --clip-grad 0.5 \ + --weight-decay 0.1 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --init-method-std 0.014 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --eod-mask-loss \ + --freeze-ViT \ + --patch-dim 14 \ + --img-h 336 \ + --img-w 336 \ + --dataloader-type external \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --language-model-type=mistral_7b \ + --disable-vision-class-token \ + ${EXTRA_ARGS} \ + --distributed-timeout-minutes 60 \ +" + +export NVTE_APPLY_QK_LAYER_SCALING=0 +export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN} + +torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS} diff --git a/examples/multimodal/text_generation_mistral_clip.sh b/examples/multimodal/text_generation_mistral_clip.sh new file mode 100644 index 0000000..72022b1 --- /dev/null +++ b/examples/multimodal/text_generation_mistral_clip.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_APPLY_QK_LAYER_SCALING=0 + +INPUT_METADATA_PATH="placeholder" +GROUNDTRUTH_PATH="placeholder" + +while [[ $# -gt 0 ]]; do + case $1 in + --input-image-path) + INPUT_IMAGE_PATH="$2" + shift + shift + ;; + --input-metadata-path) + INPUT_METADATA_PATH="$2" + shift + shift + ;; + -g|--groundtruth-path) + GROUNDTRUTH_PATH="$2" + shift + shift + ;; + -o|--output-path) + OUTPUT_PATH="$2" + shift + shift + ;; + -m|--model-path) + MODEL_PATH="$2" + shift + shift + ;; + -t|--tokenizer-path) + TOKENIZER_PATH="$2" + shift + shift + ;; + --task) + TASK="$2" + shift + shift + ;; + -g|--gt-path) + GROUNDTRUTH_PATH="$2" + shift + shift + ;; + -*|--*) + echo "Invalid option $1" + exit 1 + ;; + esac +done + +# Please modify these as needed. +NUM_PARTITIONS=100 +START=2 +END=0 + +for PARTITION_ID in $( eval echo {$START..$END} ) +do + torchrun --nproc_per_node 4 examples/multimodal/run_text_generation.py \ + --img-embedding-idx 1 \ + --apply-layernorm-1p \ + --attention-softmax-in-fp32 \ + --use-flash-attn \ + --transformer-impl transformer_engine \ + --use-te \ + --use-checkpoint-args \ + --normalization RMSNorm \ + --language-model-type mistral_7b \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 1000000 \ + --swiglu \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 1 \ + --group-query-attention \ + --num-query-groups 8 \ + --num-layers 32 \ + --hidden-size 4096 \ + --ffn-hidden-size 14336 \ + --num-attention-heads 32 \ + --max-position-embeddings 4096 \ + --no-masked-softmax-fusion \ + --load ${MODEL_PATH} \ + --tokenizer-type MistralTokenizer \ + --tokenizer-model ${TOKENIZER_PATH} \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length 2048 \ + --out-seq-length 700 \ + --temperature 1.0 \ + --img-h 336 \ + --img-w 336 \ + --patch-dim 14 \ + --seed 153 \ + --top_k 1 \ + --no-load-rng \ + --no-load-optim \ + --input-image-path ${INPUT_IMAGE_PATH} \ + --input-metadata-path ${INPUT_METADATA_PATH} \ + --num-partitions ${NUM_PARTITIONS} \ + --partition-id ${PARTITION_ID} \ + --output-path ${OUTPUT_PATH}-${TASK}-${PARTITION_ID}.jsonl \ + --gt-path ${GROUNDTRUTH_PATH} \ + --task ${TASK} \ + --disable-vision-class-token +done diff --git a/examples/multimodal/train.py b/examples/multimodal/train.py new file mode 100644 index 0000000..c9be30d --- /dev/null +++ b/examples/multimodal/train.py @@ -0,0 +1,314 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Pretrain or SFT multimodal.""" +from copy import deepcopy +from functools import partial +import os +import sys + +import torch + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir, os.path.pardir))) + +from megatron.training import get_args, get_timers, get_tokenizer, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from config import get_language_model_config, get_vision_model_config, get_vision_projection_config +from megatron.core.models.multimodal.llava_model import LLaVAModel +from layer_specs import get_layer_spec, get_mlp_module_spec, get_layer_spec_te +from megatron.training import pretrain +from megatron.training.utils import average_losses_across_data_parallel_group +from dataloader_provider import train_valid_test_dataloaders_provider + + +def model_provider(pre_process=True, post_process=True, parallel_output=True) -> LLaVAModel: + """Builds the model. + + Args: + pre_process (bool): Enable preprocessing in the model. NOTE: Not used at the moment. + post_process (bool): Enable postprocessing in the model. NOTE: Not used at the moment. + parallel_output (bool): Enable parallel model output. + + Returns: + model: A multimodal model. + """ + args = get_args() + + use_te = args.use_te + + print_rank_0('building a multimodal model ...') + + base_config = core_transformer_config_from_args(get_args()) + base_config.language_model_type = args.language_model_type + + language_config = deepcopy(base_config) + language_config = get_language_model_config(language_config) + + if use_te: + language_transformer_layer_spec = get_layer_spec_te(is_vit=False) + else: + language_transformer_layer_spec = get_layer_spec(is_vit=False) + + vision_config = deepcopy(base_config) + vision_config = get_vision_model_config(vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling) + + if use_te: + vision_transformer_layer_spec = get_layer_spec_te(is_vit=True) + else: + vision_transformer_layer_spec = get_layer_spec(is_vit=True) + + vision_projection_config = deepcopy(base_config) + vision_projection_config = get_vision_projection_config(vision_projection_config, language_config.hidden_size) + vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules + + model = LLaVAModel( + language_transformer_config=language_config, + language_transformer_layer_spec=language_transformer_layer_spec, + language_vocab_size=args.padded_vocab_size, + language_max_sequence_length=args.max_position_embeddings, + vision_transformer_config=vision_config, + vision_transformer_layer_spec=vision_transformer_layer_spec, + drop_vision_class_token=args.disable_vision_class_token, + vision_projection_config=vision_projection_config, + vision_projection_layer_spec=vision_projection_layer_spec, + vision_projection_type="mlp", + allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint, + parallel_output=parallel_output, + language_position_embedding_type=args.position_embedding_type, + language_rotary_percent=args.rotary_percent, + language_rotary_base=args.rotary_base, + img_embedding_idx=args.img_embedding_idx, + ) + + model.freeze(freeze_language_model=args.freeze_LM, freeze_vision_model=args.freeze_ViT, freeze_vision_projection=False) + + return model + + +def get_batch(data_iterator): + """Generate a batch""" + + args = get_args() + + tokens = None + labels = None + loss_mask = None + attention_mask = None + position_ids = None + + # Broadcast data. + torch.cuda.nvtx.range_push("get_data") + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + + data_text = tensor_parallel.broadcast_data(["text"], data, torch.int64)["text"] + data_img = tensor_parallel.broadcast_data(["img"], data, torch.float32) + prompt_len = tensor_parallel.broadcast_data(["prompt_len"], data, torch.int64)["prompt_len"] + + torch.cuda.nvtx.range_pop() + + tokens_ = data_text.long() + + img_raw = data_img['img'].reshape(-1, 3, args.img_h, args.img_w) + + torch.cuda.nvtx.range_push("index tokens") + tokenizer = get_tokenizer() + tokens = tokens_[:, :args.seq_length].contiguous() + labels = tokens_[:, 1:args.seq_length+1].contiguous() + torch.cuda.nvtx.range_pop() + + torch.cuda.nvtx.range_push("get_ltor_masks_and_position_ids") + if hasattr(tokenizer, 'eod'): + eod_token = tokenizer.eod + elif hasattr(tokenizer, 'eos_id'): + eod_token = tokenizer.eos_id + attention_mask, loss_mask, position_ids = \ + get_ltor_masks_and_position_ids(tokens, eod_token, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + question_length=prompt_len) + torch.cuda.nvtx.range_pop() + + loss_mask, labels, attention_mask = _preprocess_data_for_llava(loss_mask, labels, attention_mask) + + tokens = tokens[:, 1:] # drop image index token + + return tokens, labels, loss_mask, attention_mask, position_ids, img_raw + + +def get_image_token_count(): + args = get_args() + + add_class_token = not args.disable_vision_class_token + + num_patches_per_dim_h = args.img_h // args.patch_dim + num_patches_per_dim_w = args.img_w // args.patch_dim + num_patches = num_patches_per_dim_h * num_patches_per_dim_w + num_image_tokens = num_patches + (1 if add_class_token else 0) + + return num_image_tokens + + +def _preprocess_data_for_llava(loss_mask, labels, attention_mask): + """Preprocess data sample to the format expected by a LLaVA model.""" + num_image_tokens = get_image_token_count() + + batch_size = loss_mask.shape[0] + + loss_mask2 = torch.cat( + [torch.zeros(batch_size, num_image_tokens - 1, dtype=torch.float32, device=loss_mask.device), loss_mask], dim=1 + ) + labels2 = torch.cat([torch.zeros(batch_size, num_image_tokens - 1, dtype=torch.int64, device=labels.device), labels], dim=1) + + full_seq_length = len(labels2[0]) + attention_mask2 = torch.tril(torch.ones((1, 1, full_seq_length, full_seq_length), device=attention_mask.device)) + attention_mask2 = attention_mask2 < 0.5 + + return loss_mask2, labels2, attention_mask2 + + +def get_ltor_masks_and_position_ids(data, + eod_token, + reset_position_ids, + reset_attention_mask, + eod_mask_loss, + question_length=None, + weights=None): + """Build masks and position id for left to right model.""" + + # Extract batch size and sequence length. + micro_batch_size, seq_length = data.size() + + # Attention mask (lower triangular). + if reset_attention_mask: + att_mask_batch = micro_batch_size + else: + att_mask_batch = 1 + attention_mask = torch.tril(torch.ones( + (att_mask_batch, seq_length, seq_length), device=data.device)).view( + att_mask_batch, 1, seq_length, seq_length) + + # Loss mask. + loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) + if eod_mask_loss: + loss_mask[data == eod_token] = 0.0 + + # Position ids. + position_ids = torch.arange(seq_length, dtype=torch.long, + device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + # We need to clone as the ids will be modifed based on batch index. + if reset_position_ids: + position_ids = position_ids.clone() + + + if question_length is not None: + for b in range(micro_batch_size): + loss_mask[b, :max(0, question_length[b].item())] = 0.0 + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(micro_batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, data[b] == eod_token] + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1):] -= (i + 1 - prev_index) + prev_index = i + 1 + + # Convert attention mask to binary: + attention_mask = (attention_mask < 0.5) + if weights is not None: + loss_mask = loss_mask * weights + + return attention_mask, loss_mask, position_ids + + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + if loss_mask is not None: + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / max( 1,loss_mask.sum() ) + else: + loss = torch.mean(losses) + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + + +def forward_step(data_iterator, model: LLaVAModel): + """Forward training step. + + Args: + data_iterator (torch.utils.data.dataloader): Input data iterator + model: Multimodal model + + Returns: + output_tensor (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape [b, s, vocab_size]. + loss_func (callable): Loss function with a loss mask specified. + """ + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator', log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids, images = get_batch(data_iterator) + timers('batch-generator').stop() + + output_tensor = model(images, tokens, position_ids, attention_mask, labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + +def add_multimodal_extra_args(parser): + """Extra arguments.""" + group = parser.add_argument_group(title='multimodal arguments') + group.add_argument('--valid-path', nargs='*', default=None, + help='Path to the training dataset. Accepted format:' + '1) a single data path, 2) multiple datasets in the' + 'form: dataset1-weight dataset1-path dataset2-weight ' + 'dataset2-path ...') + group.add_argument('--dataset-config', type=str, default=None) + group.add_argument("--prompt-path", type=str, default=None) + group.add_argument('--freeze-LM', action='store_true', default=False) + group.add_argument('--freeze-ViT', action='store_true', default=False) + group.add_argument('--language-model-type', type=str, required=True) + group.add_argument("--disable-vision-class-token", action="store_true", default=False) + group.add_argument("--allow-missing-vision-projection-checkpoint", action="store_true", default=False) + group.add_argument("--use-te", action="store_true", default=False) + group.add_argument("--img-embedding-idx", type=int, default=0, + help='Llava specific parameter. Defines at which index' + 'in the language_embedding tensor the image_embeddings' + 'should be inserted') + return parser + + +if __name__ == "__main__": + train_valid_test_dataloaders_provider.is_distributed = True + + pretrain( + train_valid_test_dataloaders_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + extra_args_provider=add_multimodal_extra_args, + ) diff --git a/examples/retro/README.md b/examples/retro/README.md new file mode 100644 index 0000000..f78bcde --- /dev/null +++ b/examples/retro/README.md @@ -0,0 +1,74 @@ +# RETRO MODEL + +## Table of contents +- [1. Training Setup](#1-training-setup) +- [2. Data Preprocessing](#2-data-preprocessing) +- [3. Configurations](#3-configurations) + +## 1. Training setup + + +To run the model using a docker container run it as follows +``` +PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.09-py3 +CHECKPOINT_PATH="" # +TENSORBOARD_LOGS_PATH=""# + +docker run \ + --gpus=all \ + --ipc=host \ + --workdir /workspace/megatron-lm \ + -v /path/to/data:/path/to/data \ + -v /path/to/megatron-lm:/workspace/megatron-lm \ + megatron-lm nvcr.io/nvidia/pytorch:23.09-py3 \ + bash examples/retro/train_retro_2b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH" + +``` +NOTE: Depending on the environment you are running it the above command might look slightly different. + +NOTE: Due to how Retro preprocess and caches elements of the pretraining dataset before training begins, some arguments are auto-loaded from the Retro preprocessing configuration. These loaded arguments include: + +- `--data-path` +- `--data-cache-path` +- `--eval-interval` +- `--eval-iters` +- `--global-batch-size` +- `--tokenizer-type` +- `--tokenizer-model` +- `--vocab-file` +- `--merge-file` +- `--seed` +- `--seq-length` +- `--train-samples` + + +## 2. Data Preprocessing + + +Retro preprocesses and caches data prior to pretraining, to greatly speed up pretraining. During data preprocessing, the retrieval database is built, and neighbor IDs are queried for each sample within the pretraining dataset. Please see `preprocess_data.sh` for an example script to preprocess data for Retro. The reference documentation for data preprocessing can be found [here](tools/retro/README.md). + + +## 3. Configurations + +The example in this folder shows you how to run a 2B model. Below are a few other example configurations. + +### 857M +``` + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --seq-length 2048 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + +``` + +### 4B +``` + --num-layers 48 \ + --hidden-size 2560 \ + --num-attention-heads 32 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + +``` diff --git a/examples/retro/preprocess_data.sh b/examples/retro/preprocess_data.sh new file mode 100644 index 0000000..5d2e66b --- /dev/null +++ b/examples/retro/preprocess_data.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +set -u + +unset NCCL_DEBUG + +######## Megatron, Retro dirs. ######## + +REPO_DIR="" +RETRO_PROJECT_DIR="" + +######## Task (e.g., db, index, query). ######## + +# This script takes a single argument, which specifies the retro task to be +# performed. The available tasks are: db-build, index-train, index-add, and +# query-neighbors. + +# ~~ Examples ~~ +# RETRO_TASKS="db-build" # Build the retrieval database +# RETRO_TASKS="index-train" # Train the index +# RETRO_TASKS="index-add" # Add data to the index +# RETRO_TASKS="query-neighbors" # Perform query pretraining for neighbors + +# You can also provide the task as a command-line argument when executing the +# script. Example: ./preprocess_data.sh index-add +RETRO_TASKS=$1 + +######## Data. ######## +DATA_BLEND="" + +######## Index. ######## + +RETRO_INDEX_STR="OPQ32_64,IVF65536_HNSW8,PQ32" +RETRO_INDEX_NTRAIN=66625331 +RETRO_INDEX_TRAIN_LOAD_FRACTION=0.97 +RETRO_INDEX_ADD_LOAD_FRACTION=0.95 + +######## GPT. ######## + +RETRO_GPT_SEED=1234 +RETRO_GPT_SPLIT="98,2,0" +RETRO_GPT_DATA_PATH=${DATA_BLEND} +RETRO_GPT_TRAIN_SAMPLES=200000 +RETRO_GPT_EVAL_INTERVAL=2000 +RETRO_GPT_EVAL_ITERS=50 +RETRO_GPT_LR_DECAY_SAMPLES=175000 +RETRO_GPT_LR_WARMUP_SAMPLES=10000 +RETRO_GPT_SEQ_LENGTH=2048 +RETRO_GPT_GLOBAL_BATCH_SIZE=256 +RETRO_GPT_CHUNK_LENGTH=64 + +######## Query. ######## + +RETRO_QUERY_NUM_NEIGHBORS_QUERY=200 +RETRO_QUERY_NUM_NEIGHBORS_SAVE=20 +RETRO_QUERY_EF_SEARCH=32 +RETRO_QUERY_NPROBE=4096 + +######## Args. ######## + +ARGS=" \ + --distributed-timeout-minutes 600 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --micro-batch-size 1 \ + --global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \ + --seq-length 512 \ + --max-position-embeddings 512 \ + --load ${RETRO_PROJECT_DIR}/checkpoints/bert \ + --exit-on-missing-checkpoint \ + --no-load-optim \ + --data-path [null] \ + --tokenizer-type BertWordPieceLowerCase \ + --vocab-file ${RETRO_PROJECT_DIR}/tokenizer/bert-large-uncased-vocab.txt \ + --split ${RETRO_GPT_SPLIT} \ + --distributed-backend nccl \ + --lr 0.0001 \ + --lr-decay-style linear \ + --min-lr 1.0e-5 \ + --train-samples ${RETRO_GPT_TRAIN_SAMPLES} \ + --lr-decay-samples ${RETRO_GPT_LR_DECAY_SAMPLES} \ + --lr-warmup-samples ${RETRO_GPT_LR_WARMUP_SAMPLES} \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --eval-interval ${RETRO_GPT_EVAL_INTERVAL} \ + --eval-iters ${RETRO_GPT_EVAL_ITERS} \ + --bf16 \ + --no-data-sharding \ + --no-gradient-accumulation-fusion \ + --no-async-tensor-model-parallel-allreduce \ + --bert-embedder-type megatron \ + --output-bert-embeddings \ + \ + --retro-project-dir ${RETRO_PROJECT_DIR} \ + --retro-tasks ${RETRO_TASKS} \ + --retro-bert-vocab-file tokenizer/bert-large-uncased-vocab.txt \ + --retro-bert-tokenizer-type BertWordPieceLowerCase \ + \ + --retro-gpt-seed ${RETRO_GPT_SEED} \ + --retro-gpt-tokenizer-type GPTSentencePieceTokenizer \ + --retro-gpt-tokenizer-model /path/to/tokenizer/model \ + --retro-gpt-seq-length ${RETRO_GPT_SEQ_LENGTH} \ + --retro-gpt-chunk-length ${RETRO_GPT_CHUNK_LENGTH} \ + --retro-gpt-global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \ + --retro-gpt-eval-interval ${RETRO_GPT_EVAL_INTERVAL} \ + --retro-gpt-eval-iters ${RETRO_GPT_EVAL_ITERS} \ + --retro-gpt-split ${RETRO_GPT_SPLIT} \ + --retro-gpt-data-path ${RETRO_GPT_DATA_PATH} \ + --retro-gpt-train-samples ${RETRO_GPT_TRAIN_SAMPLES} \ + \ + --retro-index-str ${RETRO_INDEX_STR} \ + --retro-index-ntrain ${RETRO_INDEX_NTRAIN} \ + --retro-index-train-load-fraction ${RETRO_INDEX_TRAIN_LOAD_FRACTION} \ + --retro-index-add-load-fraction ${RETRO_INDEX_ADD_LOAD_FRACTION} \ + --no-retro-index-delete-training-embeddings \ + --no-retro-index-delete-added-codes \ + \ + --retro-query-num-neighbors-query ${RETRO_QUERY_NUM_NEIGHBORS_QUERY} \ + --retro-query-num-neighbors-save ${RETRO_QUERY_NUM_NEIGHBORS_SAVE} \ + --retro-query-ef-search ${RETRO_QUERY_EF_SEARCH} \ + --retro-query-nprobe ${RETRO_QUERY_NPROBE} \ +" + +######## Command. ######## + +NPROCS=8 # Number of GPUs. +CMD="\ + cd ${REPO_DIR} && pwd && \ + export PYTHONPATH=$PYTHONPATH:${REPO_DIR} && \ + python -m torch.distributed.run \ + --nproc_per_node ${NPROCS} \ + --nnodes 1 \ + --node_rank ${NODE_RANK} \ + --master_addr ${MASTER_ADDR} \ + --master_port 6000 \ + tools/retro/preprocess_data.py ${ARGS} \ +" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "CMD = '$CMD'." +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" +eval $CMD diff --git a/examples/retro/train_retro_2b_distributed.sh b/examples/retro/train_retro_2b_distributed.sh new file mode 100644 index 0000000..c8276b5 --- /dev/null +++ b/examples/retro/train_retro_2b_distributed.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +# Runs the "307M" parameter Retro model. + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NUM_NODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) + +CHECKPOINT_PATH=$1 # +TENSORBOARD_LOGS_PATH=$2 # + +DISTRIBUTED_ARGS=( + --nproc_per_node $GPUS_PER_NODE + --nnodes $NUM_NODES + --master_addr $MASTER_ADDR + --master_port $MASTER_PORT +) + +######## GPT or Retro? ######## + +# 0 : GPT. +# 1 : Retro + +ADD_RETRIEVER=1 + +######## Megatron, Retro dirs. ######## + +RETRO_PROJECT_DIR="" + +######## Model, training args. ######## + +# ** Note: --seq-length auto loaded from Retro project dir. +RETRO_MODEL_ARGS=( + --num-layers 32 + --hidden-size 2048 + --num-attention-heads 32 +) + +# ** Note: --data-path, --tokenizer-type, and --tokenizer-model auto loaded from Retro project dir. +DATA_ARGS=( + --split 98,2,0 +) + +MODEL_PARALLEL_ARGS=( + --tensor-model-parallel-size 8 + --pipeline-model-parallel-size 1 +) + +# ** Note: --eval-interval, --eval-iters auto loaded from Retro project dir. +EVAL_AND_LOGGING_ARGS=( + --log-interval 100 + --save-interval 10000 + --eval-interval 1000 + --save $CHECKPOINT_PATH + --load $CHECKPOINT_PATH + --eval-iters 10 + --tensorboard-dir $TENSORBOARD_LOGS_PATH +) + +TRAINING_ARGS=" \ + --retro-project-dir ${RETRO_PROJECT_DIR} \ + --transformer-impl transformer_engine \ + --num-workers 8 \ + --micro-batch-size 4 \ + --lr-decay-samples 166400000 \ + --lr-warmup-samples 162761 \ + --lr 6.0e-4 \ + --min-lr 6.0e-5 \ + --lr-decay-style cosine \ + --clip-grad 1.0 \ + --weight-decay 0.1 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --init-method-std 0.023 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --bf16 \ + --no-data-sharding \ +" + +if [ "$ADD_RETRIEVER" = "1" ]; then + TRAINING_ARGS+=" --retro-add-retriever" +fi + +######## Command. ######## + +torchrun ${DISTRIBUTED_ARGS[@]} pretrain_retro.py \ + ${RETRO_MODEL_ARGS[@]} \ + ${TRAINING_ARGS} \ + ${MODEL_PARALLEL_ARGS[@]} \ + ${DATA_ARGS[@]} \ + ${EVAL_AND_LOGGING_ARGS[@]} diff --git a/examples/run_simple_mcore_train_loop.py b/examples/run_simple_mcore_train_loop.py new file mode 100644 index 0000000..d5ffffe --- /dev/null +++ b/examples/run_simple_mcore_train_loop.py @@ -0,0 +1,158 @@ +import os +import torch +from torch.optim import Adam +from torch.utils.data import DataLoader +from functools import partial +from pathlib import Path + +from megatron.core import parallel_state +from megatron.core import dist_checkpointing +from megatron.core.pipeline_parallel.schedules import get_forward_backward_func +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.datasets.utils import compile_helpers +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset +from megatron.training.tokenizer.tokenizer import _NullTokenizer + + +_SEQUENCE_LENGTH = 64 + + +def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1): + parallel_state.destroy_model_parallel() + + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size) + +def model_provider(): + """Build the model.""" + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + use_cpu_initialization=True, + pipeline_dtype=torch.float32, + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=100, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + return gpt_model + +def get_train_data_iterator(): + if torch.distributed.is_available() and torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + compile_helpers() + torch.distributed.barrier() + else: + compile_helpers() + + config = GPTDatasetConfig( + random_seed=0, + sequence_length=_SEQUENCE_LENGTH, + reset_position_ids=False, + reset_attention_mask=False, + eod_mask_loss=False, + tokenizer=_NullTokenizer(vocab_size=_SEQUENCE_LENGTH), + ) + + datasets = BlendedMegatronDatasetBuilder( + MockGPTDataset, [1000, None, None], lambda: True, config + ).build() + + train_dataloader = DataLoader(datasets[0], batch_size=8, shuffle=True) + + train_iterator = iter(train_dataloader) + + return train_iterator + +def forward_step_func(data_iterator, model): + + def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor): + + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + # If you have data parallel reduce loss across data parallel groups. + # If pipeline parallel, loss computation is done only in last stage. + + return loss, {'lm loss': loss} + + data = next(data_iterator) + tokens = data['tokens'].to(device) + attention_mask = data['attention_mask'].to(device) + position_ids = data['position_ids'].to(device) + labels = data['labels'].to(device) + loss_mask = data['loss_mask'].to(device) + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + +def save_distributed_checkpoint(checkpoint_path, gpt_model): + sharded_state_dict = gpt_model.sharded_state_dict(prefix='') + dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + +def load_distributed_checkpoint(checkpoint_path, gpt_model): + sharded_state_dict=gpt_model.sharded_state_dict(prefix='') + checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + gpt_model.load_state_dict(checkpoint) + return gpt_model + +if __name__ == "__main__": + initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) + model_parallel_cuda_manual_seed(123) + + gpt_model = model_provider() + device = torch.device("cuda") + gpt_model.to(device) + + optim = Adam(gpt_model.parameters()) + + train_iterator = get_train_data_iterator() + + forward_backward_func = get_forward_backward_func() + + # Running the model for 5 iterations + for _ in range(5): + optim.zero_grad() + + losses_reduced = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=train_iterator, + model=gpt_model, + num_microbatches=1, + seq_length=_SEQUENCE_LENGTH, + micro_batch_size=8, + decoder_seq_length=_SEQUENCE_LENGTH, + forward_only=False) + + optim.step() + + print(f'Losses reduced : {losses_reduced}') + + # Saving the model + ckpt_path = os.getcwd() + '/ckpt' + Path(ckpt_path).mkdir(exist_ok=True) + save_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path) + + # Loading the model + gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path) + gpt_model.to(device) + print('Successfully loaded the model') + diff --git a/examples/t5/README.md b/examples/t5/README.md new file mode 100644 index 0000000..205da1d --- /dev/null +++ b/examples/t5/README.md @@ -0,0 +1,55 @@ +# T5 MODEL + +## Table of contents +- [1. Training Setup](#1-training-setup) +- [2. Configurations](#2-configurations) +- [3. Training Results](#3-training-results) + +## 1. Training setup + +To run the model on a Slurm based cluster +``` +PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.09-py3 +ACCOUNT_NAME="" +PARTITION="" +JOB_NAME="" +NUM_NODES=1 +CHECKPOINT_PATH="" # +TENSORBOARD_LOGS_PATH=""# +VOCAB_FILE="" #/bert-large-cased-vocab.txt +DATA_PATH="" #_text_document + +srun -N $NUM_NODES --container-image $PYTORCH_IMAGE --container-mounts "/path/to/data:/path/to/data,/path/to/megatron-lm:/workspace/megatron-lm" --account $ACCOUNT -N 1 -J $JOB_NAME -p $PARTITION --no-container-mount-home -c " + cd /workspace/megatron-lm + ./examples/t5/train_t5_220m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH" + +``` + +## 2. Configurations + +The architecture arguments below shows configuration for T5 220M model. + +### 220M +``` + --num-layers 12 \ + --hidden-size 768 \ + --num-attention-heads 12 \ + --kv-channels 64 \ + --ffn-hidden-size 3072 \ + --encoder-seq-length 512 \ + --decoder-seq-length 128 \ + --max-position-embeddings 512 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + +``` + + +## 3. Training Results + +Below is the training curve for the 220M model on Pile dataset. The training takes 4 days on 32 GPUs, with batch size of 2048. + +Finetuning on SQUAD dataset, the validation result is: 63.44\% +

+ +

diff --git a/examples/t5/t5_mcore_train_curve.png b/examples/t5/t5_mcore_train_curve.png new file mode 100644 index 0000000000000000000000000000000000000000..de1aaa8582cb44672c79d41d38b96c4d8d32829a GIT binary patch literal 62988 zcmZsD2{@G9`~NevsG)>e-o-$1k`V4!7NSYH;22*Z5d|hIu*@;lsIAm+nLh?s>G|x~R>k&T@0o^P^fr z)rkr%@v2w2L$qwNG(~kE_^ipwQQN9^UqRd$tIXzv%=9y$w6Q^KmYWheiwmE_~q`-(sBu-9E z6&DrBJE!U&wCIR``*x1sQ<(n!Mvk5%gxN|expMqzpER#?vsL^z*$UZ(;fvXJ1|tG* z&RSSlJaw!%Ha_1_D9bAStLS3R)k}gp>HB0Cnkq}?r-q^qNzmJ0RL>W(x`;YKq^Lf1 zDA`_BU%zExtdMv#LcBS*V{ugC_vgK)^OFNvQmTT>3j&GcZXN9G?yDgN&i8aO%!@r| z+CuVOW`Eo%Z78HnHF_~R?52mC()uFZOa5KHrbODj;_2Jsap7c^#qmJmj+s_R)u&F?XU5+(P_5LwCX@$3EUH+wI(k&%%mcSc*XzKUTf`frbTO(hv!u5p)xf$f4v ze2trn9DH2WL7C3hsco@I%{S?baH-HD+CBQm*oUOJ!{OvKxrEc6oe>?;JPjgPOLMbH zpv?5$_V#v5JG%fbm)66qva_8MKmKD&znA~`@#AJ~>9m@2QbGlksnpUnhQI!u?OZ7r zszPARzVy-CcCl}veWqZxTaMB_J6kFjnY^7#XfavyK=4L^e`@jz3JPqwtRS*R4$@m* zt8INDIXSuN`**cm4eqzgid2d7H4c=yK8v&~-8w}6>e$T@GaW=?UeA8vd+Uj_A00mL zzGFtzQ2iM(=0T4tUgB?FogS%QMa(v1YX|!K?@m-m37#tRg$FS7m@;hccQFv6e&y8OzM*Am^(*?rK37> z&g{YO=@$FIV~#&R<=vQwoUHu7mc(jk5S(I}YWvx6|GHZZe%?A&>doo*RB25nS>$G> z9TCeI^P0&`Gb?*2#1ZEH?%HZncuiXy7pw`Lyy>jeZf%_bLn+tHiy#;NEpgba zw|L%e%5P!nwN}ja{qn7!hK#85XlZqSHKP2^Askvvj=lcE>glx@(aoVurnKkCQFG-M=?VEG*&PMbqqFUcqb{XQ7EuuqV>6VQLOChL zt2ylB>bpEzTG}7q&a3nsbxf<2U8rsMG#Rj|iP{}p2g_$QiR}?nC-*!1z$7VQ?k$)_ z>4TamZug^O!TD(G#<0e)*c}b+?K^$92>JAW4Jqv^_Ra7Qn^I3%)D^cX{u${$`b4u_ z%%jjZb9(@GB_ zD9F>kPbyP`kzi^YtV`RO}TIWTcnKCXij%P4uny<*mKU< zghCX3`$O~}=F1Mgj7hINa!h?*AaKU?1(m*`pY>V2vPd-6rR{oyOu<9P7+7F}ou0pg zqWppdw9Wfw*G1JYERGSmk@SPO!w1Y=*q;#O0xQAOu%SP=_zdW(fTww3OtwPo;#)5h>jQ@*1CWC##jaiPRZl_hUV~ve7G# z{%z;%m@RRwaEuI6Sx*@%q}X^2YDD=|5-Af+GmG`v1)HUYU!TaE$shBFQ1L2fQSuR$ zq{jY`6OgAjh3D&k%$<`1H5pcgqRVGHsEZwG9&^71nc_~n)ToGG9+Ia1w5RYGP9E$%{1z{}Zgo z_65jMX5%+g)t&U;Ia#5z@v)Gexo7{fDb0nh_Q>3BZLX~fmwzegD%W())lm7St#v~G ziu4eU=I08&>6hLW`)*Z4yK5U+MFNwpSw)&tBK)GXyLVMj8%giP^pujv$pfV&OpK#H z`WGhgV{^1^TlHBBtMb6})?eyL^pp7P3c~%>oSrM9$tWYjutPVN$@5rRuDH5yZawW6 z#_6j_XB%hdRF~27iO{u7yV#D-uPw5^&?86t1nJBk?8v3q>X%~CI*UP!BZlV|_W8fy zQ$mE~xY>9-hFRcU+rt()xv(k5SIgk5$N8!LYe~PkX%8#v*@!)P{Y(TCFViCVkZ8Zd zg8sqGK75HW$271V`0vg@p!QTbp;{>V?+omymu9e{uqs$ z*y)2`ydKeiid$Cat?!r3#7_H>)U}$_W8#FJP1M9&0b~7k8w)9XkFnT+CL{Vr#rGH< zQIdiM<1M>|!BD0!0u)W>+iell#H=;+hnm1cpI_j3yDU?8^Pg{g{x9~IIIOQv%XZN| zh9dLY`JGcSW&}RkFN%6lH(c3B{LGA?6ohX0ttI`$A94pfEvs~bmtZHF$#j7c&a~*U zUL&u#mJm0~%NPI%h<#go&&w&*4SuJOHVHLq&o?YUcKP?e8?YN+!|Uj?(JM#qL)mWZ zp>G-+>BLsr&sgy$tV|(6^e7gF2TLT3i8l`>uCNs7*H)2w%<@FA{qpo@JV%2&$wFU1 zD6nB3vi3)F#)@EfDj4f5`ZUw7>%0G&x7Er)#NO(c+_Bbm8%gwc2gyTk_HHFv6B$1% zz|W@4(VL?gKOe{T@$zGKJ|zCdGR9|6bRQ3+avme_;oAlTh5jR@80KAnzO~h~0(&LN zObHejE3np+6vpD%0n>4MX`c!8h#ib> zp3tA1nJI;{bf?^0ndb zY*G5`%AL{=k3BOQXidiSk{)$8x3rj6Jy$?S^u;_^2hpQP>)XE>jS_xArsOYDA+c9> z;z_01xLx7mv~N;sfn=OYxYoj_$H(S+g1Rl_7Iey=9uqOIj5Ns2J)@_m=P{($;d6V< zR@vAbKI~`eLnh~ep7b`2KYjDj#_4mE6!j);hGnClI( z_+gi4Ipeg0NFE>GN*v#6x^v96R4VfKB=>doX4e&+zxGGhz32YftU+%*OOPIRw)eM7 z;er8~+$R?}Iy&lFrfjJ0ZegNM@a$M<7ytQSk1-3WH{EnCZ-*-{_9r5Gv?}XUjkx6& z2UO9a|Lu9(J>JNR?st;ke`Xx4MP_CDvABW4H3xHcg;SI*C=q}`-;M|+UpUaD^KX8B zep4zlGjr2eY3&hbrtf5KT9b;*KJUlciPS}h(YJl4w6(SKhA#{IdQLaRPgi_0xK_4* z8~~{FIjND%+1hy7&2p6Sz;4LslJdV^IL|Q=vkwnxZ`loXedqdqjo}7i z&jDpenUFCE17DP`1){m3KT#L;>yDfsWKzxXp-!<6Y~*XUe!MO z3aJaWlCN`fbEh|(#!5WVy5sERh_yp7pj`WWeRjz@@m95kWjD5auGmG7fFRR<2)}^BxF?XB zcO*b&x={kMbQ8e-&@;QS{F#8ciN4aO#~h!-qS4jWHEGYg;ZT0K^b>#l(@lVG9R^}U zO1=aOi0871dYb?Ea68R)ysMOZpz>{-b4#7Ms?U9GU6~pPI95GNvL1DETw(Qa>CQngO-L%K2*234BWT zvpYM#)O!S2l=yAed3v^Qg`G!UH`mx*Xn4%@>fb1k7`rN|?9emJb*9Zl5jYy>6`ioq znr1)o%07^hg^(?sDWon6VzT64ADf1ps0f_`zG&8#^X$ELEKW5lQf6#mpv)_39C#v2 zTif!um3QQ-8-h}+2x{~AKE!t5(tv(JNWo;R17c>5ujHDXHKi`j1VYz^)?Bq(*GE{n z<9=HdV6p$YTM)rCYQ*9@;C?0J+tI4p+6(R>We+I^>%}3XU>s}gSN;(jovcx<&7Wpx@C}d|${BjMla+J!@shU(E zb(DvJB5B%w^K6PJK_ZdB!OX~Hmj%69Gx0OrK0wv$cTk#bQ~E3fu6C|ePhwJvPZYU_ z*8$)~@vZf;p~VFVWeJ=l{<_6&CZJ$Z@pQYrTTOWS>Y}O|3wHZq7PY0c@=`UEF)Y+A z)1yE}xDKb4TD#n71ynxnLY#4Fa;s{{AH6=F;*u{wi@+qK#X%aa;z_wJ>~U$@qvGkY ztW=Y0{#)^+uldFDBHo8uw+nuXrQU;$(4395)yTs^-=TPOlhSwg<$#8$G6ldyd>U@b zP@$@9>>XAtD`aFCWm<`>i87Tc^N(Ri9RKxky`qMOhO<^jzgh?nR&$I9+sVTCGTrkzZud4!!}vH&)VKTP{DUg@$cTRrsG}3 zRgH}o-NlL5pI~RxgqOcVOA0n6${Ep`&Spe&`ma{AM2RUUO4Bg zCFGP&*vK!Jr@e@@XEllb#$SSa;I9CiMF@=NqpbIlQukBB#f8oE4|hRc@LY6s?NU~N zUiw_x)&Cp`(rx_Ee@M@uk{NvswM8kI{qmQ1D$%tXfP)XQJBpBV7{@8>C?%95@Zs4x z>V9nXPp0T)jK=Ikoy}zul>*?x04eA_C2L4JR8BVrC^`-)$OER%SsWHa=X#z$UF7pW zOFF?u+Ovz+3;72qQ90o6xWZV7ap0)!Alz^DJNM_jfKP}XAZR6>V2m6Kj9d&mB!xvz zlqekF`0U3XhO0aK@B6FK;=SkR>@DY)pa-$1vbtx6>qAzcB@f2o)Am1rA*FVP3X}Q@t6KCyI{n@t;%%J5S53Eqk_;v`F-{ z9U&(1SaqwDe>(;KFGib|v_XX(hJf9sQMj0odsPb!^8@!kx#A$I!=GhkEQZ zBPT>CB;XYTbkKt3j0F~O5ev(b!|4D>rr+|1oDwF8QR%QBA+jB@SPnlEYq1;-KhdYQ zzr{ok{}sTkeyk*Z${EH8plrWxi8Mjc9H*}&ckW{)Ci?e-{biZ2s%heY08L>@B*Aj& z3`DQ7pe#*sDEwXN7CdEdry7|J&jt3uU_RR%9lmKPokyv=N`k2MYD+EZK6%>spw-u7 z4vBWqv*poDyJ){d;W>Dk0VZiPUA?Tcf4e-sW1~KNQ=_yz!tPWr0aREWt$vYINBkM% zF#j3{g1y&cti^?if2|>U9N{Yn;;8M#kEH>;h{8|qTsROH*aj(fuMH+)-De|sJt+!fT7~8vm|F<=F=M(y@B>%%;_%EpRJcyIq*F1gX$~&ejld4gjqkjRXEJEYUdpN24o2WniPE-^qzV6$XE59suUc${l!aw#i6(Q% zOq7ezGDt6bo{nF$+4u0Ue$mQg>~no!xR8C!aPEtC31wl-$h|KicX7I<8-b&<;YWo6 zWas-ET{=8tE=U<8am)cSof4GcEk;`PT1pDsFB6x>NC6wZ^@H|*Cpi+B~RE0wC!h9j)lEq7B?7JAP47-Zy=eu3^K$0D#d?kS5ww3O1JUbE* zKL+%!KZs?vZ5F!BlDa2OoUnyNT<5A=&jUrk*s0SMrw5)ELAD}-gv3&AZG8uYS2`JS zkcf+Vk#SzsNAwvcap=$nIr5M7-H>N~sTkBtH!I6|KboLaL$taktTG?lD6Y#w1US(& zA-WhMD;3GkTPZp8N}~6X#!ezFzwaRb6wgw*YHdRanO@}qtCTjG@~SDJK4-DhU>0RQ|ff3kNc|HX%e&?#re zf+cmG<--gCn^u7wMw7|LJjdKUzWrAOa>E3}ut;<}U5r|S&{ zTWPqvRF2{>eY!F=f4X_twqPCSQE5v1iS;tBSkq>iQL2W?iR#79OvAW~D+810RuRSZEx%U~$C0w4q=+>tcb&Tz8-_h{p&e*!uUu z3_r|2Hxo7Sw9ic7SXB9?J6u~b%)fo9_XUz8zpAE2!`*_9FZQKt)8&xMm4OmncR?d6 zsHzEOWS;<_*)gN}>X4El!4kafmSs)hvk>{aD-v4 z?1UN?H5+;Q4(B5V^R#ij^e=g5=g#dGPaj0AlhD^4c=GR$n=5y2(>A!S=`o;ebZ2`l&R@(hS8;yaX^Hlm zr$B9_b$Bflqpb<4fL%3F6eo{~@Q(7hRUOlBvWpS!36=G98xaV#1#Yc;=`c)G#?Icf zi;r;j2`MD^>U`$3|HmGfg>g_>g#meNTC?@wtG2d%Dg*?1{E$$b<}^@kyWeuY6%^pE z*`k%Q8@T5_E1Ce1GOdU({p}~iyaa-u2Pbl_Z)j0+a&j^|e)x82=0BOSv9Z&vY*URz zrVdr!6Q(0YVa*YDnRV9(#57Y*zodhUH@f_#S=rg8uxFdh{q9Ae9#HVB{BXHDeGgDl zVj7R5HAv6wV3)S9z~vH@u-TP-CE`3LO3v(_{-$SLhvp!A6`5XDbbRSVMrqEq>2A?< zOe6YU&c@Dz@3WsI(R*OiYJ3WI=8;q z9Niu9M}Lq&X=O0<7H6W-J2M_^yvnzfF)D0?jPd)X@0T{d*5&#m2moBs_m^FHC{|7x zk5$N?GphQqIP%}4pnYP_USrvgFh;@^%?EE(8nEwK`#-oQSgE^5ZjrH59D@9Tb8&C* zIqX$!Bt73(vxl!;Td|AQQ5vw*G*!ik@@>#J4w=P}>L?rx z0~mYiZ-80j2{)Ri1E9@e$yzl=)~U4TVlTzvMZxK0@oOwA9AAiiX#ef9Ce9!zi2ZK{ldcy z9-NTzXbCJmhi)cHyRPf_(`-lt-*Wg6W7u}ln}R~A>DTNZE{)zTTp&Ye;_1ZEsgc^t1nss_s;5GoBLzYEODZ^;Z2m^;U$rHtqW#lwmu$owPCdf4URAGWCam zyvkqbRXawT#43bsCNTD2cy>lwJm!s$=55r4#_daZl|C~{P2CQElvE$opz4>aL2I+X zg+BB-4i=K>UdB+MAb?r=8wA?FSH&?- z8G?lCc3j-_!%kPgcILS$8pi^(Y15}ZF%uzW_0i{{HXOpFB$6?)G&Vr&5%ttL!qO^0 zaikQR_J`3xzbGlE#!hSh&#*t>#q6H`wY0Byu9c^~g9JM9!z94pM;L>vhMJFLS6rh) z-P7VAcL1=%K`z=GIEaj(C!v5k&|8=`v4l&|SH$3%(titLv{n6{J0WduiGEQ^kbF5! zw#o&Qk7=>b<}~M#W=mkSfFT|B0)=82Na?h(f`t;q3+uQ4Zg(x)A7a*#QCL{$Z1JL` zq{MX`PNqkN_KJ#%$QguAv9FJXe38RSs^`bEg3aSY6#`ukD0ea%RAS>Ab1!#DxXn8( zNg66&91@-h>1Ogh7B<@HyH&VCS34iLsKWWW)DmQ(5?p52_(g^`Vmm}OVna5X9ba-J z*N~FZ8~Cw^>S!So!7U{bNMxg6psMD@_+hvI8NFl!Z*C!K}*u{lCi{ z5OiATpi&UwZ!$GlmzCOhO_AHv=_&Y%On1^;Q^3i$X@pT?DAn6Zd$mYf$qbf)PyHYp z$)(jn@obc=jNZ(#JgqpZB75RO%eXdiO1nw4^KD-wry|$&3!ur3&X#tsF`E5H|LXhYX?AVznsd*J z#X8LYp}T@?usmc`voTuGrCDMM7h4MbI^a<7o7?CL@lhqF?1;^=i=QN&dn0h7;j40c_?4eD3RLt$Uqy=2!|a z`dfr_*}sMe2ZGPV-a*7&l7q6_zrT0i;*abWyIcvd0CsoYS|5O@8Jb`3VpD4{;Indm)PGvZeWaqZ@T>O~)5^!o zv$PrG8C9EAUGh~suWR#RB8AKsGUTFel)^^hjJ3)S2nYbtYh~AS--ABHxe#JmJ<)5} zl>6(4mdB@x#$QB-R1nUx-L`u73+3~R;D2nbdp=+k*fN970hvoVnh*KNef0h5!k4~% z`_^=zlD*w7f2tm#GXTQE%S$r%h@N5d{K!iVHH%fB?>MB%@UTS7-}d`(_gR0 zJP~xBeu8G#ygc+leuzouyX!;g_e0`ly+PFpoSga?+W^+pi>+5bu5$p52}yFu%NqLq z$-g~U%*dP%_ZKxg(jPx)^?~qWc)%N7gy zfVeBQZpNLFr@wI59W2MB*oQ_GA^EUtxv=YTu#no{L%h;>qIUiPq%6)hmdZZ%9Hmk^ zsGLAged?}p1EXr%wcant6ln4-Obpz6)jBuPj+-$y=cAps=bX$`t!p=D$2OYY+LV5I z{QHUZm&ZJN_HviSVNWELvTQ_`Sk2I@SgS>pCyf``;q?7@lVqju!FPOKlQG=IQw>Jk z$jm4=7bI84gK2DQ1rIfMRK2nVOdk6VPuW(0I$i&H#xd>DQh!%lo&JAqVcF=Q2B?j%6M!Bxw^@Ve=zn?2$ONxcV!zn zFdd(IHK-4Pdw(z=72@YKK)_V&PfLr`$&l&d^^g=IHGOkqEPkcKS

!+El6&khJT#c86B)CX z)ESgBUpU~u6MMB0&p5$}lIviBCE-LiJb2K!XV3S)Wf!pUNa1l4q=>yZho{TP9N6lw zk4fTo6FzV3^k|75;lN)_a{>F_U6Q87pA`EA>#1`~-iTI}2XkiTeslOd?#qR@W`yPYH5!&6^qcV&GztU$7J{T8LF*gdG6n1fYl#ja5eEwY zKufI%{f&;;W_fztqVC&)_93v8V+WzeWH34P^V7rZIuCRQf-Ib&s?Y$xxE(oZiEu^z zVn4l@ndWM;n`HXVPOVZE}b&~&& zR04|x!Y7-bfbblS6;Vu$4o9ejaB!;)J5;7{MwSK?CTS+LT=6b>A=UT&$0ua|#e zIVyFNFFwJ4ffi8pu#r_G$MHg*Z4$dH|L$0s zk(C+KE3Lf;288LKsf~b#f9ZaOiQ)Vc3Yu-X%l;4uXkd|~EI&b{e+O|DZT-umaR}@H zMH3;0ZdSBhJ3D{m%J{>OynU4ZKnkDxQ;*+e<4E9f0F0twWMni_qnDDDBmmBks86US z$rUPxwgFA8PacS(Sb0GGjX!8YpdiF1Sn^Rou%{T%!<~J12o}1h?TJO?c7_DN^QHIN z7BZBJlnB-@&P7pg=)3;7t&`cJ&c(&$Fh3Aq3e5HR?0_nf1G#{vWl}+NnneXCG-t@7 zAze}0OedQTe`a{YH)dnN?HhZrB^zLS9dc#1+bh)=qn#s$-iHtH7rx(d{SF1#T#xn>TMZ zYaiMCqElRQeCcTL(`V{*Zn?4*fZq;{12tyt@X0XXw0~s-EPG`q`vj%@c5izh4-W+% z`x)8@>I8Qb$5W^u+p?`@pRuIgAZIGF!p)x6zK1iUpKo*_l;$0CE4wUx}&&jH$iJ{a&*#$ujMIdeV8i!Gq zJR*Mj%*J8Alx<{dF0}iLw{HUyTsXs#Ts=ob@OR6kuAZJvRC^voh(P#Y%jFP*^m~O> z>KhQcJ?elAUKT_0;_>PUX1lM=#^`R28MGpc>|7L~^UZw!q!13NbbBK3h$_3l`56*=|)zk7^gFRtcW} zK7TQ3v5p~TFZOa^MkqyxG1!764NyZ00bf;s+|~ZeTl`bJ>{PcLBSv-Cli zKMdM0XY7VhCPN>4{$&MF;8s@eWav3n*Yw0wl_z2>WKWxEpv)E^6tZ%|U-0Ev*Ch%r zL)Ae#S6iP>T5Mm8E+{!hxT+?0;MP|9#x|uHf=TFyeQ1Fxh)GL3?CZ?QI}`aCY7^7> zi=dEFu0F*;5B#%XYx6*}&3Kv9VCa51>hb~p*x$PDvQKgLaY}0jWUfh?VmdY!_(a6Q zu4?{U<`Jj*S2_+*Kxn@4b$mQHO~20|gsks1*V{A!voF0+aF#1!(Z7QD$3{IR{sYTF z*ab#7{Y*Z4&rzkyn)wPKp9G=2GSAl99$A6`(On+v0M(c#c_TgxX->}eF`|S}*>E0m z_1l}`ZB8}a9qs^sXvKjJhdj+eWfdVv6#EfZ7+?R@0dfmt5RU$U)-M!*wp~Ibml8jT zgCryY#hBfSa`WfOAT+wr)GRc84qt_`X{AnxkFo{FA)vgn0tBseFyLzhGTb@Q`UN6} zwv(pMR9h5GM2wk5QJWhIwpc{o033RFFS47OTVQcwF=x3>5U$!TdlkkYN?+!#4#@y8 z9Z@=yZ0{+69L}E*k5kGkJAQ9?yja-lFa^wupOiuKLlV_}m7y#8gfJea29W)ev{Ng1 zmMglW0|v|mL=IRcm2p_j%t8;Z90Z`lDO8dh-aZ*8xanM!Y(BUPHbW{<$aDb@+-z^+ zwhQi*%okSqu)mkEN)6vv=@kH!9t`05SU4*Vi{|E4(r) zl-27y<~}`4s|f@?B_&^E-|Y`*yLayJoEaXfeq`k1d1uG9(|M!UQ+-|XhYXsUh^p7; zg^NEzNLa;sTO@>?Fg$RpVbOD@IP7L!mej_YuOcR?k8f99aQiK?i+(6ZUa7v? ziWEK9VgbVk>x$|eKVD9dBzG1h%m06A(b&S~t=dlHBcuIq^PPE37D~9f6hw?RUAfE_ zPMiIr`u)8*aQq!SjH)>5|V zLQn4URxz*^WSi`9OZnp{N}DKTqd({*A7!H#l;-ShnVcA!5z0_&tKwE&Opd#Q;~GdK zuOD_^_Y#j!l%55p*~l+JuQvEjapNE10rIhfO0f@=52n5~MitxfWeMoP z>)GRU@sx8=1@PIaNG2SGASH0n#EinZhk+&#{xojb)#i1OBT3x)My~h_WbqMOSn}JJugr_i~?mLe}be|;YKaf_h$%|Y10~l)%s$mBm5$; zOonF^fv`c7yg#`w3X`(W%Ovm*(0cxN2^vHIKQ;IiPmwjARwErw^33Y17cf7i5qcP7 zQ+qIV-k9#F0fx8(@UT|l3lT&(MhqiI<2yd6HTPE`F`5>FoxX(Qc-n|Qs)j?D73$%Z zdJHMGfnP3`c>o^*5ON~?aQ8iG$h&RARL!NQz~>|U!1*~O_dgSyS9tU07+dSTrxVf*#ju-0dS{W(1wddh5%C0 zNH`YGWVD0TYaw5?fYeB_YpbylZ~4Dw5DCHj+^_1(wU zY-kLdHwmBuGko%77)b+-gu`(miYZv~sr`}yT!)SEVNQVurs$&{C}%#>wRZ7#qIF&y z58R!wcU5ihKWtiljKuleh_T;%4#rOJ21ESem`i4uC}SJB4;$G(qA#$Ek(%yeFY>E* zLNqrE&{xK97~#7mmfJG^h?=mSusq+0l*gx$?w zYE}|Ek1l@JjM)?sd`^Qzf84PHnk(xf<%FQ9XdKk3Vsr@sa?)*r z0brOC04wA33kT{72FnakWx>qw7&p}~2h>Yc!&M$8Ch(AUAI+XvV5f$bHftC*2uDYfqq z4|}!uOR&RxxS1d}#E=8k({C9;N)%rEMHlAtnK8)fy*2E+Pc(ZNtc`<;wqFBCpFz|7!WA*&rlf~tA&x4f`ik%i}*>dLxlVU)K_w~-O zsH}M224!Q&59*UqAt`VWu%;`x<(}Ryf_%Qd!1@Kia>0ex-Vh_hTsEBzAY4Ba!xRL^X5&xtk_|s(}NPb2i@S~3NBkv)i#G4 zFu(hDwB-|c8Jffgykz+f~rq;`~=;7!D{f3$_7><&2MnL3JSg0 zu{(?XQ|k@EhUYJEoF4eD^S~%t*$)6H-hteh8Ff(uii|*xk&hK7u-X&J*-2nr9`AHkVr8uIuXR=Z~X*cdD0(L__%JC40 z$@2&8DJ}7ivyzgMCLg`mH68Q(80)yKX7BCdAJs#XbLifmrp!w|MEB9R7Is4l{HI_) z%9l3%b;n*KXC%d0!6h~W&r{-UJ^%;Q1I3?4U%HIkD2Ppyni(sr_i6{uYMiEgB2S0L zt&iv=7zd~30kE|jJ{P7G$gFX$j!&hB40v_ZHCH7k}J=XrbCV=v9lJK zvmT>f=g=j%yoA3!8A05ECwn@PPX^giN<3d=MW8aph$s|s?bS5V!(!{9cYbsr(qyx{gcx7mP^1r*h2CI-k_ zj`6<(n@~4ccfV8+JdOZkc9G^1&s6?#$kRY~=q(7)r7#!<3m43eJHYXm2KBA|gM)5l z=^`)$h03@)BN6(reS2Pf*1SlBRH)XSgY;WM9($_#WG9Y2o7F5WPO~$^h z4IKx=#~5o|VNgFIHuiWn=$#gNhoXih^6!BV>=CySi}$WjvLJgb2AWP4#lSJWMsM1G zZ*xF^Rj*Z+6z$tsrH8^w`53LUKJ}2Rh3K6+yD)XZJtm|2{8ArxV@LSaXKe&j0==u_ z-nHqCl<_d*O95`P_%`|5&yr&&CLmL_pMfdP-o-w@cv5v}FNd;9mNu>@^c=^`s>}H* z{o>x+?8ObLJm=t-FE&SieJ*zZF{Gj4|gq7Yl)nIrIq|kRF2;0v4!};T+h2Qfb zZNRiEC@!Iu{abAPd+rb3%@Lx@8#%d{N0yeBCi;M=YgTZ)qGQ5aXA8@9mWm3DJk9kT zG+a~fgq~=K;))_TzHtmdd!sdUl?TtM-@?@-7eG?PC5T^6Pdl_P z?F``=i-fCA&WI&T3xz2Qo#)7VIM`Vy4oUM;lewTpiHE}gn`~NltMVwv%W1`hB5ynu zqGzAGqFb;&$C)NzQIAb(!{g)80pWHJ-QE@>AucW%ymkyQeP!8 zl73@l7`W9|Os}(~lc%lwcNDE-9>SRdSD$6_asSyeU3JA?v3{B-;De8*13fL?KMb>l zi?aUNMSlxwfzoFtV=E2pY?7RNl1!Ds$QuSa?Bs_FP&UC7G;x3pio1-RQa=S~Crdq- z&VUpJ!HjNFx~1*-cE7H2a+VaZdY-*$wp@&KMDG-4_QjGv>!jWdw-HUKwt2-(PGayq zuVbN|ewizrbl)aVA9${-LRhky$HR3sUn~!bDYn41JFZt=zRF=J;Cd03cmE)-&UXl+F;Yx_vBfPTD;Z3>SXTM!IOrChUp2I%W$N(Ti4rummrxWMJ6HP-sH+1?!y| zOqm?P{V$zyUZ;_XX_to5iG&?Ou~s^(cBGgZpP1BKz|mbJsI&ZW;MXV#m9=Jh9fMCB ziI)dE(kuFi!}Qb*Ey$^zSeJ+%US9MURzs1mA^`VcvslDWc-6Bog0Mkwg1y@Lq){;g z4$`Lh8iz7Sqwa|YReB@${+fPQ7}%%Y>Lb+4yE4EwnItN)~79EYnB=cvHB1ajo-r?1>|$nSK~fb#^c`G)C09>HJ=*a%*G6; zk?wH8PC*+Veu=#Xo9SmBf_NhBBDwy<%jv-DwGNEGy6$3=z8!4zZ_eAuOU1%!rYbHJ|4TSE#S1;zwtiw>dH zAf9pQ=V$C9&Y2!ngcgTC;lX0kV5P=4%gk)grONmbcc>isf}`wI04{oI)?pyru_qMvSQIm0FDv{|ajhe1bWlOE;_*E%e@A z`PHD(hXEn={I4SvzFfHq2=eDvznb|Xm>YHQ;@U%Sk(9^>Kkp}17kA1M8aWM^@pT8Y zRJqC&_E8o!T)X$%js5mJ6eB9aq5P5A*gjLcUQs(M31dFjoh&W#&Pg4MyeRiNh4CDWaNpv; z>dq|1336k~rP0pyBi{o@iaN8Ljjy*>dAh>^GQ5+%Gt{4hH<~)%I5*syXH#w+YLLrT zWiSx1urTT`#+;Oq6-|Gu9+1Emez4w#TQl{B8*?;7A9G~52fw#DaoM1?%25x9pLE7W zmwF=b(ML_|GEC#+3{0GFgoD4Uz}l#_#a;W=C~@C`-HbYSJs>G99yx944Dg>Y35jca zo>2R!O*u(U3Y=kjmB~#?YDx061cj9R1je@l#y3r)rXuai$JXYEXPVJ7ldanWr{y*w z9>##yN>Xwle6YH;gv+GzXa4hN&RdMDEECEG5XEvQS`i* zl((4+6%I_ONpBHY*m<{fMSVSEnxz-uGCyD)BFE9?HndxP;I_meS9C$9N@E1#qL3Yad{Dbk z66%FRiu2glWSey!QknCd$?fnDw|mF0H#0ZtQYeKApDhYi-6)`mA6tk-m4;nE?r;Yv z#i}nTz0q11>wU5DoqajPCdZ1@7e`!WYG>4>8)xJvwHGoNE3ncTAQaqPXq-2wndwq+JS^xs0%dy8yT@1!rkg@rSK>oy#5IZcW*O zpN^<_51Cij1cFz<02xi2bZ*+VK`_7aZaQ3eW6V-WHL1=*dbh-~B8 zfkmBWA%Z)@bjz6ftnxEEWv0%o21CxFhSWfGA&#C95P-T+FFXd<%Dj?*OExq5kmq?? z=k%6L!SV`LOx5_lmZ>Oy!Xt9>d&xg zo|Kz?;LX7x)o418E*(@Sgdw8Ni!u>a3V!D{!CE1pXrW16AiDWIL=hs5)8t$TJJh@M5aIyLjU9<@FoR!j!@!x_LweqFn3CQ-As z!^NI)a2I807y!d@4HcnunQ|{&+%iO&Po+u*NDjVqh@TtekC2;1q3X~ZtSw8)fx9-- ztxiU^IombwceZWiM@o*=Mo29k^k$MP7!JnBx@+r61JVV^D>Qyj)ngCvCT#YuA34(p z3NtAp#YIA>tTmZsWN7Fs4?$6R2ucxhGy1GHU3IiksL{D;rP&Wvth2OBhbk-f_B08p z5Jus~THB5sQ#jv!b+30`LtAJB~zUCQHQ6H!!;R&Y}ww|(&)SoV!j zsZwIl`}P?BmZRe07B(j})OH9dya)e)=$e~gMDpA+f0VcQ{iOSPqOH~4o+nrAx?39) ztU@dW?8BsieJUN1$v&kWtHHQ^FEaSr8WI#DH#JwLdh`m=YGu%Mq1@mGNKeT9rqX|< z0$p(fDh?EJ`;uZ}%ZiXGNov8q+u~AYoVw!{tWRuIKbH;m{ba6-Y%bHS4^);h=Dr2$ zfi&*scKHZ&`H58Ah5Mh6f(PLBtZ(70uPZpJv==^UK&i4cFI-%`9f%eSxSN}~$?V4| ztHD$#wn%w5m^eIFQu)F44eSWcR_$WuV&y4q_qM&9E**2X^S#a-s1U20&_);PT%CbY zIJ*(uT>Z7n$cMC&%SkN{Dy31ahTMwlgDM`YI>Ir1fPACY3=JlcWy zey9y;*N`;2pIlk~`&DC>)lBFyTb4s_GiCx@U(Znbop$b`wdt4O5oAgk_X6uu31S72 z`hxTW$~ZUmFa_$ky~_8hjKoeXJcK)_QDp?W>>HhZ!|hj7-RhFMkOF~%9AAPx6}~@0_H*09&q? zB6I^d$>J6a6394>$SP))567$Rog>N;^L8z?qk|tfEm~>UEV}&{43Y*%udh-yce`*9$|yOCbPkFj zA>Ex)DxiS0%utd;hXWGA&>|rqohmSNBOx$=5(3Il1Cm3hFvt+^a_{}@=Q+OPTYvbq zShLnu=XGA^ecu8C0(HKA^&20Kx)Jiey*IPkqeGrwq2ONR>0e?bR-ZQy? z*0ua)oJ+&}#2jWR#Ri6(vvP`-avRHk=hC(P-&609j+;i$y9~$R9>~hAh&jvp`K~Iyq9p?8s;c6Pkz*J9A>6jN z;?It>NENv=QsE~jg{Q=s9)tL+u%V#(Rb58NgM0kDWOV*?d?yZsMTZJ z8LX%AFQb6xJAVhYF2HJ;)d5Z0M3G~IRO8_{09@rx)IV$KY*Do*p(-ccivJ_J)Wb@4 z4`SQn>PVdOuais?{(e)#MBu3vK=U*$xkW>$-J~b0WxL)Yz9qcw+@TFXM)%V;oG;uE zqhJ6;7iXQ$uPUyb(Q@w0E?K5KEH!yS~-XnRWQotdpHA@^u;px z9?t%$L2>_4It!(MUJB`EoKAmm#F>0Q`eVPB6M67n2ViEcZBK&Q3>J!c+2B2QofxKQPh0{8OrI%fHHV3^iip%G}zrruddJ)^V3M!)1~+P7OFz~D?; zf&v&f13cnC=pNXw^z)7TV=o1k$E8}g-b+<)tpdN)QcT0pC%6D$|7J$Lq2{AO&*Wrw z-2tjX_aBjXlE$TQfkyx%*RXK%$+7YYEPnoR!N;7Wj$ zqfbBM^xa%F2bCk@!RA=CmGiWa@2;ic7FLXMf|>eAfd=DiaihWaLXd4MSn8w{QRn-& z9ROlETxOyk-XaB?@?7W`SGBQR92zmrmhkgkoO}uJRnAD}W>ge&iz||hz!p2CUGh)I zsw7CN^ganFzMI(Uhb*&4t$*M~zAp3uO z?cuvipvyq--fJm9z7WoMFB*}7PZ)iF@oUudTVTLWET4>E+?@9u%cmdNQ?h^C^I;rt z3ooUmD6@T!=M+^!k7#Ol*?uONQ%LU&_@oJQK3JN!4OUFlhvj9TYP9N1|6&$?0W$Zv!qvb2t#i5dkH@t2kpHLk`)lg2By6y%19ro9G&hzL zQl0lJ>?gXduDt>p*);uZM$xY|`67Olg{3mD<#nub&TCkMTu#3LnRh^NILd{&A!dtd z1uYRbt>NTvnp_w3*VEF()-EtW*Yy9{D)8eaTHqWIzz#kVKY_gk|9nkGPi7xvL?@q? zas{A6(ZiRo26Gr4DKeCv7@@h&0I9>drj7SSL6x1gcNwcm62cDK0j=wu>cq1}0klN_ zIMQ;=-}XksZ!c4UeY1gv>SoqJ6YwI0YvL$<`}Sr zjY|qWtLtyqK04_y9R6h#SaXGmXK#Q5FIptF=*S+(N*y`L3#~|O@6sCi6dRI$ty}9H{Z_7T z@KZ+_7(&is0w12{;p%}f)VpZ^gnyZ|ad!tl?Bm_Pj>V#}FNLkcI$shQhF#^ntinSy zrs^=C^N@J$_hpr!t6TQOL@HV#N<4XYNGYy3&$u@%7Yjc9bwep@Q%dc9xGnrdNnj#dXQ|SPa$j9G*q{PRdw2M*xh%% z{8!r>wYnU#{utmI=eNB(P#N0Xc#dQMq%lK~(vPzW-(gYRIac3AI@Pj==|&gq2L z-uIHi-iHVSnF_yPQw*b{fA3XCWkSA}`SfPr?gmdP?k#mBoRl!Sk{3COIKvk*2cmQo zXAj%g(AygjS#X}d+J7%@!D}xede?~O%j%!5g%2f-7L=I3&K5aaTN_;))=hY~-_79U z?nv0DAz3?0f`epDWv+8wld$2TEdD{lH+HYNLhl_JyS8ddm6yQX|Jn_rNpe{!Q4<8C z6G6>Y4=r-3?mCKQr)Mi3ME^>59KMfx5Tt851%Ht-BE#if=kqXLXtyJY zn*g_vzP-~Tc}U@FjQjUaB(hIU8MyZ-6s0n4Hf3n9W1KN7!6mRG$0R9)38ooC+?0a#`V)|^1 z=6rjzMIJjD=yYPc6Sqb$ulNwNd7^)^wp}`=hsBfxo^0O@s^NBFpGv)Q_v}Yq)PLWo zFC_O84}@?T>~7f`>NQ^IIXY1PbdRxbb6ToVp?tMn)`h|3Z9RqSgFi+Hn)HM==i!>Y zJclXQ;%CjGcXqT($&4bb8=d?#r=IEpZgVZf!&;)4b%ZL5ZZ)(th}c=-F4&4 z)^x=9TJ*aslEv7rmZO%S|GHnI=Jh|1vzs!TQoSVCcF-->OolL};!^378}3;Zxd_O+m@+gg(} zb|laY26kngc@sfPto~o~0zR|Ukv1bEoi*;QVLDpH?_KZp1SfGN#Lb}Uw5)z{^z`V} zUxB)2VP7oTJoMm>Z#L70_4#JKv}3?Vtk?%z4`i%#OC`9=k>`)2Nl>74@P~6ruG>ut z78t<(Bp8Y^!M5o`p}qgf151hFfK#yu)l4Pml=x(2(|Pk#bqa>L_u8weRG+TY-2>(A z2q&?>%@)~OThkiL&KSzspYR%5IvBfAA}=iuJz4%cU*rV9Wm?jFJun{}^$BliNbPF} zb%DIAQEVNk)2pDaEJ9=aYc3r%iSYJf{o)rAS55Ojug-S0_VKlkJM7@i*-x^c040{* zpQDGGfyEPs#+WwIG_^wEwD^=W>WH89jt;8Ak%9`cDy?SUt8Q_NTs*)0xvC?85Yuj< zDmyx!S3Yu?2OH=LIPu9{c}FSOr}_N~gP@N^x9cL@{l*IqMx(Oss9&*h(%DeBtoA>q z-V2|=qbyoR{c%}u|8N#wz;@fkM@J+vIqaZ6d*hmiRReMK&l z;}?)MN(92*jiu(<+_hw9no9Gb+=_KoW)^O7ag}k~hmO7(vi;zsh)wyQtQ1QRlz>|D zM__3O0Pl>});l`Fdq-<4p>WuA96fB8QxI^OmBN}a^)W%Mx4_{@0Sn*ns2dl=iJa1K z2Uim!*VZjAl7OAH(*6Xh+kah1Erb5D5?J~c&Fqxvlnv94tHeP)!;Q+CB2hh;ci9it zWo}!D9lrUM#!FxPw7jNNC-s}2LvYc(az>M^pn5{@8m^^6Yvamzah~>KY-d=A5p!8Z zaHAsS<102yWfW-OSesvoDKLTm#^a4kEtl%ZA02)VS1sCNQFde(z5I2uM>j|i*bmi@3=jTSV%>c;4 zoBqbMQuvHy(XMed4iSWz8qJU~m4U81uf1>cDMqDgIeO^N_LmzLkbmbD=%L-YQ*xBL z7bx{VnQu2g;qIC?SA#R{@(E{|Ue+r~3O(EYmQ&U4T*rdRwNph+ee;k|we5Hm% z(!8u7uu)dzte(;>4wI!SJBFHP;{LyL)acTBZ* zaS=f|j%a%m7Gobt*SSoA?}h#Q%6)48ys`h%h}oK!69MzMhDfpyc$3jbKq7qVNiIXD z$<1oR{@Kc^sNuu~+Bcx;r|JNG(P^-HS8sjxg1%z8{rE9KvEo$mF(buDEJ_1BawW#` z0vo+eIju>PIi?^v=#-OV*D$CzHeDPkgshZiQ(>{)Vcph~@4}9-t`E&;ai)1}0y2J} z8i|#<_+(fnhqylYZ$4p*;<*$r3dC%JIYWcnkB%?3)feZh5ZcalrRo!!Z;%a?w@1cz z5#H)e0A1;L##~LL1PhY~c5!V8%todQ)IV-r*>Sp{IXQ_SsQ4A??wPJVIO~zCW+1x} zYQ4ua*3s4dRRbpMuH5^%wi2P0fmOJpS02xbbobx2;jmeL1fas3wt}w|G6^^<*tQkVQV;@Rf>ZGhxnCS_T(aG=K8QaMwRy?Sy}ITaf2pygY&>~vUBbS$^8 z(55HQv*8CS`#0D|S^o1<&NMT9k6PMyX1#(hqHCS}FkNV}vdjJ@1w~xNnHCMX2>n>r z`}ow@5lOai2Jx92Km~dd>DM2~P5SW}tivY#n9mYSmM+Nzr00ZKU#S+C2N~aGiY7{q zCKT29yGut%`ANTssKPl8Zvy+v>!(tNF^{5cro@gP3Af*i-)Q?ek@d~r02u-!ec+mv zJQcmg?lKbVQR4>6fJLCztWu1q<Ou8B;O>KwM=4c}0tSnm*#EXRY$)v~Vrg7XO!agkRuh?@FhhBSvV_m&EbC z%K5X!ZB?*MaorHlku5x9PFPW7A7}-%l1#ZZ?bzlTYP&D0PTi{nt4}4UU9^D&SO{o_ zqFr2Lc8)09gO3+4gB=`u_r0_+jw5=!DoiSqiN&<%5h~wq1@5)A-E{oX5k{a5!(Z-= zK|G^C|L*Y)od^IBd7~xG70)a4Ttx8#nAP!4fi(a$jGctk_0;`Hr1KTj(vf|uS?|o7 zs!)B|;6KUDANEg7-xmPcnq6rqWNxX@aU#QXXEwPUjJ=6pcGSX+(!k>*0ImKvaURIe zGXQ@6IHQi&FPv88wf5Gq`?-|L%Sll^CmT_(PM<>tiA@PLHbZ_a2$_&-N9fCZ+K8c$>HWBkkFcd-?az>H+Nkw1!0|3-JnLU{9s7y)vI4w6!1~FQe5+_YeXolga?xay zYqAhz1u7+%p)Yy06=UP9p*9}&sjbm+jml^G=XkA)pV^c`tvDK%IVBHbVkS%M{LIw0N(}MpU5N2W5Gbz!4LwA)N{b_=gGhj9G5 zTf91&>>n4d*!^K~YDLWmLP+eDS9`o{a`YsDH>cm7h5sCVZKIfPK(2TR7@or^a5sDJ zp>6r&l@SpfxWrhQ=0L3e)^^IaRSRQF_3j_MZf<_EC8x6ky9RH}|lYS9N5!%`<^ zcBzumn8iT*IhQ|JXW{sW_BJCz?h6H;sW)tY^v8DV(d7~K)zjygN}$)<8hZ~vdAl+aqoJxa^Iq$gXm#u?oelU>1mnux$ zewzkqoT?z9?EQVyB8}26j8iO-#q)3@waf9WHx`9bZMrtU>%W+ludU%Z7?N_$q$dTpLiMMkP2Le-CgQGB&!?JY@nsiAVBFHDZwy8*N^rBWpuW2pRr!%&{k zTexI|8q2_<^LSU&9fGH}z8xB*Mq&Oq`Q65r_0UCLN02u?MjKg# zo*?#^=AYCdZ9;pgmk~gJ{)M5|-(_c*C!wVK;0lmO0<1~h&MX)d@nR)V?IWX$s7jhj zn(_drEcM=wQqAqBck-`6ZvxA^lK%7yEiu*Ky8^(>;EFyW^U*3XR*b?OwQ@uvo*EqQ zozznU1>#4%mQVb;G&qX}OVc+&MKG}wi9c)Vkl-S(NKwc45Yi=><=>N_@Wq;U>0-4l_|(zXn(6d!0E~F9g6^y7Us3(yrSu zj~Bl5nBJiU8MO}e?oAXeUK7p=2i7y=wXEP)?m0rJWuRSNE9=K~i}(M2PDfMQG`oMk zA03x#_B3AI@y4}xomLZZqdvQgl5d=!b_=7umQFJCGXu8;P(;p`anAXly`_X^a<;e3 z5{;;4*nPszY!&ciIO&)J^69tT6P30R`|~%({=pzii6O*4r3!$axjK^HTHQL>?_mUE zC!)HpnnASAd;QrBIT|LKm(>ilaBBq2^oQz8*1TwERguqhUi{jg)3~h^#48%*;+c3G zMA?n`IPbk|ev}5paA^HmW~+41iCx2RRGF$;Y!}fqj2$cqPNqra;j&dBg|ca$KAJy% zWG8YarTK!p=0;v|P&?S#gj=N6v~MPp1#;y;<$F1*ptR2EaaDwOH8rvU6b!l@M{3?Z z{fae4^_Hs1@&sg}r|Iqx^~QuuYtcX|%M2~fv35WS)xPE;3TdH$^LDo%m$vbZx8UR} zTKV4gA3RS*!sY#o_(Aj7lbgRdas+zyx~8%8O(XQ=jxVNaatN=+@kv=R?SashH3if=p~Pq zah1KUjO@|jeR8GbxBLU0eR7m*3P+LcGzW_<3y2qN6vA!^tHy}(=%A+{SUQP374+3c zWULa3jJtX*t&eub%4o;0v(4iiLsh0#dcuCc^okK(uXFCn*17WQVj?0RnB}v#6YWzG zSN{V%$H zau-ur+n^I)ZwlAqj)Tu!-?@kaXFdi1L7X6nhEQVOE2)r<{x%UGw$Gn$sWsDye2o53 zK7g`W=Q)4g(se7zb>rs>jRYIh%phwML&O=?_R_7au@f){`#TcNEAip>#IQO@*sv7QiR-6V6<3>dIsjnQ}eD-!5 zI57QnPd5S^(;*&r9Elu8-1o8ekJF2Reu1;QNpa5vmJ*DM351TglB-%y%01Y0to?52 zFId&cZC>DHZ{p)7=FjOaqCgJ^sGKqTeNq|75P!Ayt*+^O*#TBn*##+%X8l=5Agy=6 zf~nMCD#*N+K|tul<&>}~6E9%V3SnWT>U;6^Y04e%tW+SY$`-~7ewmT41voo%cBs^- zIP_tWtZ*?TAIe3v;@%-1lKaX*>$mhc=m|_2vij{ZFJlV8>M`5c3X*;02kLH?=vpg7 z!M=5cz<4B2O7AQe`I`+Qc-jU#oDzJm-Qyg6U{VWtM~|Z;N_N&95>*Mv?iG%v7_rI! zYNEuU(I}w)b>_!v+hyhP=x;`Lqk{=}{}=l8a4GdCKhxmOl2oqu@%VCNnPR-X0P@rF z;fv>9(69Uanvy-BwK`loUA5yRwTKn2ez0P@x3IBzX*F>;CDhro<9B!cu6eFLDV`Od zf!+NJhxAp2|C5A8_LM=Ks<;q#7+myT~xJfMLSf{K=GDC2zA&+#V|*wklY+4p#w&kqqsaB-7E2ZPrg# z33Qu?Q1VWPbR32SGFO~hF;X$_TR(faz1318g*N-BlUk_=`!0f7Oivn$7gnix-_fL^ zRPu}%`wHR%DVDr0*_6$2hi1=Rnee$WRmQA;VN2@$*=v%>jo2!_uiGwb3%-qwAEnQK zl>Jx`x7zV=eu@2QKB^RMN!;g-V=E+|HEezm<81R6MoENT{wGB*O%R2^0FYdpzmr{? z=eP9`0e+pJTSVuPLIal>Evug#xyc@apWLcpAIYCP&kgL~Qk@>FpD%Zuwic#z(UXmA zD4ErM$b^dK=FEKj1w+xB+41|=c=})D=m(1u_SdW4l|PH!SvgFv&mqvU=H& zP4Nxbn3~8LW%fl`gKrYXd}X}ouqS-A_vGky{1*8a`F;9O+w;c<7Q#6#x9tZwd=hQ7 zQ~4`KJ~>a<(#SVeP?53tirm3ar9uQTEV#tQj=8qn7hb$Ah0v!NKtu4Q@6(WFvSy8( z>2Hg*FR42My_EX=624?X8OvcHAl7F-kJq265Q3g3dQ znoICQ3oP59yr|L`w~c35WC^(}eM!%j4%gSJ0V9K#GBZmD4}AF3hc~WZ3ZIZrqy65S zM<&G_eDw_3vvOqdFrS8xJaJN>-ZN&EeauajX*|R`Tt-I5b{`msfEVO(OpWaivVOg< z{FE*Gm;I!T{!JYVTaYS^T!AEp+b-!)vH*?MBA$WvN$5qqmClDVofN_UkeqzRrYm-Jw4)Id{mE zj3;{P1@lV%JP<*%Af-SE+lb$jed{Bbu%`(A*#A5ZbIqdoX!G{3I9vYD6pRtbSo#N_ zc$#ZiODUC;Ihc3jXQ#)C^Wr$2x2c`!2%SejcOJK?wBp!66~{k-S`QWiz~Iz6A1VY9 z@>4VQUML4Nb;u=0QKa4(ec<6W&$bfO*5Ibz{0aBg(bv{kHL$96G5pW83kmm3;7{&x z`nzwWV)@toZ|yfiVm?xAziWc~aW+QlbnGq5SB#V|emm;x;Fxv`nEut5y4a|AeZKLv zxWDJ{A*aNWQ{o17=T@c`u0-@s){W3_s$tArT?x91q>5-On16SrM#!eAR2!^QIGY8V z1X-#I68W{7?|HqVC;9TH{dJu84t`ItZSLT=0~ka5)Ra@=EmR)GW|BTmSr)&2rV-hy zC=%2^{yge1S+nAWaX?kzFqWN#ygFhy!^>5s>R{>nNb?27w}3CPH6c>NUVPeJq-gGG z`bkTma6n%OuD+MBaj;^*`Q~?3+2(+)169JesDx>!g-TtE~3G3UnE!DD2X0bjG=0usT!Iims4OUsD0Zmy$*V|1 zV8Ira#LBQB*>;37>^A2|j`!gX-BYD1zPzTps8d-w&c0!|GEb4y6SU$j+3TDo_KmV_ z?DzWO&1d~pVAW^_L?n-C6+5B}F%q|pra@pdFiB7_2{?a%t)wP}@_v9sz$!TSb)RRk zq$9inwXA<9Uh2IHCfmE37b@IFWAXJZSXZx(N&R%8gvj zRcZ6rdUf`S&0|ONTf%KzgledR8HlAIkbuq~50-KX?4~!H3QG`F*7x zg=Fb*|IL96{~o>^%@$NsHn^%k*K8cHoCLsT8k`;d!LxwCQzS${=4~tXBcU8rdzhz5<8SSVM+Y&0exV>-}3^1WXe-{~XvGH2hKzbUFKKuX}0El%^ZA*b~pZ;Eo|zj@bOMV1Wz)i7#TY+z_ICa*M3d z9*7Dz_6h{X$)CKS?%6!Y;@do`HW9x^r3#X(j8!5fS7T7KuI62fU;tBxFmM)tkwT(V zc18YMr2wcK^nQ77Y2pYCXW8)9r|wO3s;H^kM5RY<5QAS6 zpr+HY``#4!*1SI{dz#d{nLlQm&Av61RxD0qpDXTscCLA<-g@uy3(^k8a#1LYLA4yU zJr_vY!(DHaf0H3j*Wvq1c4gyHv0DpMH-MU)7!~&q!i7L0Nz`5eN}zn%N7cs9^SZwC zmXARWNU9Q?Y0w9JBpqWA24_0-Du*e}t_JAVYJ$6}@5>Qss7~EG4a?FTsLDzV-epfl zpE!?n$(M2`zG@sa^(HEU26^mm&FFZ8EL@1R;x#uD?9Co4<6F;Z`a<=s7}dwX*ue3- zQ?|D|zXUZf%QJpu5KnDk>~YG}*-WGVhowMqtmP-Uh){zY!OoG}y5Insv4y%&p8NFa zEeipQ@{n(x9o`cmxAevx$tPO_yB!s{h<#|k#(FTYp&!{wRkIQdd9CA-=|HTiZt|T| z-C^my^RPv%`+JvGTS@^X&dg)MCP2}oaJge9R( z3vTe<>R+*p!)%8^_E>Y(mPnsSfqAwi%h4MgtE=OtdhP1t%$AhnBdfOG9V6ae(f9v*H(`nnlM0wUQAK~#f{WMPtOEz=p z`smPdVx5z35g8fghY~!f(XvSpeULuXv?PO-tL5E(w>8l>zx9m05mgx`)~qzqgLxfA zg5SAhB-cDHSo}Oy6t6hU=Ny911)`IS{J;5=9u`d#YYDLQATrP;QmZ-?q%kMT(;-932 zV7rNm$g9#lbMP%sE}l=h&}Xsms7s*?p-Ln2GfDN-u6kyX^&0pJvj~aa4Ts*4xx~(X zDW~O$7s%k6u`)oNRL!b;DcfbF+=2I;6SQ?r9~1(DSJEO^wUdO`sLp3NGTvN8S26P; zrC~iDWmG}a0&q5(+bz+G)uSrNyWF!}^&<5)>FF6NuVw7_3&x%%4Mhz{URy>#cvIH^ zw^p4JY80%NZSq;iM{SVLCv3oi;Ga0oIqjAXzWo0EdpoC)KrrsMS9<>f{p)KNnt0>Z zw;_M1|E35|EFrAX30Xzs+bfJ^cYeDOjb?!*r`~uP$jZrpuG2S{eX7gNcc4T%$={dq zv|50Rm<*$R?&jUbO~Eu{RJz~r zY51=3wUB`1o7_tvYqS6b3CI)OZQtbUXit#9w7H{e7+89>fo)k?l8*Pt); zwkVBn-HEjpCGx8_kf4;OO|OJ?RsOUZWhG^~Nts${k~HdIhqUE5TqR)rmwh)b?3L4{yjuwyO+MCo z=3Fe(0TbNBwCO?QrHln*Q^il(u_V2WM7yY2Sp&LVyL#5QY?8}SBdd_^#=LEs6Ph1u z3aO}YX777bpluRosop4PmsrY%D@?AP5k3PN;u8CIb{7Mhj^Q6&kk1#!3+La#knd2} z@{My{#?ujMTBYWIJ{~Ju7d5UDrudP*tK>WiFgl!ZX>mPcfWX;kBKSV z6@5k)vtcWQN)RD$80Z$jJxg*l5n>spIIU!SWsx`1KM!GwrNt0qRJS}i<_;!EQEwCR zJx+9|8qnbbmxl@9WrmvNa1%G*PY{SE+o?%*@A0(>#}{geU9bMF8IT)b;>@`B)CL$s zKEd6#qk}Ho5`Nh{6hqoYeY*oGr20DG$sCDry&0Xw^M-5AG23L5Y>T^z>EcWv6V8rW z-I#&Y%d9aF(b$fY&e$Sfo=SEkapDt+$3BedqNM(FR>}hYaY3?y}qq-I4C{vLg(bT$tpUV&6q0*k`~PNn)yeI$NwP-+Nj?D%v4Ef7V($ zT+#)&`D4Wk(fu}nR3740Cw4d0Y&r!_*%moqZ!L`K=58NQt7?}P<81Tel**r_@Iw8z1Tz^!G`pyNE9!b3VZ(Id?-{W49Otc z+(^GR#WelcvMcGc9UL2t>^tu8BxQH?zWBJ>6l%W(s5%$Z4I~s;6!#Ba+fxm*F+GDQ zJjcKL&rB5Pc9(Wlb5lE35eR&79Z4xfNqgD*;i})JRu-MYrNh(x*+%7*Qz<8Gvf%Oov%WSHk}0vB7GmXflYUa&);C#~wEiju zi>^?1$L~RT$!?QkY2@HZw!~KY2qo@(JiW#P^z9(}xB*y39*}#{R_pEVx#=tabS4WU zp_nz{uZ?VhQof(T25L6f@$jYVlMxfZy;ZmOxfX88GeyXyzLPB7Vk^TCV#dQR`a8}{e(FE`l1w4J0OX;mEk6wBspeF%6XD9~_%sT&eM zx4n@0sJ;po6Mu7vExkOp>Aqy&goql;(AD0X)B1K_T;sO8n|x`AHTg(33I>pFNVQ}H zVRsz#*u&W=fosEl@ zVv^VZRofrMDzMsx{9TYxrF^`F)1lN}E6dk9bVVO5(_(^WllJJ^^pn6G#p|M^I)R@} z8jGZIwpe=?+$YPd;o#(WbF;>!k+#8O{QB4LDwW{}Q5;pd zF8TrA3q`q9*Ol;p*$r~2D*xb?)hjJ+zXYLn*7h`KB`c77(#NuDKU>cGIP?WCGbi^nkhxHRNk zq^7#4G4e|wQHyaf7W28He+7cPkI(FD`XlHQ@7bGBXf1F4-Zk$#BWunMI!D(05jJ4s z4B`jF-i=}rzTNOGlvi(+!u3Z%N3R3}9p8s!Etuw&!K@ATDR2VQ6kjwKk$UP){bha* z)~c{L_VgG=tWA%Yn{A-%~%p9V*`~YB^AJ* zjLaJd_K9&haC}S`vJ#p`nLyR%hMn#8O!+16eVHcj=NV%Xcd8lIWaz4vl$sBmeL&`O)9v|EJCfnM)A#*2 zUMkc{#gCcV$C488r01nMF?cQ_xASB3v#$^khDv2PwNsC(^>+p{JwwP-mBTC<5WIo44R!lfT=;PBA?SLgkT{SY|%h zl96Xx)AG#y50`^PvM|2?d0maG(*NNc)=}?5`#FbW5aDxe>p}SHXhg0F9o14_*n1zR zoxw!)WKP27rWUTppKC0XUp%xU^C9u~$D9XqZ5$jAH0~&qci*U%2OCn+g|0NV>Uwum_U-;5fTTcF1u`7K&?6|UIxvFpjbxw21 zn7Zul=FEDn|0-u2uW`Uj4mSRE6H}Ii#R8ypKRkE7)k_~j%K5r%78q8S&ap76kP3c5 z-XGXoR8TvHG&VW9t7g&<{cxX%50e#iew9cK9_uSsUB$KP#tqy0_GP-Med|NA6onAu z5cDeD%qmHpDD4hL~i`OKarj z47|(gl8tt>pSr1HC7m+4X^X*f1rz*_;xe}~4u;r$uS~~p6g3((NviNWr?ce?FUq~? z;qB#~=`~lKuws(l9hlEXMliFIce(bl`Dq+zQuzd~Ge}Mg8#zq)XwH?d%>t4*d+Ev9 zxE;4_#)J$nqkw^>URHy2f1?Uu*)0|m{1CQ>FKN@ykxutl|FkBYbPst&d6Z>W4j|n% z0r}>i!ACql>BdPUJ{F@Yv~)iK!xg98*jRwC{}r>H6GzG}b%&JiY7pdOtN`c}S#u%R z?yEl8nyp3}e`!o4iOBeB{?%u1MA$Gv=CN?}NfWB?GY`qUr7dU)(@fBAkM{QLFZS!`A9g;YWO?B3Bt_Zn#nMS+# zM7&pPDD~uOsoXqz>k)fsT1-J|Bppeb-*3k=e5beAjgSo1Z^yRPx)xWV|Z9GAR%xeH5V&@eBHGdTzPT+ z>)W>ooAa)Q;d2|&M!Ly|XvU>u@zCuwdLMqOmIxj*33J|x81q!sR~t37^HAXfZqFn0F^Vr+b7rokbn%di79SLZhF_!x7hM{do4-}qpW!6lp0H?gn)3N)tQeOF znZ1VbDl$y^oe`RPND@!CXhcmxt%jdQ%6r|Bo$}V@VO0!A(;y5HRWRkPCncLPPy-vxNd_+C&Ez6JDZ>6 zU0BTNhhLgX1y9&y^L5m9Ji45a_2OCbq0-NcpG-~deu5ROu*_c;@WE%nO!7JF^8qrY z8#7)2Szf>y!b13$C;`S1w@KOhP>m>pJ=yJZn%V@{ZICIQA?yJ#Q1L3)ycCz)Gk+_F zg>({>dp(yEd6$fu$W4UKAu8Uw+RpmN`$@w{;{2UgqK>VRVR{h{H+mj_4& zWgif(reXBP(_)wl7lp{UAVC!Tv(>Ba!8LeXV1Usc3&&#oMut5Re*C-x#r6L6rzk%i z4atbIrqZ!4|pOyFu?ER{kaNhZ#(iKk_D`>>@m4tOpQd^!SIud&aFE!$RzPuh7J;cJOpXrkEuP z1C+f;+}A50!{|i?ag;1NavIpEfJxQbzUfFiZ*?9+gqbc}nElzN^m1h~auCPnxKj2o zyBfLa<;F4`-8ne2acM!JHxtT=LBJ_0KP-Pw0cL($y>(+hjyC;z+LtV)OkT_JkmDMi za}6a%=ToZZcXngn*NG;RTfNE_+TLoHZDtu7ZelF28D&fYwk>_Aa8DuO3L!EVdGFPr z-RAQb`~loQ8Fmr!lR0)}b4{!VWA}V6_&0;vP*U7D-2g8c+fhe-G)aiJd)zj|3>#Ew zhd+C9V7a+CkI3iM#tR4xreN?tZHwBG6350$HuLDm5VDV+Y*fnpp`wXJk0|MOvj?b~ z!?=LW;384GG6}i4T#?ko=A^G^N(u^hhy}P*zh-H>Fl$Hn@5AJwM zv%HZ@oq8hQ>nU8KtZ(c5=xVFVk$x{*<84GLbEiR%*G1!*8G11ww-s(dzYJqBRGVOk zbocuO2Z3!E6yvuAaJ7TX;B%_mAcPe1rN(_S_PWNkdWm7Kc6}>Ei>3AHgl7Jq_*XJ>$PP3Q@)eJUAjFd_4kZ$$T6(;Z`tYBR)bJc!ij??aOV; z+$DOkXqGFW6a`q3Zty(&sFtO`vC|M_;|(I>60{(#1I$-WD0H@!Dv z%80*@bCB?O`~DwU?*myF6Ker(p~au*Z3Z%d8tzhkv^U_ z+q5K`h#RcEI=6{8QzV+r-UfWerr%Y^Kd&*_Df9mUPA^6iv82S#w5vUB8O-8Jhd8_Ix44^*n!o^8~$P=Tj9AD}fVU887jF>_X*}9Yq&b zKVH1KUqAAyW9P^aAG)#QW$Tg5(CBhIt@ewew+@Z*@AB=eABa|zV#;d7kwV%1liAnZ z<$I$K9n7e1jmGB`y=Sb|zvK0Lq5ZHzR|R|gvQw|GgUKf5iPb?!ZeYVG^5t<361ZjtGTTv3-TcC+WsViY|oM&;Xe`Xf^%yq@IA zR-D|}eN1+60t%kFD22}-$@tT=?m~QCG(l}orXnDC^NlR1muZ0G7|_d+o`Rivr7XFO z$4|2{gZCn1Q=HV1-8~8pYFJ#{u=-}Es(`sY1>e~^d)dxMs-O5!az;3+rk|yJ6v`Lv zFVFL%Zk9W!y`GeDzJv<=q&A(EE35tWe6seE;uKG^Lw{97m~4cELoEN(O&rENSaTC& z^D{e4YLuefu>e+l#l?ocdyo*Ns<;@XDsL|3ScH?9PULM36gaxv^;KX^_)2APn49y~ zCqe@|?h*G!iM@sAniS8wHN(m(uw_Mip08px^Hn$ZWq8;pGNm2H{SHOsmb9~JIvh}C zMsXG^%tt8Vo=1vKP7k6Rix7HIL=d(@U zPCJr9AOE;Dm54Uyo4)3G?4bhZngVqx$Hk|#*KcZZy0Qz-Gbau+k)p>RZAh?A^!;Fz|!$0Xd$c(3h~!(<-d z{Zfw{76|X`c+t*#{hEuTDSgJ$1zC12V-7NtGfl*LxLk$y)mGnoCTv{)Qw-Gr=YYQk z`d=kAU`TZ!CHqOyTY;ch`;4HXZ1R-S1Lx1~;j@xX;nE0$NXG~t+*15to zYg_97N;=`n$3|%eHCVxT`!;By`fB+j^#dW%CER1s&{pJ1vdZYyF?(~u+@VIX7N&1h zXfbYw@N+r_oZoM+R+4(;7p2+~;25v<$C~Af3`V?rJ5a9ejlkJii?O6j&Dh-~I@YZO z&wPgG@>eFZ2f}ji^_p=M()6yw`sw#S_Z<{u&k-KKdwg^!HT+J^+YY}R)+CY@a}|{M zghcf2dk2xu65@T@8F#TV+Yn5 z`o!3V-qbvbnCp;CS?9N8CHD^WcGD^3m}}UcY$*?)$no~`=DRzZ=`GnKIjh|G>uTCMUow zgOYvx7s?eiV>lf8kx;t8jtNR~c&Xub36 z{G$D>pd2H;pcIS_rQO(cF5DI-Q$UzIZi+*BryOF0Qc~8doxGFfNl^fzdM**w-n#hi z-Mi}mJmcNY=4_mK^V&L~mz#}ldN?fRg2{FeM$>ih(#2JBXrRS^zZ`>juQecC{C;1r zmW-nRX4yN-#5j3}e^^8Hn@^zQq1dXTGB`=g67KDK-b(R>9oyWK!OzP-lhZmLc4yf~ z1?=$PjxBaR_zw|k*-29xmUO%~8qD%SLL?Pvf~o1>V3Rv5MWR#hMYuZf$vmj)RdQlI zFrEZ^OT%`oVdV3RdF|GdHT>RbRyE9C_EQ?&EAvD*j^7zz5aJ!sCR^y%GJebWQcKmv zDm-mR#xZs%2Lvub?}}EJ#QlM3MFaO7^G9DlOC;iMcKPIb&<%vVM%HnIDugZwIH};cQbJ{V2m3(zjl1AT_YZOD z=}zAXXsrb27ng@>3%uB0 z>Z5%HR_|&Z+l|Bun+t87W8_ZSY!WM2I~s6w{?)5{)n(Fy@-O>Syd?5le7yC}hK<^U z1|I^D#j7V@W@5a-xVi#BU65GAR0vpsL4V7Dw(IHhfqd?TuJ|1Y0W|T-&5gdn2n?mm zs6W{&FM6qeQ|L3A`xW1R+s2S}EH8DQj$TXl9q7~5x|Sn(00hS|I52*9=r>M$-H=x< zrBNp{EbbD;ih#<;*O#NO$w|f7-0*K$^zw;c5&6k=#r&RIX`)4jrlLk#r~Aix6BX$O zGx3GZ?AbkA4Wu@EoSEe_guT-#IaIP3wi1$_dVW*)L%DEKGIi-~{ltMD*2gag|IE7I z8317N&{hYJO-^mByw}2ehzer>fR-xLH0J;NQ~=9KCQfjbYEQeu1K~KWIUEdA)W+KY>|H2at>P4*@wR z?PS;B^t#f?R*H%v>E;mf#5=Hw#@aBamMW2JGL$=U;QoJ=3!wWB`nO2(bPAdYzNvaF zr;A@?*=!v8T3;4YFg&j~@V~s8@@Ya!y&C7cf7^!VtFEk`>~;uj5x@4p(3wr5YD4A47k6w8NU%^D@4J519_YSQ?ptRNR;N={q^s zOM`#K9q222KC38IOOy#fAFY35wb^`$Bsv~~368JI8cq(En(zPn&uXwszdO?i%_d!r zkAEPEcMh`~4zj)Hl4MOXF|xZwH?LiL9*98fzmAEGEz$koPXNBV z&cb+)L~*N?amefK&YN98FqKzYb2UhC1*D>7FyZtL&ncl}2oPEassYR8_*{U3FYY$L zYx@n^fJx7ZdJvO!uI69c+uyIfolzG_z%bd+8+CJB8@HixwVdXm(2UAK2p7EzO&1n> z_~lwK7`mx%13*}V)6=$lEJ-s*ZsX_zFpe}y4w_-S80y=yZ;_|O#i70&h5YRP0f^8N_(RwF0$XIj zVQs@vtv7zm*);QCM?C+o8CUTuU|`i=4^l-13OKF{Jz1^*Xq(zwQ!WZoUWUiUvI_OG zDo2M$Mog@3{rhcr|5dJ#LY1PftgH-1_JcV7&1W^FgY@5qVKFcST=E1l?i#oSzBmkn zJ?~i1_kW$~{d*^F|B$>)BDH`8Ggos=N-dZi$ln5t(1-l|{KL}!xK*4?q??N>rvZVB z3fL@b!2Ed+0l|NL6{4l3Ehs6kzj?e{q(WO!o&jA(_mG|JW%pCOyTH!IGm)}ulvHWiB#^;dThdQ@=JTd&q{|05srTu;?!J0w#0aX0$|Nz$RFJa!O#luij9R zfdTe#q-OB+xZzaR4TIvV1C&|azickHZ~H_{hjPZrp3Uiv-nU8$IsUWpu3#SUhuT_& z$tMR6HJih^|F}4b zES;XUzH2%FduI(M$dn8M^4#m+XhOKZ=GopnK8F)AGZb$QK-Vu5=2BFa%2ay)Ep&n4 zCGaD)k3h9%>VZKzhCjc7vuk~#s6K4|9eLMtHAg-So~kS`0vpJ|8+I~KVqlLhl|Eoo{Os^ zrMiCDV83qwxi|z^dCy16ZIFQ2G2~j5lW{^wIsNi&IDBYm=w;d0h1a`4&|WVEXaS=h zFp#|nBKBDYE*H3N|M~IjGJWD?tzQwew=Dw>h_0!1fYZv#%Eu6af8Uy(64VyK*mTpI zV5Ius15+y|3sVFFu{Yk*&qou6JjOW>jn?Bo!$@VpUW9%!`TFqiaIlTV=1mBgQf>;e zVi?SP!xWGKya7bM0I=resKHjrNA_1}K?;Ljhr;+j_7!NGVkV*gO7oMJLnf*i?fB!x z?k8s@5i9QjO+F0QITb+E7!*|Aiudl|@_P?`v)t<*;uY9!Idx&x8Sz(9`T@2ljN(5o zup@n}C<#B|udE2&qzZiwpdtGOEcKhwA4*G057)xgN0G?G;i5sM{fU=vx6_KEHnj_< z9s?z`i>|2?gXo8lY+Hu-23HGWUo~F+_p3B$%*jj&8`yxdNWdv+R!ixnYH4Ya%=ina za82nCUiTWxa(dSb73gLbf{ZD^^@MmJDKq_yNn{XEGlKp@67a{Wktn`755(7F zaFV8yq+f0>u^K*BS=YXl|9AjOrUVsd&hinDfvCPyS6BDT{XahL`?cWR@c`DZCx@#; zvJD5ltc;rv$^#$xtYd8T8bVL^Lk%DRM_iBr;`b2$czFiUGP{lB^zV62He+26w&w@# zE~N?G|A%L_oj$5zufm@#5F-rddBqnWUYf@Q<3uChh@KzML&Gkwf_*r zZ1leC6KwHrE|!_rfvzh{OT{5xge*fcUaL?DN)dPn?tA7GkeY>~U@g208>s|>b{7oV zzoDT>Hwf`$t6c{2R%#GcfW-)aFTq!j0T{r+NB;wA{r6`PI^kIxZkzv*yN=&g^MBU| zBM!GS8{WJ<=y5Lm@lw#VL7u25N4g2g&qF|>X)qw&d~9=M5x7@klLHZ5cfa-rKQShK zzOFn9;4Lz6jA73GM`wKf{EF~+-!qMp&0B{pD#Vthj6F?>K%*5L?h|+}2RVP9*#j(! zj46;KY0D0Hi}ik~;^*%r3&Dc~Xv!Oi!9ts(&o)Cy#?(Rw7niG(fL^jJ<{qH?H`4>a zhYu=({CBD9-UI_&4&4zxM)o`3-PhMY945teR?weJ9PeX!)Vmpw*$RG7h7P>UlhwTd z;KpkZH&)f*!z1{RCAZwaO=M8Eek*DBaI`_0+cEK%DUDUwNk=GAJ7^zzPJ_2!B00f~ zLDfx%kI^(|XZpJuF}dT$QqoYcD8e33VsF@90`H+e8X{BF5!xj5Aqd55<=}6$KxKZ& zeYynz$m7dE{XTnwU!%0qcuhm+C!f5DyUuleC9oec#VFa^Kge52;NyI5z>J^jr= zRr##|^Z(UP;5#8`MVFd#?|#E;z_uQG@efu{_e=UJ(MGTJs%gAVeFJNH ze&^5|{LFO2-JF{@zLEH_R<^Vl1>E}th%(1Z+A2DKQ}nw1Da#y7>yhYYC;5m9kPD$u-RU5TW9u@^NI~e?=wMrq-D*Mq?0{{ZP;oEz6x1@JlQRBZBJxdhTz}w%Y-jPtwFsJ3MHV0T(mvk3Sjhvv@w2RgL#-Ka#j! zeX4zSyDI!tt0m|~!_VN|MY@~}9sP2J+hz5KE4vV3mqLM8m%budRd{(R>#sv55i~<1 zu(Sy=NFY4!Z2&kja1WZ}$d9%bgB%hF_B>QmzgG%7@-I$Z*3Qh`tS(X#3o+Q|wx;;^ zZHXTt_otAboSfWsu`kE<2)`ydFqjOl0J}$GkHgC9D#vmcqo)VhgsP?oao|c+-!Jp46A9jo_V+7(;kopG$~wncTjzoQ zco%o0*utcQy~huuC4%@MI@MhWy?kE0Cu{{g!8?OhZ}&c32k4`f7ySH5GYpYpU-7>m z4*?$f%O~HK3wJ?ctToz)c`n_rD6O=U)vwnN?&{F{J1;*r{LY4mF@I?;lFmSauv%#& zLr)HtYeJ7|hQ^$cIe?M;QFono?nmV$MDxB=GwEx|&dHf5{&8#s+};UjOI>+EzxXbe zTj#Hup$8zbYTvcrSI0qOoo&4t!O|*Mo$vPYNyS+IDY5@lPj<@R0TM*s0mttJm4xa5 z^hkeap_jua*B8GAvD)q5OaM|AFWU2Pxu^&{?lyNJ-g@FY5&jNlZ@bgqqxCYAo*i6v zJ5yW9eyy)C`CG>up(?lzv6_h&0C+zLnB0&N!HzESg|7ty(P#p!!okrE6K9P74b_6@Ffd)C=% z2TYWio#`4P*xk+}Ag{6-D6%ie^6%SStmym3W zu~;PJPXLE?*I*e0CTgA(KJCpE>)kw-0s#1S9-dJHHBq1rcS`|+yA;5Vg6`vlPNo5r zXZE_!%J9kyM9Wjm`d=dwxP(T8n4!YEx+wuY4uc{#2VJ|x+DgJcK0XgJr?S`A*9pDQ z4?fIV^}nk%n!N3S|DT{xOLlIk<>Smeik{8}7#= z?ghZ0ssHvqj#F&YviSZ5@TMT%lq>jjTa%FT@oqjV*m|Tj-GMr<4BJ!0wZ#j*Wescu zYJL|b0{U|ZR>@c=%NVPsQ4U(%I`llC-i>8c=!KM_r{iKC2^b_|3?ThSL*N6$XJ5S6-+` z2V8eS*r@H*-G0s$u0n`Zi6fDdRY;dz@Dex$X5d{Y%G%1MD>9@G@Y<^9E&|WXU6IrC z7K%ghd4K_oz|ug_r1-?P^SSU$kLVI3eO+Yihq2i2z0$s&j&PyS&JK>DY-~0`1OQsHZzo^4< zwlBcuq)AWrR!57F#6N;Y3qYIlF8rdZIolZeM}X(fDaeLcWn>kMsSAzE%-)`J4(7e9 zaYdXWT$wmSAA4I%U6?{zSP63feBzZ*I6;+zAFPY)9MQHNuf{04D+I5f%N=~+UNwy$ zcTZUhTDgyWHeX>*fMZ19^s844t=w%Y3ILM)^ySGW{pmGf7g6o#SQ40T)-(PlPO@L0 z@yR~-g@Pc7B9lVGqUJfRgr*z+5hAnAc(&O=umS(MT(fZrn4$@$_?O$2NKE5^hAxBr zp{<}Ys7u7G@yY4`Bp={Wh~?RVHd2aXi-zcSGbp`j8C4^;6y;Zt;VErm5!{HEYj>{D zB)KET$N8_(!4n#3NCSfu&_h>QQ{0+Zh~Q3f;m1&S13Dt_!KUsb*mPJl4$DUEVYZEvfBLpr{pJZYFki%)K8T)8kTa0;}r z!0|Hj#P#}>HY)zjL}ZkE6ecETZmP`fjpq3^iIZ80=03ZP%?^|&hJ(mKHcW6xVjrA~ zoguHZYSqeU8nr*t!|0;h4*zMw=Jj>^ZXKbVL=&w3_+5v!nEV>&z@I^GL3)+=HuzC! z@!~ySoYbB_+0xz#D}GBFd7y_0?Yf)8!l9RoJbuqfakRC;er8qkPiL5Wbh*~UgfATY zO#3%|=OC*U^H~ela2N`rWfd7Dj!Ve_GbiS z{`ncU&=m6M^uj>$afeOIl!ITq>`0_RtjZCHqM}`Kw2s&~tY_k8tbzP|TI@l!Hscx9 z0M>~&SJxGPel?mRF386I*3Y0zXNoYJ_(I_WsksNCm-)9nieb|fFxz*Zf4URf-8Jyt z0N2G#%EBEN)UVEw9YOtSMlJBy15-`kv-)IA7cFLfRz9avDScKv{1D{Gh|iP&uV|tO z)lSrV5WSDB8B2eTT_NL4rlF&zceMcgz_f}`XftF>mv|F3+HkVBJQ{jDDzKU|@1aHA z`tnXEc^xyw4~UL7FqrLTr|=`{U4dL5{ZZ3a?s>UeY-F0%_}1{o3jX3g&paola1UYk zxD@KJ>KElK^qP*d;aquxYI9V$Gg*=oSWHYH>2s&Ex%WMrm|sj1KbqDF%Vbl-(LRh?OEq4V7CpmM?yot_^h2ge%k$J;c5~+`>Wo6n)fiYuU9v%k z`(`Mxxzl@lUAv8cRIr4M1_w-TfHFm~lTab6;2 ziWT)BEoJXfD2&6LqqyDlPYuS9On)P2Q9)bEev@jMFP4`P&pH0ujY)>hE_IqkSvc=< zM{LHlXaa3(d5iM)SEFk>8p+%kPB`t}?RC2#sg=lymf_wn?hL1}&Mzs;pGi1waa}-c zL3e6K(5?t2kI^LSB*;u0U1x1v{IKUJw)x(QwE6#ACTx>tRw5dP4nLGb$9v+slB9VG!pw z`dh7%Ha?cG-p*i?u>v`kVdoY9&d%Vm8yoFlF;(cg7m@~XXEOV|BJM`_eV(F_B(WA` zx5s%ii4*p2L!N@~{h=p2Ep!6Y12A}ZhL{l@UOJNn`EqT>#$Mt>=-jH^j>~m5rA(iv z-{tW6d(57#Lfjtw0vBb~=1-0`%Av<5y}6>8IKXpkRNxULtgf!UbqqwFNa$fvsD+UD zY!{>z38MT`?3$Nz?j*=@K4H!I+-<1JnbA)iQaQd~ z;2&o4tV?4asJfAV6$|l;8S9 zSBcH>3PUaAu%*}3t+_#;3BXq+iiv<77#Cn@OeUf^{|VZ5Q904-u=!3RNaj9XOxA*D zK6S^|*B#!yj4$yB5!THVFa3+*#qvv4G+AFv&qOhTt6D+?j)eL=fJzpNnxS&Hv-*Py zzYTK(*Uutgw04Tg7nc3N_@fql9#ls~pfah*A~G~&9u%zT#_x3lIe$L@?4j)AO)BGG zJRN#ElyXufa7P-`kgqr@~cDyQq~2)zq@CA&Y^CM8z4{1wlhI&v)g#qHP}LIO$jB=rE4p# z7^MSgkh?wBAL83uR@ESj4f2{7G6>KCa#qMLG~t{nFy9yxpd2e5@<`zn`zwWavp-_f z964Sc^AbRo#_FRDXY$P-X=tW=bh-i@#~tf>I^Qt2p4T@-a03vA_JkcM;0~oZwtni` zZ78o>AYW}mHY@Ar-2A)^IJ~u$Aj@a4)H* zwv^Gydnjw%qPFsOY8@WA0l|A?L!YVKR=UX(YgOy}0HP4L1qw-@`TNOVuACM&GR9b`#_Y$X`Uw#nFTU?`_KCPCRUouzI`lOg*Ete!8K?oq)39ST!MS<=q> z!liAcNzfRs#A(x< zMf@SF%rWFs#r1bRg)9UI8;H*g`Zwb^&5L6_%NggIt!)v-EcQhZPjuxTLcM1d3e*Js z@AqoDfefbDQ-o8=#F|-CB4Q1EiCZ0}`e0}6f)tB(Fe6G{^ayHAnCZ7uud+EaYy)ir z(PD_L1RGAD=JWV79CtWrk%?dI@uU9p(tD0@I-e&&r9>rP0;GlJY6w}MCDlic1Vgt4 zXQ(c)yid;QjuJZpHW&BaulerGSgVpG&_xTFmq1KM%4;?HjyV$_;&;-^gPi6;2|8ip z3fY1~w{`B*{LUfvH{O;~K1<@fHB$WKu**nq8w=%%H*uXVKkqF-R95Dp_kE*}kehGM zE7rH*e@exUfMLx-mHM}HirOxSCAet5t$pE;Q%zTNg8?1SEaNTz!cM7LColR^DXZ&z z&O0W4F{U5u0wu}tg;_N&EEE0V5`P$TTD5misd)H^a zbNA$B;;^Z|I`Cm8YmSu6Oy5ujM48CC{2oW1K@goH*CO&NFNW~TB$%XEzi5^13E|JW zMu@NObCkDu{0OyLO}r$Xy*6+tJ??|g99eb@aA0xOMAiuLGg7TPNcp%v&@Yr!)(NJ0 z3a8%d_{~j&VymuhuTQ=Fsy4~qoOnaZ^i}MJHnV3;#Rz%_fuJo$A`2Av$7 zds%T@kTGUtku69a$BE7!`9p9&ko&2_iOfvov-mmZ1(M{5n?k-6*^vfq$JR9U^uFYX z=w9AcmfARXb^7ZB=|tCO?r9srM9Cn5l~xSXw~UfANGhHrCzLas+>FJS=ID07l4K0` zWvOAl3NO(Va--wq zKDMAw@0bN6j(;`RUNI~8R8d}rk@Sq}7|k0lG9`+4U)1iH_R{a3-?^Q0*;bRqfS;d= zrjI0#lirRykpk}DADN+Zmi`g2e-Z=H*V`oecUMdscUmvCwn z%0-+B(KRgQj@YCaq(&RLUQCUrIbjUH@GW_+zCgqvLgO>~J+m+)H3B1wBTp~jB>XT8 z$iEO#${%MbNa`WNvDj7q1W0-5M+I%!tid2SX zT>GcUQM^1WlCAW6A|l$V=sd1P*c7cyQg=ZAC&$(XK zxIRjmrtxPfQsUN6c0~B`yORerTa^z(TO0D8#Pzl`%+x>Pt~aH--5u>)AP-OM@^FNnoRJz;d?0MWb(av3o{<{n@W@cq2TLD zaJ3OhTHW-c`FDMq-O-8n4e6Q0IJq)zu!`}=lF?gp=PLFD+@58%4fVKLY-<#sYj1?L zI9n8EE1S;8{ZTYqQ$tv|nxeyK+Rlc)p?5qlFN2GkyNBX!*tB}aGLpQ^f3oYP}BdZ)Jb5m{8{ z^yRiOig@5TeD=K-maYbRTbQA`usp@Wu;0I?&eXFm2*zO80Vi0sqv)B94FNE zXTx1A5YpUBkWtx&F-@est>uM&-1rp3KsHHXhBDo=eT=fhXk@g7kf{0MNPWY3^@DAx zsHH0UzKc$aFxWQh+s~nV_F#?e3xAwk8%YVd6rB|vpI92IxpDvC)y7nEd(v{AqMii( zJlvkmwm;p}+^OW3@|>Q!?5~T;f-|=J309?ODvoV z;cNG_dqafqW%c?A4H+C^6`~DxDq}Wy4s6PH^!W5^;9*fIU?u6@(tCnkRR5PtD&pTkaNIM1lr$w<~+p(g6M8d!^_+ z`5RI2zj5j}44g1dx{O{mNro&lLx#NhIZ#+@DA?&!kf_^EFNa@!{wqSPDYLoI&X4`T zs;8yQRuDyM&gy{pUc?_E)~Y{zzjo?(?#)RYCYkPcmDpdn2>^}L~gzz`C zT^Sj_6OxkiiRCmm)vDjVC{FJ@48LiIQUm@i%Mg|FX811TZAZyUVgGjTSq03eFM}-K z7$ehqgDA?LZ0`=6tt>aCM^k6tc^-4bQ&}8S+LU1@xYC5LnEEb?vWgJLbZCWhfumjB z3%tyKwiotf_>bZ#HoRKSSs#e)Q~-DB1?lP^1UgmvDU{zQHN_$T+npzT5_DrW=$0cU zR|ji>*4!PbslQQo(CePuxYuosuUuMSX*d%ai^vL`a!I1xEZPi3AqnbixJ&hPSm_(v zCI5_CTBfw>OBuxk&RkPJezCDN_cW3`B=qWhTW0raDOK&WV2U8sW4TlZD&hqq>$*d0 zsn+z|G>=^gN`-i(<3n-g?0q{P_5;;`I?!*Gf_BohACbEuVXe`uD|@x` z6B0{y!i|rA`e$z>(&~Nv>(yqx&wK7&$|Kv4RJL3Ij{?!u`#Ms60+l3eKCx+>XO_XC zuklB9=ACYCoXM;=gZ7oPV^U)ahGVvD+^>^_Z?WMn(mUgryYKVak%h=wY@XqW5u0ne zTR^+AW|QA#Xzd(AaK@Wj<7ucOOmSDb$;J!66AM%4QeBO$8M5STRagZcWHjb$V)_!;SsWMxJ_w6@*}H-DN5frR{pDRcVNrSi>(Z4d=JdT|XCCVOu{@mdu=HsC@kD19t|?u8_J z3iAG3cA-x2OAA}aJj5oR_XeSdLjp-mQVwLe9j;qLGt<7axog z+9saHIo&zv$CZD_3{G^#J9%{%KC}*Lao3J?vqwDc7U5@%EAG;Z#FM}o9Hp1s7aB)|ldnv+DPKpmBw6 zqGN1baosaT*Ez?>Yv$;0gEgX~?#-I905%T6UE3IJv%z790NZb}7OMVt zpd&u%snSy$87hw0{%Eeg`Uxn86b6jxiM!(&R-42hSz0|d1rF@X<5w1^#SEA#i=(je z2OJzN%GWEM$CUpZ!qM~&TZ~l_rW<}+8^deMn%_T5*hoaXnFI_IO|{bp27 zXW-K=^Yeat^Gd&6lAI*I1T0a#q51498fD?3yhKUroIenj$F{ix73Zhp#x{ru+~%^ce%tnglYhn5v zM$(Pq?^x4}I|gO_cCrhdZ_BX1Ur~y=n{lSy#I|eIjp9hn+*5A5KEAURI44ZYz^1q@ zVp^v6xBT60bF!TpFa92!=cq@~Q@nV!pPCYM31^BgK)L28r8g@ENeIhe5;%t!fc_kb z&MA)FdxVI|>0;su=^&iPWDd0Fb~!Nrae9aToO^o+_c${>={xb-cRQNV;jyPBOm956^25?v|KaZV^%3|Hf$fnVI{ zD=DM*=vRAAczx+MvzWy!E;fDmJ{5SCYRl`At+k0HWyY-Y5nm;ZP$~Q6VQYp zqBvF3$_FubZV_FLE0mzG&pmNb&uXoiMj{cxIUaVm#pD{YonG**zkOmbNW+GEQA(fp z>H>>-W~sIj#CPXIE`VLj!HfOifx`#wg^JP@J7w&Um{K=cL(zBnh0)^gX-+otrCg@p zGT&@~HCaU@7knpJe#*q6L~Vzbr?Q|etL4NOy#O=LJRL9uNqn0fQs7>)&1z(X;0QEU+{X8zk9+px^EjB(C1I8n5 zY(y7~^nIXF5|xz9v)GmG`R=)QUF2`#sT9%~L)0&qafj4CnfylGh=S8G8#UR9-m(p9 z*3ze;J4<@<+lkcjHXAEPp3$|OF4$b;Rv;*&MjcadlA-&9@Kr|hZofW0V7MlDyR{OR z58N}9CWdo;6p~*9R>p7ZPmeb`>XK-`FBfLpb7x5gwP(OYi^!tZI!SjdPzs+i!xYdl zT~wPokG+NYFxSb%_fX=kcX%6fM0hdWxJ$m9z)DUD*n>p49isAiEb?|uFU8CTaI#{U zuHe%_v9hJ8$IwLBz&HcbAIR{$(jE~o|H@YIvmZYrP34;ka)OxNUIZs{3+aDZu30H@ zvMJfaz~TN#nU$qgApO`$g=jq+ngQib2*BtP>$S*u(J8z#-S*BTDmz9gfyQa19I=xh z1drlHb!B(uVbSzDsMzhFKS>)ZxZzA{d*4M3AHhXJ#ISaV)$W8|r@EewqHY;;u-=Bh zqJ%JX>x($AU(HI%Dv`8&gzn(P(3nzHSgZ9f>=8z81^iB{YmvvQwk;>q-M?h1+xjlt zUf9PNv9mc^wr#w+Dn0o^CgOmQ8epD_OUv}Ruj1x7_cXv#E5g$v14YK$o{YZeewNZ1;#Avx z!p~i)JThMV9b*z?U@ial>sH$U!2jVRo-tZ+c?kTp$Vot-Xz}vyf-oZWhRH%DJ`P9E z9fAbq{XuXF=75^@wB4Pkb=Zp~_WJY6kXdS+iHyBWzXNKgwHk&wK1yKua=G@B%aK9{ zG>jiYZAK3k*TZ$rHo1nhL?Tt04Vwa*8Q;P-%0U-yvFNXxitv}%nIJ>iT_!Zv z>&@{%;TXK9yBKz2p=sU91Idlg<-u+kSz#s&FoKj7BCbLuf1Z);RNn4hSuCgT7U=8< z@_F(DlpR<|`C7kd`CiDrbuE5cCLH;2fIl}msi3(<3bSNUybFU1erc)HQzhXUk6mll zes6D#HRI3BJT4KFnD3Brcr4^zw|Tpf2pZ@@`C*zRN1Vpc8&hT`Q<4co zERCZ`+oMV)+|j}C$l!TQTxTokc=+AM8a6Rl(qxNi%Ol<;qs-PRPaB607hJT#eH!#1 z$fi9M^l3Uju5e1+5qOoj_Wqr#7{Os!*OgNfM{6T2o1AAI7kWqD{rgt?D^nU7cIFg6U3Og;CsDB7AEj}{9=zu+z+;`2 zcd`V;*hvLR_)_K>CX3-tsK=n@WZJ@zJF6ajMSDf-}@;iz=Y7Ln%>=hM&7o&puarZy*p96FJmY;9eD)1P?ez#kvCa zK);6yF5??}K+$9_Uh@!G()g^slkNthE?oL3-gpAk>3E`hXJ6SmOJn+=9-A!$K6d-w zR|-5!EfKW%3PspKJvc38kQdz82bJsbafqC)@}~kM>kh;!=)|WT<6F>%s2z3&6+mI( zq7f|-9ZW_PiC8AY6I6!P&MG>gEGg2!jLj&*{Qv~P{PW2JW+KVyxzOjT3f-Zrme^;M zwqNa7PMCaCp4w)fy*fWK`6)6WE(4=RR3mCPq6cAkkmo+Z+$YMSbDADiVhJG_>DGE2 z{VaLa29bL!CF}*XUXIWTw`m0VsbY9rh|!#a4VV z$81T(6S##K!51WXv7u=kFYYb2J(_)7LCc=~OC~wbh)J2SmbQbT!Im{ zN6ke;H|RQC>Zs)ql+AbdF-^T#n@S%S?zlp zy(jEo>y?MY8XUT8yHQ4j`xu3cURpvxTd07(*2FsUWEq)9YEmlJg0~l_@B`HcB_h;# zi!$2ni)_f`cw*ZPJLCGQ>x2mUO}jwz0=QN-D|HwJYf>!B(Om}{>xytp|A75l?bm-B zN)uY6e?;GPO3F^=$mWQ9#wJT2?!~=JQ3)NBd_{Vd|0pU}i76b)Y3Vs`l{8H}d;W}( z>7C-1VbMIj47Fv{tC*IimRC9#>Fvp6jITp=da>jE&{IE(5u2@NrZ^@{p?xz#Sg-nG z-mMn*F>X@cknVkn;AP!>vX@-c5t_sdD%tUAwZ4sF)(U!%N}n>MiwNtv2M%R7%Xgua z-%v%VcXhiL%MH4a6@|554BCe}vz1oO`+t{kkO46>c57!}=Ij^vf1dWAujK!i1#1Exm)8#+;UlG7(o}*8fCn`6N7+EVj ztE8UeAAg@p!E`lZ1tUjnJxtWZ-Bh}RD#v&wpGBIp%nFhgL8mbOHuLADYNA(xJIKo# zQQEmL8N!UGl7wZeH&WOvyMg|u+kS=^3OJuH8DBC2*$6{!`P2g&tJNJko2vyY9x-9K z#J{8wd5vrt!--C*%avqR=C5Z8vFh_^@BM5Qe4p*cR^lCOFag!c?+) z&OM8m3ES*6MuL)4la_tA88;8CA;p@{<$IV?M`&xNK%+HHKM78*gL+&qKwdfRC3d!< z9_mXqedwu~W~7l!M^GBxDK>bK-i~1JbYO|AW#Dm8_KoDW_+nEL4rKpDICoMp%8A;4 z%&$4)XFiK>bReJY5XuQ79wbP-0VglVND&or=X_c~t2LGeVZN$BiauW(A=Vt47nF7k zrT48&Hv8X3HPwT<&Jo{k-;04G7|xGfaVmNePZDQyM>Hu0#)jiHA16d%tdk$0_b^gj z38%3QTK!I+PuOWCH#Nt&(btYLgS4dIO&O3+ij;qsiqeL!Y2+68+u|nQl_<{6yvTjA z169>InV#5^jZZwRUT-q|zN9IkBY_y}*0S(<7I!_Os~;+x?>dL#ooCE`e$3WU_r3T} z)7wptMYT5~GK!t9TP^CZVnw?RXPULPu3dIfUtJfSb@HcX49nZ2RbwHO7j21T>-clf z9~z?A^VpJXyTgy_BjyT`JxM9TZo{ zW{sm>q23v{yQtmXf43(?Q(XL9De}0f-AJg}Ie%7z`39+v#`i-L4CV^E!ZjI@^h-n) zVFwHsI;l4={ET_`*cI|_Beqt9_xy3aUE|=Z8f7zIgWDg$3-G+}6dOY1=~sn)z6(+# zBe#r~fJJ(X){vSbjs}LXL>i$qgV6kLR0lza02LawGs4M=35_NTQ0*7c;2`7 zRk&k>iE5w)*EFQ#rt5P}VAZaM>XO7yD_CI#6bEf18X3^(KDH@amscZKfK}LNBS7fZ zn4?^`!7-ufY9pz&1@5tv*pf?OD2Z|reV-x^1!d@LlCiw37t~t}K2oV9wu9onIX5!t z+hv(pZd=$X%JZ|DOJf;O&^u=kk#|A!)dM&^+2!$1HL}Cf3>(0Qrf-!wJM0{K9+@`p z)gnD&PJ?I2)*(-v*t?2;TUq)4wtvd!*!NZ8Df>)TeeOOnk?|tmZ3k!V1Ir$87xD{F zK6Oast69y4JFaS9J>SUpasbzFq~1AwMYl6JQG7<0^pSLdE1)r*4|6xnyutQ{>-MtO z{;v++SX#AV?Ux-5vMwRX)qlk1b47dU^eCEmXq-5tm~c9S*1&yYQH;q(`NrD`7!rL*Ox8M%>SYu8XPvWTzBMyfX>Fz@p=iTu!JNqr7QefM8=0GaxEu7W@$BpGkf{z3 zToA~3P{PG9s<5AJW&ul4#z&8(oYfxJkKCAdEAQDW#Tk7Yj_nA!!Ji}YNS;wE=U#>x zcRWAiL+GZLDZe-RYB~Ps45^q0T-keki@yEDHpv@SOzte} zS3C&ZskD0bg3B4Vf%CS^j&i`k_nmhN8rtG+Xut7W{CftEXjH@F_ZOUIr*1GVX>@O1 z7j;1YV@)EkAd#ZZv)ER-?$hvx&*$_`*$zn-_wg2X98@fL^(#nLwF(Z4s9fG z$wh+!<;(>WL8Jq#Kni_ZfY;6=0h$}3D~Pgg&ebxsLQ0710} A_5c6? literal 0 HcmV?d00001 diff --git a/examples/t5/train_t5_220m_distributed.sh b/examples/t5/train_t5_220m_distributed.sh new file mode 100644 index 0000000..5d9357a --- /dev/null +++ b/examples/t5/train_t5_220m_distributed.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +# Runs the "220M" parameter model + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NUM_NODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) + +CHECKPOINT_PATH=$1 # +TENSORBOARD_DIR=$2 # +VOCAB_FILE=$3 #/bert-large-cased-vocab.txt +DATA_PATH=$4 #_text_document + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NUM_NODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +T5_ARGS=" + --encoder-num-layers 12 \ + --decoder-num-layers 12 \ + --hidden-size 768 \ + --num-attention-heads 12 \ + --kv-channels 64 \ + --ffn-hidden-size 3072 \ + --encoder-seq-length 512 \ + --decoder-seq-length 128 \ + --max-position-embeddings 512 \ + --micro-batch-size 64 \ + --global-batch-size 512 \ + --lr 0.0001 \ + --train-iters 1000000 \ + --lr-decay-iters 1000000 \ + --lr-decay-style linear \ + --min-lr 0.00001 \ + --weight-decay 1e-2 \ + --lr-warmup-fraction .01 \ + --clip-grad 1.0 \ + --bf16 \ + --vocab-extra-ids 100 \ + --init-method-std 0.015 \ + --transformer-impl transformer_engine \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + --vocab-file $VOCAB_FILE \ + --tokenizer-type BertWordPieceCase \ + --split 99982,9,9 \ +" + +OUTPUT_ARGS=" + --log-interval 100 \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --save-interval 500 \ + --eval-interval 1000 \ + --eval-iters 10 +" + +torchrun $DISTRIBUTED_ARGS pretrain_t5.py \ + $T5_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + --distributed-backend nccl \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ diff --git a/images/model_table.png b/images/model_table.png new file mode 100644 index 0000000000000000000000000000000000000000..f126c2fcfbb1e2be2fc4fe068ce9b760fd0d56c7 GIT binary patch literal 200144 zcmeFZWmH_d-hgJQcU^TGnDXW&tA|Y zy#n6x$Ru}o_Ut*e#pln8R-Yw4+gaN=s@NMEn~Ip&8JQ|eh_dqVa6fxS6Ju;(pe)Hi zGhm2mU@$OBPmAK{ru_3~xUzxY?_XoTx=H)82eMO=b#ylH&^NlDA+mhH_HOI@g_O&D z{C%V=z8|$&Svf4u|B8fkOrD8`&d|VMqXh#pNsG-LNa*^&vaekp8S+(AlukQ|#8U;6B#BPge!Ado7= zmlR1_nO<7B-pN|Jne@`}^;riH_ta`5XGbd-38?^+$3l*^7X0f-kx|c?Ju`T)G1JwwPw_^)?R!n0re*Yykf zrw?tKL9Nf82|tq(6H#@4ev+2#BQ5o+AlALqPxb>_4~^p2YmBUC)?j_@DjR{~F@m z^Y`LB82@)e$6yH?)(`L{?}_}Ucm2=sz`(EnXW;+V)CvF3RR4!7<$o5}e>DQ-|X%>>G0l)O>~_#d~d z0ZR%Vy#Ko_S8iW+L|qG39`EP%zl_zw%XB9y}9PjY6BlJ)0j z2rtDqens+n><5lU&QKJo59gP?!qq@U$3?i2GdTQyT5CRyAu-V67L`2F$Po3u*6oO$bX_JLX8*5NGpN1~p}6$BE!D76ebg2_jD*HPCv2AA(#`4 z4PK%#yGbAZmG0L>L=q(HPIF13dsq*K zeM3p{*+2GVx@$4Dv-@L z?<}cP@(O{;-=++$ktRGceM#X}q}XA zXqlEuc(YdKQ?uUO$QaCO*vS6VP;>3~;%$j;M#L%~@1@ymB{rJS7}-(s$ccCx_u+%Y zu91lT->K~H4i0ewCu@hsR`^g_xiE+k4zak~VPe1_6Am4wDwbb_Ijwqg3FF znfJpUh4yB`*@#HRxy@xPyw2Vv_)T6F$0J!bm4PRZ#_%@| zmfux1e4liM??MpxAC{4S0W#rDbO7nt!%?Kj{@Ypge<3U&^x}w$c%w-FGx&eFwf}!T z?O*k|bIED{8!0vVoqtfA`R;*#oO}$iPpeeZS;jQ2yx#wtO3;ua(1hbCnNPic`Xo?v z!~4>Mr}?&-P{>n@uC!Hh{r75t?yYMexuBz&zwpfdKvd^>X*$p1fyLlRn#5o@fw=qD zbjZ)zGv^&rX2gq(U$94rjN`+KM^v(tpLx4%vr#p5CepuZM7r>daoGqZYSGS}1Zb#n zmLqfMRpc41oA;ihf_Kg3pHIoa7fH<#r&@#x;<$!h?S>I5HO92&k^LfM{!m#_;<69_ zd;zbT8_W^tFCg0#(C8mK-F>LirJw!mRlkQG&nSuj?f8J74W|1hXP3KkQiG>^m z=Krz2Xb7JkSYDqDn8c0GzwQ^zWzGb$MVH*fFedw`Kn@KY(2GkuFl{g_<10o^X{+qn zP1o#{xdAMZoAV;z*EvPej&ld7rdCE%qU2IbbZWQf4pJIE#4_IvB&mvS zu!CGzRGyFUSTv3d`k?MOh}ItSDOqQ*xgN~u@wy!w^v$o;7#Te%ugpUj8*3VBCoY4~ z-}yU_X$u%x?fux4)X`{|Arjn8?;mP88?fMUDxJh*KBTiWY>~)f+vn1i8D2bLSG%%n zWT*$3K4#tzskn-7rxfCpO@*X*y`@QdN2G)ywbLbt7#i^mj@3JtSN|Ge6U*Qex}b{X zk7tOrvIfILAM(Ea4T2WCd0#uGaM~qvL=3E;|2>UL^w8s;olB1|^81tNy{FB!BTf@} zB(o7=&N(y%YV??bem;Hl`CIu51r;*3PHvalYQ0(IYhUEd2$_lP;hFmWJ%SMP43KoV&BO?UL8=NkC?R>fX%{FAe4%!>4M)t|$M|Am2 z`spc7QfNkk_DQe<^1Q+;_PONOgA>RdDv|wo+PzqT68up*kOD8p1Bun4HC@xY5q~z9 zF_uHi_a(?r%>6sMq$}7k688tG2TI5<6(J3BZFNFPT1V|Vni9OH7--%)>rC2d6kM^Z zs~738e~QLbuZ2qn2=gMc3D{8v@EGzsVmq#(?lTXVIYG29rf?QJ_Y8&#uMV`S=6&72 z%eFgKBWiw29~Jbv)5jfl$U8{I3z9pbfcbdn8P%IVKEb3N?JAEgj;+sGS%)i^(D({-8gSx`{dc_bYV`BJMIllG#2MvBigjl0S`t47q*ov6EonX6;Ydnnm;OVnr)?icg zg7iCPvw=EkrCz&CV?}Aq_}zWPREPO|d4M5*jLXg!hhOTW$YI~V4P=f=?Cd{26s8M0 zaOU9EXS>Bz(W08*oWzUQvaAclkYT+O$VqYukE$EV-bak@igN8w?c z^Je(*Cj0KU(CSAupX&USfundO?%SKD_2|O7eatxWb#s0Uo~>L#;aVO;R<8Cw8n1`H z5dCot%}hFfs1yM zZ=VC;Raked2&D|?)L9*=Xh_1-6=~(+moq+hsrkz9(qPr03_lijrpSyb%3nOntv>o+ z8T}q&b{#8AS|)nL|5eColo6N(I;CB0t~{j-tg+WOpCT{o6_&L`Dzb+0UM0-}6_FI3 z33%Ymi~PnDEPEZ+TX*}`hfFEQ-BaWIp=NF7vJ-Nj*Pu|k7|eD=OV_!<@6?V>=)Lq9 ztmsJMzwG4k%AGfNdHo*R`7bxTrdHyYdOhbapxUIz&_MOm%CuwD2 zuW1JLpwq}Ah^$3q3n?Z0gw3EM#XT!r>B^VV1@NvqGU1w+&b6IT;ze?QRJ!ZRFdET1 zwW%JPm_l}LX&L_?=<&`%j!LeNiqdSahtgspBU5y4kv^Wp1|e;@uc$PJ3rRlx@BKA_ z=Y+top&)FVzl*oL6k*eJcYDBDtTw6Q2y6XNc``eix+i{3w6eOogNyJ|ZlkkmB%#q} zTgZe14-M(H+x#Hh4HlY*WMGw}^v?Yn>WFrpPp64rA)A4QD93yd#;Z#a_&7Dtw@S^R~}aS*SHYI9IgI?zK}0- zLTmQ)E=5I7*{@uru}Xl3AnBAn=6d^7!)+i_m;td?nDn~c1#Lc+Cr4YN=DzcG3x}t= z`;ovfOJsGoHA*)AIfa+BCTz@c^X{d&Ud;g)fywD|TLrgsXGP*o;V6Nb94st<{B8gp z?KzT38c+-usm9Jp6`KJn!MM`-*%2w8-(M}v?WI|@cd0umk{45u*nTCNDzXk=yEls%Sbi~8UN!Sw ze?ok;d%PjL6>N`_2~*xtKbf(iI%9lhu}w5nkQ~Gxd#Ydd8VUl9P;Zwdi1!XquwYnVeLdyL zWp=){6{PdjgRl=tSkh*%R})Y9xuc~x&l>C(C@zV1=#vh$>ZV*k;cmOzT$$fm`8^-{ zhj-VQ$3hea45~Iw@)+)*yb4ksZsRu-f17dBYf`Q{tyPjIk_KwRDEEXKr!=KVwAGvJ z{vMd@zxY`>uwHYPZQS7nUaaJT&D*Z>@t{0ljW)0!9{rxm*)BK?Sd*9s1dl8HvPAcz z<1tC|OH6f0cgJhzBxKN=*x$yUl*CLEot;;@HTJeN3aNKs@^UlPz!(vPjHjtyiP} zROz(Xsb-FkdnRA8#WYzV2&HGBY!-=qi84Yjpv$(AE z;~HTf^stfrz)q2PvGYvrLFW(B5M$PbuJ5*_x$K`fus)4BK`MXlA-@4+E{|Je&o0;ZsL+cz-+Kij@6Mw_KW_PUKXA%U!X* zb(@WHxtg05&ZHF9m{KjE5v4$lhq#8JuelYJpYle{>P? zgTC$`O2mF`(qyVvvEN9yVeaYkap=8x7UkTa;8P|h@UMnaTlfErP$?VXQ=JzE_osrl z_f|tc_!JXs(cTBE4bHTDj=(M8yqHp8J~U|FoxQj`;p=Z{%UrKMes`OKJy-XsX49~V zujav*N^3t0!gbu35m3Sf@HYnTh2d(qpxRK?%q%+rWq5i0vjtC9!Lq*p=t zD7gC7gm#1K0{Lutj-`KchqyvSgp>tff7lkSsY*l6>{G`$xrg|)BQCzZQbYecK{67* z8ONll<#25ENnP!3DUXSJW;e(NYC1Z`o0qReT^0eIs1EUor3){;IJt~sND>goU}2r%`oMW4J|CxtFZpt605QinGc{ z%Bg`C8+AUET;vIY1h4(f_(I4>Qhy6In(s~P-D9KoX-e|IoHS?fhcC~h1Y4{RCQ!cxrKtJg$ zdxP$m?{tlO_l)_x5e%=r8vEY1CY%4pvLZA+6eFiTX42EIjNQmBni~#3mcR>(GcG7+ zYk{g)(4W-A&kaJe-`A2jdzZoHvur%OJ^!>)6m4xy<7?kQ4`+Bj?=A)>@-Tlhrk_tr zhBr;o+_)Xgtc{SE@gyo5UoJFuw+q@psl!l*D75E0hEipS2Q_w`Qt9Z%TTSC#^vWEM z-G3_>s_Kh@@BUgIx2p~lvkq~>sU3yaJPhlKUC1-^kH|l9{b1#CJ)T-;F$t}xYFJ)u z#r?5rE7UTLUlhowVs%R{>2XFTLQ{|w%(F(RSHRLgJs0Hg*eu2fRZli{@&TRfYw@VZ z7BKhIe<>}B*W`){9aWB9woLxbT#RZDM(?{MKRN#~R4UyKet@{;_tAt=1sR*MRzBG1 z*>(yfvkWj?%);^kphsl}T?Ktv0{J#-t$DlVQAc&6O@a%mtm6jf-^6GO@r*JsG!nG` zc~3>oh;TTDygJlP%x-eKcG&pYkBecIppBqOU^@tdcKr)ow^W~1yqsbEZ#>XsAe46> ziQZ(%+>j{Clk}PY6e)V+gBY8ooe)?G5S}f^xhOmQj@CoxU7vW|upCUj#us*E`d;S# zy?9jh08D>!(LFbpw_lv)hbIX^pzaOttDm8Zto^@~GP4qVqk?>ppgZjQx0OMyZ~yBF_Bw^<&X!YVc)ZyKT8eQ7oVK z{CwVH^FSU$>8SNG6T20W6nf12Z^Dgm-7m;iE7e=j)3JpwjK{B#E$^kVTfne_q;Mm* zi*xzLeq|jr5zlHD59fwXeZ?=2)!k^FceN6lpU&})Q0_wf{8ZN5JwF>MulRCSjW@pt zZpNB(sUpXI5rsZks5Xn;6v?J1c4D>ZqGymsUB*!kNxLny!N^jgBJ-F`{9p<82Hg?h% zk+erjm&$#wwOE#xpxi0QyvB|iO7dyjHLbR|f~*x!DeJUU7~|93l|GJk!WXRJGLg`% zhs&*=uh*YHMxUj#q!|a%oMnU@a5C0Kjk-+2udGgkj4_Y#1UjAR<$5NH zub%yr=%=c(LYe@p!pvBp1|T~|A*b`K0%-lLPMUi&)ISncM?1IDc6Ju)25+V#;$5;l zkx*QtJ6u!(+wuD=4rG!~2mNL)i^;>;XOVuU>R-xo^;<=5b+AKce9U-rCh!WT8;wS9 zd;HpWoB4j``Y6p+TsYoKOj<2gserl`R|d`XW3Qr$=P19wQv2YuCA21cLr&T#sP=tW z%tNU09r!d5tj7$m8D%!PgkN|CHL)^*kQOPUM}6lL16ge+k~qOWUp>UL<+HetX`F6p zEB1{TdZE!uphC5S6`sd)8@ey>`gidnW(~1r`x%TTxA6Kwba5R#AbTk>@OK%cN)hP} z@!l}u33#;*)_ld#hU9eg@7DO~#h;Ut^tZNZHshNONThUG>eqUSG%geEw8Yz_bZv|X ze0&r7;KI^>X|6)~L7^RTHflolIS(*i;<(m`n5uoz9dc0*~UDX5JC8Gh2w!iA*T zAs%H6HyZTWRGoqtgT}Z}NJJ83Is7KF!^pT8>-_qWwO-%cA?LGGk6b5{op_m(zZ4ih zwmRLucG@p7810;NX2e=^F|vH8|1z>*rA$Z!DYZw%-Ni&j48IVkwX z=5L}g`o4G-RT7mmYM81w?rDGJB=O3Euf?isUZ~=-9CdMpR&xgiX46C-&UsAHOb7x6 z-i<3yU$>(kU>@vr1oetc_%9P!L%ol-23+Qok3g%qH-bi+Pu9cv;5G6cQ#d&*S2V2f$46crFXp;~IK7ZQbF& zVX;j1C8bPpLu+eM31<%P&IE0VAUn$%p~t^ZYvvxQ>cq6?1Qz;h=43YY14f6wlIdhU zFk0${p0+ZZ5?AEm>$igZ{H)rjF$Kw)|K5C1qTe0--Wqy^%3E}E`dQ{amOXS_B8AQJ zrvlk>hj87VOq+3*%nf-QnzX_v7&=<4`WKdd88rQp@$-1~lY0u9j^e((Q{F!*%wI`* z_LiHV#rzLdVMMrRHRos7rt?wNT6iP<`(L^T>-t^{^MbDG{bm=s>>l=YFEiP0xM@#A z<@`ZeI%!g@LqOD=<&zbRgJjP#INukIbZLSC23n@CJhmNhtvUAc%QIj{lOjLu=(kQ4 zI(QK|B{aul`JaumG?fHp@@(BYBXsjjQlx~EJn`Gs3zdsRZvm;VovPX~#)p(`r8z_q zmEwb{!%SoAm+ieZ0XsKqX`LI&7t9;e}d|=Htscts}X{$xdTFe4YZW zS5arX?Hen#7vHS4eI~TT80m^-&T5C4Vrh#OuC>s%ux>84%X$@4h~0WE__mX|P1S3TQBTOVw^r8&;774)g1ye~NvE}?mgPH4xN9n{m6xEqt@NvCsChTzp#l(QcbKfTX2t&EK_0P~Ukba4&?^ z&r~{NBb0Q3T^PyZ77;f&4VM8VsYuH5rw02IhcY_Ra9hwU#sqkgbqeOXllu=e+S~@% zcW=w4Tb&{5l<%%su;}dk@4Lp4ZZTL+?FRc{dyWv2-!ebNI5@#Ku)=}*mTp2NXYakD z@<%WIRIH+BC1F$2WB^FekGjRlRv(BW5~PG5+%=s-RO;nX}Q`(N<3k+Cm0@)$h# z;(cm58%?Lw0>1QDF73he*33Biav)Be42#rQa%-w0>rIiVOXD|YIgo%Ej%H7I?aK;K z)Uo+Sw|Y?X4B}J@*4}h7S)5lMa652g z0kfEi`{92oP5ShK{tPbnwORPH#)Z$c&H6YT3)gTFx2DJHO68DEsqXDA^+AUB=QNik&q$gUkkKuO9~9J{prSi7Kf?XbP&8*yF{<>%nK;-is8DRg%<9O>w)qgUs&!XPL~ zy>h;Y4|HmJ(e$*=j?f|#nzGh+_+QvS+fAo1vBm3rd|Djea!Oae7(oc^44@|pjCGQG zu+3X#y9<&a#l2r!a_H6@HX@~*8r2ACzrj-37PO5_pBndWCZtF8+if6LsjC~SL7E<8 z_%RhdToX_6?WP~P5&QMND$-m553~UA$-Cpi$9sm-18z<1v+j4Tr4O0l=A|2!lR2N! zjP3M5%wZgBr@SrSbzfVR^scW=mE+w0qAM1sE>*ejv7W%6ZUUXi-#{!0X2$n2oDcI3 z{XeCW-d8|ZyT+iw5u$-^zU7rxQAGT-YO}izPEgskmttgbBw^KC-Oac=wX4441v*3V z7VGwCgYoGdgiie3fjpV_JMKbrkcXIhMOmYGV<(PmKzXg?bEMlVK2KW;dPj5$Y2S9j znO{4LxPB?`TS3U>l)ZAEf`qR z+9%uQ+I$C@*NXupXIs_bDkive0m~B5`&@ykZC842P`a&x);Ps8;SZR{FuaQPISqWDL{pik&Juz@K=so=OA zcV1kxcB16G8cQPal7 zN+*ZqA`To6Xo^)(o3jX#G|VD~QhYwQQySQZz{AIfF4Vc%tD1x_noY#P$3ls&n>Ooz zCI@TJJA9_>r>uT&?q*fSvi6nMoNkK>Haed{WPhsQsNq9BoNgj0e6Yp$S2)(3NYs0$ z59i-w#WqK|lE3 z|B!7vD5fU2I^zpM%MolYlHh|k!|wx*JoOr^iv5U=a&XNLNQ30kqK>{LbnK-$u3Z*Q*d!+qv-%{_x!ebx(3Z&jw83C)ddpOJCl=MEo zQ}EuMw3^%tf#%I~Werc8u3MQoi47e_^NFF{o47`dKU`^#X+U^kqm3-?vdjwgp13u% z;g;f1ootl6inmHbS1aAaVn&(BgC=S`mA3==CYoAPT3*Z(p}E#>VcCfhCct(WG{In3 zd=HBjPM#X;A&lb5T$hSFdDsL|u?o%`Uy6AwG+`+Qj#~M-eTUqpdwc7hEUVL(#+#vX zt@2UIYpPye5i62834YSkyGfZh7LgnIyQ-em=d%Gb?*ZoT=1MzgbUUq(gwGu%PzhJ{ z*LwBdj}}7q{8qO{v-YKQ69-q^OWg?^-)lQNNw%avA*Rksi-2~AmFENNK4M>g@zkCa zCPntQK2Z2b;B?TGFJ`KQ4ZBDa5B0Tz5>PbDh1S}_@CJS31nP3V|Di|(ZQ_G$nvb!2 z3eN(S{5bOg4(=hok=wSMp9}bdD^(Y58dX+4+zs=9EIsNJ7!7QEpb8 zp5W7Z6UxWm*X3kgNZfI+yL4Fh@n=`DZ~GtABwenqQkK$0)|va%vi!8LW0}>UJcgrr zr^S%*FJP--Ek4`g6Pu>UB2;r?_!5G#p<8${)O)`5OGC{FW{;&O0C;rJD){ziFQlk) zl3M)M9M@-LLkk(EE4l=O)C-h!8U7q$S@FAkn&Ar7GCr9o@saNnAodpx|9a|@Rd%cP z6X~L;R737YtR%dFpX-Wi4h&`fiuY-$OzbF8t?H%W4;oV~%azerZ!I}K6?c8Lg1n?t zi9fxzaw^?dW-_^mqCVFoUpbj zUEe(2B9Z9;Z!91}AaU@z>c0?46W@9tpz~ZE>u<#YB>WQ(i86rR`N^}HHt+PbWPdtH zKtjt&=4?>%;FFHsu^nutV~Si>W4>`M2fBZpAQB*uq7c?rgE%eQbT*0CY{S;#v|iw3 zcA^xUKx0L^)=X`nKZVzau#ZXc0a%yCW$NH+^WjNunR?%wyRFNJn5^`I_YkgR^@1bP zxM-7dfT2YVL$ktt4fEyF%vYwso!zaxx~u+jGWK94yo%P3rqp>F8Mc3J?X#hOHQOck z@$Q3f+gPTphScyz0dyQxP~)r-Jud^)_Opa7W!VXb06Iz+4%>Tq0@KSLMLV5n@ZY$E zG5zs}UABdt*(fgxDwrHE@sy_{PvVxR$r*e?_NtJ>M!I+L+YVAbsc;_()T7b;;U)$wt4+lQ|^`d)TCOXYGS{ zBn-pSeQPn9|8bVi-P^U;X(~Mt@=Z z)cS5VXTO6~aiR$WSLF_CP&uR{pu>y^O|2_j^O4e9o`FOmiW^nDktM1HRXWZq7)>07-Eoai)R+dJB| zVxyn5#BPRcNK05(R+)5w+ARW?Ab?60M6mFY2IRm0ub6nR#Kd$01vg*8gwC5Qhu z(t((3(-vQdIJ`hd^vi|SSRid(@DTqtZ(VH@*z|x#TU*_tNwBnE^(DWL!-<=qWu`JA z?AK>6!SoH7qpnpIi{2;9&nnwIb=A&A?U!9KXcMRcFZH-HfHUz0`&g2BwPQs*v=@XE(JY}2<{*jwkI|1!2B6PT{5=S=?$ztBvraNQqX;E+zVaHiEX zesU4L2#^C(had8%Op91(A6+qHJ$o5Q4Sy8JdMGzl0kmAqrco(a+FDz>yv5R;qOykF zZa9}U+Ik0<2hmZZwzk$g&A;p8s7d-lhr2U)L^&!WoKWimb&sz0IsGm9UVK)!iqTox zA#-8qW8V|n2|E^waW^~zp_Rm^a7D$t>91^A`~v8BIvRQ=A~_q?L_>R?-C|rnGh^2V z{GwNN203sAGQIaUDXxEf$V#CSN|(LtIZ9IiS#uX7UT@Y23O{Bfxw$M8TP^N;)u&wZ zDb!B8zx~Ny4kbzX*zp>!jW#4c9JiKs%(RwfsiOzwgCm;MN6a%pz@X<*Sju?Vk@s`Y zJ$zIp19`Hb>?ODdmXy)v8h_T&I`~J*ry#1P#{zm~x~F3&)FgHG_hi?hYj^oF_aRxA zr!wu_NQA}~L1gboFRvMm0^WZAb$yl$+_&44dmYjEl9MYdDG1)TP%POLaHW}#pvVLG z#hnqIbz&Lsk%df!h(HaTVw!GNreQ!VBV6~VpJG{Yy06TbrE^@US~Bs3fFVfM8np^D z3GIxfiq45_uH9D-(n_tjcsb#FIAYR@tR9BapJV;`&oKpq0C@Ik+jUpG#B zVG#<7^bxija7U^5IlCO#Er(lV9GS7#@ONr;zM*UpF9k+-p}Pps?(}-I;8-B3aY;7f zCG$Gqh^ILELdJOzTp)d^)Fwf4Io~8p=jnXD6Ed1LddiN|6nA#M?AGpX$8>c(ZKI$8 z{(}^jKj2T)?@fQ4X<&E-2<`OIwZ-g(EJ@kx2yE*w%%af~WI=xR? zwrL}WJzs7ZM#bAMElxgA4m-o++mnE;=^64;8|Z`*V=2N|3}3snuu5i_G#XxNY*YK&7Er%CzTXVkICmy8w}~+nGXBt+^qTRi`OVF629ZwL~Gyx2mxzdHD2i|KM1n@a(UyEU4EuDC_)ju^d;{g+h@=8iTiB-oi zZBzMqkd_|Bc7tMt*&O^a4@>^TlZEzGNx?r+4M>06GMG0f;e|%iL>E3s%#Tm4Dk#Y8 zsh%_#<3qi*Gn1jx7w|I29{CAi?X!Ni>MLvgRa@G+;(2xqt+vyyc!Q^uFpEW6F6b1f z&dl~?o^sRnJ$~o4t%YgKIn$l1Z;$H=FTes!LGR!BTOTPLkY9IbM|y7L!)+n727o`? zc6f6tH1y&hvTE}b+j`!F5k^1{w-NdM3ebY85S@+kn0C6~9ktrR3MnEfi-7L9mVTDM z>r3$l(yQP{$FTU?FC$??8Ea8~rRXsX3wqHlH(pR?H?A8-OoSuBAskCIDj@}HQ_Srr zG}Q~ul5DSrQS0$&sJAzEgD`StD?T^Ph$Nrjqqj|bKQ5a!*AY=yl)Yc`b5A(P8hF25 z`VfAi<0jdNzpJE14y|)Sk4_V=7ghCT8|CeZ|`Qt0lFnu$s(7 zYn!QF7vG%qXc^SwyNf5o){cbULWredn7SpOX_yc!U(Vn>8U1oP z0Mp#}^gN(7#fEzfwL6Yd4rx1mILt+PObK#jWtzo0F0QhY47Xh5f?S5t9R#C-t4(*G zTg6@~<(10Fe3%EJhQ=<@^!U#2PXc;*;=FIm+~vM+I>YgrCY2tC#@XG9?9DL5S5aKH zvhli1%RKa+W?yMI4`DCiXc!kUf+AhhNsfe1_m<<`fXYehN2!q@mI%R@igD+|; z=(bRjg;}HUoehKa0hL^DyP{x>xrKDAELe4E_fwTz(yjLkC)7_F0@POCdxu94&D23^ zI;wY~+1bb@+J0RSg0>^NzaD0Ye)}%tPiDGpA!sHCFKq_wcB{JBVzLxf$(x z0K<8X^h4ZWH#g(DHT-+H78dG7QKhN1As~Hj0d@kPp-uPaXzSne2PE!q^vfeoBVI!FX75#V6=V!zpo ze;!1b8M{c^;|nU7xQVKCEH^4P4POAi;EVKtSJvGacx?CIuf8ecQ08Jubd>r-@X8<1 z!|BZBxjajGt1pwSwQbxkSMwkRy;_0pKAs;4j1c+Rf=e!8MV(DznI85fKgU1Bv9iNu zAU|5J53yXE%6-qYnkp*?bq11S2lx6~Vrj+O>$$W6G3LddaoeR*p-D!!z(@QH`|0HE z!0tGn%;f#p)!}%|Jpps>eNC!{r*kd7=Z|Zy<3w=fPhzRmWd|N375wGWeAdI_WS*VLyU zIe25La(F|qCxG@Ay)TZKOHAK)>N!06F4yfBFT_aTUB-`kD?8AEWRc!QAyqcL zVH8olHUhq2PL+38b0jKe=KZE}*>YVU`@YFNz_E8za~x(5==ii3+MvY;S@NY+9KR<8@4WL>&>t&K=(Xr(Mx0M~=$%v3-po}wvj z121d@Z5^va&&KeYXxntr->EX_Eq2Y|mwqZX|qUDz10)9Hfl z?8n3?K>AV7(SHnSUjR!ALl&*pRv0aF$XBzOk7r)?<_PEw)ueiy z05)V%&h{n+oDVM#j8HuVgdUi~c7vki5=G3ci-$@|!}Ccd$GDa`K2v?vz}|q*S@m``s% z!h>}D#y{d4^c}((mTJzT?*Ier&#c*M3#Z@)!V>hVahy$xg>Mop#WJ|oPCBU^X*`*c zyDlOHooP~y2X!MWuWcX+GnN6Td8Ksw$GP1O?60{=I3CX(v9j;SO0sMwrH;x< z$m{bt0uHb8LjE!r^x8#au#z&7AMJmk+71EmvW&iQ$V8#)Q9C;uv)a>HQUIgJ))l3h zoKF*<hu2%so|-UO}^Fpn0)N?FXJ%vWR%QzhnM*9V82OZGJ;MEdxh8Ces}?y2p|r zcye{uUL{nFrO#<`qmkd@CtdM2@o2hP7*^9KdM4kj>5sd;!3|GcQYeKdhV&7wf$S>J zp|y5a6=}1Vd{g~a5!4hI007m)8VG#o$)4=U&uU3p3MV{hLzoq5-L6aGgDs2KEx17v zwLAce{2Xj8#6GW_?_A^XOyYkls;&3Z4`GN~V1Df9*t4Fd0w3pv_7jCSrhUAsoq+eJOt#-~>;76#+If*f5i8Hi}j=RHavJx(Wf#LjP#?%9}rJ@x@c7l)o1) zos(6?uo=_c*4`P&U#ZASr&^vTs@GanESQ}4HPneu6?$7LYvLr5SN2lP;>y2DpE{$h zuWiTzgXw#O>{41JUX!n5tM{YKWd(|8TlmxotdslE$OQrm)?bB-8eE)^`40BrvOITr z^l7L$_zRflG_-S=@UIlGC2UAsD+1S|!U3(fInevQ&1M7e_0qYn!b>bJ*z;85q$aQf zIRlk^_S*jppcseP7hE#f>#CVMhKthVp^(9KP%*ExIxZR~Gs|`Ki9b?eDE+(_+)X9B zfHQXtt=~4z4*A{pqK@SXX80ZCCZAfp3&|SRSL0_aJHebE4=>oVub!4h@ES4dl<2%k ztuv5+8~c$!+l3LbqZzIBbKaajhLZ7JRK9z<7}0G$!5O?;Gk0I#=)exg$5wY&x4Hbr z_dLr^$|H(t?6ry7#(ZGGV3)-(`HAdapVDxA`P&L9tJ4nBe^6e*5ycr<2~uFv7$GxZ z;>fueURtmdQ-QV&)i$z#IB;sv^s3(1Avg@~5i{s63ientir@3SbF1nfJUTn*y2%dP zX9hduYlp9slRqVY!@lB?U9O0DnBI~lg!NKTZMCV$anqJLeR%%_2iaaL;TtG< zpQ*201wSml&vUC3`p8>+wst(#P{wQ?`-**X#-8qXg)Y^uv)Z;zRw@(t)0v&}OTgu& zT{!2xP|6jWx2TTi5@mM}-831s3Q^6h3x{q4%~ETrh!D?5ZdO*tv{I9@L`mdc8DS7| zSzAEQF>7P*;QJ3omz1^uCnPS5u6h(xKhY;=@znBD0u`qvQn`6Km-G}=4BHWs5rpqr zf$h(A_)Ak(6_w(#|}ZPE8Jwku`3**McOP#h85+GswEBuN&0M6WyH0Hry_b z?$sZ^Bc{nzJvBoaRiYc~sT2wUQ-QUc;81+2U2||g`FmU5f9Wz68v9S84Ah&rAgxyk zd`S(G+_yas!q4^9GhxeAwEWx;b0{yc)2L-O{(1D0F(qvxoW9Ut4Eg*99ul5%qIF|=bS9s&XdNDziv`v zcy*7$20oc}(?&oT^+iiL0U?Uy{DXzcc=XeGzM&zslj1muPG~B}da$6-wDG`TYK$bk zzs`|DI9r&-tMGDZ%R%mGNR85*&C3*Vllbu|8h|nx&(mlYP8OK>WC>43pxo(T^k4r? zZF22Dd$pua?{}0*uJ+Z^T0SB03W4J3xL;Y%ZA34L4=GkIMRq9Xk@#<%pp+z1xG?eu zKsP*Y|8zbKwmh$F&X)qA#W3xjGym9P`+H)dCC`|xXIlP|H&MptNFJvGrJm%bt}Gw+ z>uj(#Ff-A@YZ}7@Xgx+_#GR7dE6?Vs?>iSrum5hImg+5yo?dpd-lusfab8OS99Y;3 z^t^AoxU{0SKlYgp&z0r#GK$sE#TWAQY=0T{anUI`URsTYFV_V!t7f*&GX^uz(eL~k z3Rw2w9V$uhE&#g+h40zx0O!O;O`-B-1Y=qP?5*67+3GT8YoDe(e~PvVRzx7NViIru zI#~qkTy@U@zKL6PZhI4j6fYR!`js>0yHkO)!OVZkGQX%9PZvZl`X8$D-5K|(H`6uT z{crAh%|mQ_Y0{BJVqxjMvA3RPSsr!`LwXOrB#%4)L7zXg;$?tBfoHq-L*IqjtKZ!G zV^6KNEMlm~NOoxSk%s~+b5^UFV+Kw+gB#eo)LgAAiOes~%A$1fR9Z zWw7)i>S1(j#9n)@Z+67>0WwSniCoK*L5Qi!0r0nLS;KosC^&ko{k z*Rqzl?jHWv5RX>R87^0d+kPbNJj_67&|1Soa5S@?wgQS;K9ht@Up3`HqkyVwb|+7g z7VG$o&zAdSZ$;{AeG^NSVG$=OkKxF+&COCg8291Uo+W4teDojy=| zA^|l8P-=?kQvui(IE9e8c3PfxR|rdxCy+{yL#Ykzm<9_xk$@bvD$t)k)r+cp{&)jj zz+)?|2$HNcQ9;~at+kp2GMD5zx{YD>wb%7H!9{U%xZZsCnj!M4DkH*TyueE-DM!Gme6LyBTug5NqyM)>dCJFnr=CXer zV~TcXjKdGZ9*4(#568a_@LwwO7&2T_#=8hy5@@Y;LwZE7RD9LiSHxTvREu{BcE}XD zri%np47Y1!mDB|}9XiQq~P7N62Qp#yUH7c^|<`UY#=wl&hd zzZ0#=B3?y-SQFkA8m_n;sP!=wJf00v#G9Vd97YM#J13XP4!ae8d=H6@3nqi<**l`r zYHwAWqx)%S;@x~?R7PvV1YkC;O_Q5Wul8{**UJ=}gYH?Mnfq+Gt*VW!&=aqh)6FTo zfMAcMf*FF)WC-Bu*nbr-+|D;S^Q82aIxj6XmOgJfJR%Mx2&tSq-Q~4i9K=~|cSLD& zx$Y_FBZtNNLiG8letj*;| z3$VJYF&jd$D6)$vpVc@on2SIZR%@?TN~8~wc6EIky{0;J%?IWfYY7>8JOQ+Jc%+t{ z5B>b7EFxJZmd}fro>$Z?jFfgUZYbxWCDVxR%z+jC5p_zQOSGbhdd8s#@hN9-F z`QKv))k)V$hXULSmrvm%Sk3se*ZD4sTTy(NODSj)yHBu=@n0VK@HT%Q5GRZ}$zU;; z`x+%3ExOM;y)+-Issc-I?F!bx9Fqi{<-NRigXsJ#eQhMk*j-t}0;AITkHc+7>5Uvo zLrwyk6yi5?eH2__c_n3qn6aj7OtT)YJ!WSoK^{Jq$NP0m=Bw7)w+ty&9)GSerz)eQ zX}sLn{8sXU@A16u1##H3L42x?OXdZ4pIUFI)=3(TmfB1| z=}Kd@>SvG)!MYX>%4{o9iEG17BYJ|2&?(m<+x}MM@nF>{ zampYHtx~luOP};=ZM{_$@{ao=P1@0b)>6|n6?kWWOfiVG-3@tQ5W^ zOhQqWE*V&bZtRoVRjXta_C<#A8jd;kGGh*VKvcpy9b;x#R>SL7c*9?<(o)bW*{*&q z;pkpRdmOye_;7iXR#|UzkZ!>2yCbN8wUw55oTPclH+3_rzEnDXEcKaUtXhS_@i}l4fmP1IoJZ&cRtuZVa-pQXeMk12)q>WKCMuu za|4BTifQlaxav%jy1IZaosWqYK>havP@Tg=YZjUAT7jCF&vRn}^dqvnHUkb0es`4( zkTsUbYE(l!uMdrd+s@A#V+tQl5tp z=Ejz5k>scE@bg)gUeLYoSL3M~p9?^3W522qfHp|58y-|ksQ=`iR-URd!5eOXMC^THiV zXMqFt;fvBe7Zuv>UPEOedWTUz?f9*_#EaF*X1(lOM=3fmffxttr)Q^!9Gs3bnZ_J+ zGV%$Ny5WG_GN>*Aq|C zSH4#c(iz(VqmN;d0PbDg$~pVjbv|JWuWCKL%2&$_I#xg2lb2kLC{#7#ENFuPUGwKv z`-`&NWVf1=2qWT6>u#=HjRH8O6`IKgn%MqoWw2uCvFojiT$z&*0#N%o(>*s^B9Vk?b)RuN^|m^-#1?QzsU7$HQh!-2L+{Y=0aj>&8AUB9@@ z`4-)-K?tMTVpw(i9LPFb#cTb)d>TG!`tk30-Jh%cgIvg20eW(L(O2uC6rd~?+J+t6 zSJG>qWm3(%Z@(;TcWINSOhhOdsvg1)@&uGE$T8h+PVbIB^FW9pK4uDI3HbMNHp z>Mx#gJjL1-aj%;!(Ofe+UC})B?sjZnXclXDd1Ta!$%RJ6%);X}GQCQQ_BC*o`lc|O zaq;&(^IOg=S-IzZ%bEVR!y1V|%%`k;z1V1IX@9&?+osjKQ&lxNPX!MhtUI|?xtt!w zruo@f67}XFNZ)=`#2nGF;&hCM1eA9MKj^iOR+|>Z!DW2I%2m{i{{-J^`>dYF04(-0 zVk{cxjm+`6E$9P7t+0?!auYO-X8*vNC_qH`=ip%Cele%rg1 z(@FzKh6YB2Mixr1Vf5x}#cbTqq+5gm-s4{zPidW|>KRtu?9X**@7-L{gGDUI_frU4 zC>vG;Rx~6kPOHUnPP0M{y1l&00jxmfr1&YxU=!LlypceenMM6##+cDumHN+#%6vuf z0f0l9kcfYX$aNB_ZpH%w4FM^kpEPLY zejY2ILJ9c4EX6B*oz$d)CgQ+4-BF-mVF5e87(G_bWd)~06NZ3moL ze%wn`%Br68wH~*@QY!93^L^19qVTbPE=(JFpQ9{zv&{)Wpvyj&>)|*ajZ13W9Jp?N zKS_4T%B{(A)&6mVZ>(yOBB0~LY-#!0VsBCH(d)ZVUm|81ISc}4+=b>Mh|MBSc2oJ@ zd{M=lDZc~2gc0GV3~kAvth#p8A5AN1yj~L0R~3=LCIC8%{xQY2C?T<$!H7B{_MVJh6P&3?qi^)E^R$H>4 z0D+N^{~*k+BnOSMrgM5~W_r1&SnE1;D2POs7w(tEu28kwvJIx-D!;${3J7)N3)@=b z8`z#|`$=H(R*T)(o;&hu%+WQ~n2i;XjuK<>fpkr^1%!c;Np+B>H7pkpPN<^!j+i!2 zDQ8e;s+Q66xRI#J@%8Xr37bBT-<%DG=HI!!H{CjPHb6y=c8cY&B2FQd^P&{*@;DHSbe_L>;;kS3i6q}BcrnLrVcvd}N`Ru>VPScq z!72-<9?%d>3~qZVQPG%oWzPIpeq|Z8S*2way=0gHvRIKRdota2b`ed)s#P z>$AYd*n%p1nnTbu^uktG^}j2`r$HG5?MbC!#{x|!2?21~r20n7YbD)Bg24koL+*74 zAbfW9JvIX_DzpMr!#gIyf=5>db!OE$72v=WvsB zzXk{R*QMY@Gv$}7(AtYyKd-Mq1u|6R9UlZ&jFht_xbkoL0l=E|CW^7*i}%*||CBGJ z|FQ+~0%C-u3yZHnVZ8;96)EpAiXt?(GK-B<_hW13xC^kS9k*MODD3t5v z$?bhOi2J6lg;t4Ti874|9{arIFkm~h3(jB-OFS5CWn1rvGO7VqPkbuYdS2rPt}Ag& z-61raej@IIO#0(+I z7SD3Ohbu=S5fo1NcaPUX5+ltQ-vSZvg>$mw#iM3sCVK9e`x@ykGGLt z_9+^7MjZ&$uByiYRNxKgeZ3BF&<0;BQ4dWs#y|ccY#A1b1GT~K%blgntALct;ML(z zFU?x8p^vgG1QO2NQXGdy&ro#bub-aHik3^RHYSrZKk{3|u@uT5{)=%nL+@$%BO-HE z?*XaOQnGQ&<{;t>f=C02oHc)_hRru~cV4sd1qDy z`g3WytXV6jo|`SFaXG0<9*w8vYaU!$(&_DpqQ$}67G-l1z8Gl((hT~4hfTP!jtoG{ zE()J}J62=&8PF2_?F?hjNR`o6kpDlfUhoiROA)D8{sPOlMtBj2FPKB7IW6;-N!bN;EdP-74uZzh*&^_jtMrUj`XzV*qi;;d?D0r~0N)^WPyDN)*_vWJ8fNb^(cP z0{O2lGJ={VAQtm4@hU7@u;S|5uUPGei+bY$2a5w#yr2nAvZFkq-=5|G%brR&-bI|) zPTeKZvblRhoUaoWQcD%NrLART!JIAvpdgUt7N~d`FN4mj`b-{h3Ei&ZDhWtr&Bizh_zxpekJS>P|GScnR%Nd z07?MoPdctE_Ko($O4Lt7GX%WUW4w2nb{|zNk2&aG3^dJs{w{H%{>#z-pFfI<5UF`m zi3Y|L{Bxy(h%oUHT}7o^T)n3B-Bi#OkqXGI`gKqM)Ek%y4ZLmlCN{v!m8Y&{pk@xc zx$JV8G@fZq84SkCFAUQ*>&Xh-ye8U}Rz*DCi;1F(Zq}5WW(ai!E~g7B+^&eG>aK?+ zxK{YMTyOk?Iz;>>JTbm<3*VEC@)VYkG61`;>se8)@dn;km^-P!u`btTyPVOq+E^>k zN+nP)W-m&n>1J%ZOf0%6p)cBA1hx$Ok~DSvXRQAJpu&_c5XnovhHcRNs_?Exp%h_KGZ?^PCzG9e0#lHda$P+;|uUukQ|aTHES7Ne2Ni zQ#$8u_UV9rD5pWOcrC)`g{f+HOX>}g;vwfZ5W4~(YwNFo&1W)_1DBT3@{J+RCG%YU(c_$wHbVfk%00h@&tF{`k-W@d zA>d9k_+_J4Tuox6g4=2z2=n)UPsgeXFf<9(@r20!)wus3FZ{b7W=jewzyHhR=YO20 zpE=-{0=A9k8ck7?5g{E^T+hIRfx(4WNjDi+e~jsFoTtq>Z@kF#5BS$Dj+)9^SR z06DII7hPhnW{xwt1(ROffqR*awe7?`YOOp9d$e&*OC3*ZUz>*;o&eTZ96&zwnS!rI z^2Ylw(ZK%^)&6g1{w)F1a}Z^$ghYGtK z8=xhXE6{*~h&+GQJBB*gEPd2+`K8V<_c*uA1#q)LRwifjx)z}|R^0|3cX*YWj{@ak zt7B!m197kSiKq9>_o>7=a1{f>**8Kp?6DvJcbNXa9SArvl`}-+yJfC(*Zz0ukS|DEis3s zCCHh&d8=S~45OudiGjxLigv`R6=)LPoU zs?sl7h)!fH@!SK0LUMfW{I;jCY@{;e5J2qp{}GA)+qTia0Q>W7k3t;A^yqoHS!%NQ@oFk#=9X*iIra0m0!XFI|;<% z%#b4EPK9y#*Sv6OGwOfCWjAx+SCex&4C$`e%33W|L|*MVDJUq@M=S~h^#xbm+*+#X zxBJIW2ezfU)6;iA%qq92jxCJ%Zpz#{H{VjeGRxljxMBK_+?!v8#T5W#H`!46u?;Ni z;@h82piO$VEDsIF*jXikkZq+I)BmISaLdC;XPVLb?^IH#CN=^p&{TY{3+Da*u=M|( zo&fQ_m=ApMR0}Fd#dT_Si;c_T(Y^G)G1cZ;8i!UQACIR&Kd!vjI5o8)c>ej5F)k4K z`Cn4~)yg9Z;2y?t`C9c#tlRcmeaf6I1Lpidvs8NQUV|lF>pN<<FNnSeDK8n>_wDIQp?UL(M8LB@j@fx(I*uUeeGoq7`^vT>n+%Zw<6F z7>a~{m@nb~)BOeQ|B+Z-r5Dc-(6Rpdzwh9G28rMHfTjfC4%{=6W&UjsTbk#!M4o`! zBA_0N%cWq&G0LVdl2B(cMIkj&jQ5XmK%hz-qw(mA1|{2UB>XUGQ}CXKPSQ*MV)acb zZqe6-O?M?5DQ#ZdwTy3Tpui#eWs?LD4cw3F9Vn%<@na%n#>_3*Ool!l6t0B@1JC?Rvd!;ik)YgE&0MkX(- z)mL2k>bsF=ddphU=Gp4DuK{XVI&H7{_hPWqrO8c0RBFDb;g^hwUDE7d^IR!0K%;izaF@3O{Ym?h*JPh`8fJ#6du5n@>a8;tHgP zjdivo|AR(kae3ts4i!)b`g=Ghxm$tHcO8hq)O$5^*si@>Y`a#kd0*M?-l-s?R9eO@ zv<67jmEAA}RfH7+TRh(DyfF2LK~;_08-g|K!5yYtqR(VcGqC_&~e;!4q4VS8MWyL%D5UH zI+Wi9U_)?cPCBI7Fh!-wW-*NX}c~ zN8UBDT+m+mdY3fuI9pP-KB)D~T)ZyuAaPQM9YqUh=r2IxsjEAV;=~fcP!Z9H4RfVy zycV!ztWPV+^L1yq6=rhvO`#3Ll9g_yJ87$9V!C3@zk52oUi{jZl9_UnnVO>K!{0om zqjSEbqBAp7xw=$at350~`CUmi`z&IEnkFcz(0HNS@AUUq#^au#M}AHRv3QxwF6$t=AC6(>;S>JE zSR<=lfib+HSRWqE!iFYQpK5zpLkfnZeap|08ksOM;r!r3qQ98$BXVtHMYx&KVBA_@ zRFmSlLC#j!t`0}oghn5x?&|LNKWuqFSHBao2F-E$#LDM3^rkEQ6ls$ z%ycs#FwPZ5m>A%E2h-cgL@!|=zeK^@kq2)SeL;aNiPKCQ{Zk>(`Wg?i{xG&u5T#Y{ z#3h>v3PK{q^FM|#Fb1}2^ z5V>V3aY3xPF83n`Q}41RPm}CaKg!dDR8hej)lewr8lmkf@#{%#V?tzD&PxP|a`D5{ zXF7`NJzQnA7Y!`DXu^h<1h~PDWaSInPfG_(1W>x=0q~i6xkGC= z>Y~@ll(ZkZ(N~7ND?cO-L)?kpK@k0A3#eR8HbO00Qc)M%)XNl|<_Jc{Lqp@Iar z^2Vu-EMfi{h1ejyIu=ZcNuVlx=t;$_Q4%x>krd@MTq z=qbW~t&TBq>L_a#aulC1gdsU0P7FU#G}!io*0)33`$_m1!*nNaw|uw|ezfML0Dd%{ zn|`h~zyf;ao`o&qqTgxBX;H5-nY~m=QAx}bpbAEfzc@cNuArA1X3GRTa3#I`-7l^a zhiSyc*RpLsYFihxo4T)eA-|f<6-awp*c`@tP^#S2)YAPy3F9vNZnJ1EFt+UzBnbhd zHJsmw5h=!DhBue`B2t@Ilwt7%MM_mGNBOd3yH7*dotE8~p*%W~IWKdo1A(Sii_IFH zg3bd2Eb;SOeV>D`*~&(bKT&n+Ie6JpaY5$4FS|o|AfIBsIuz#37T2As5G{HJ#~ilK zR$sR*yMb4E$7g&?$KIO}*#`>tjIEGL?nN20u{#jA;AWAS23AYXv2Q$+?L&$2Tw6)C zpbbbF5m{Y+0@-iWz5m41jCVZWBAyJ7&}wfa6UP=B)t1+=jhMCQqJ%CqIrj3LR*vIk zrVWh(l`1dlq}Tg% z1YgU!cbED?&oLsLnA+Z>m`3xX5p!bYWRLxMhayP^_5La$@6Sk)aXPAgWDX%eOQbqa zp;ChumCIe$UDTK7Aq>`w?*!Aj)-jGNlI%&Y%xjJr?@=KG)?oENiz>YfID`FDU%&a~ z@Yy6GY%&iNp5fD8SyEJ>AyA?E@{l>Ms?gN4cFyO(5MA86BqiIGbA374d+XKOOCW#n zD=+i`@L}>*1+1Gqn?qOFY|utXt-;VYGwnHfCLKZtk`NdOCbGY`^F1mFuHIeq=-r8X zzO#~yUo;DCZ4M^ex#*K?Rz=jTURIo$WTJ5mzy@I-hY9)B1!oWiOfRA`t2q#=&@olQ z$6W;B2hn$o`5zvg7YVzzNUUT`I!CzBx~A$_4svwi3K?}%L`BoeN1RKyCtldm7ggjR zyC8Vd6oIY&nhS}wUIT<^PgWhIBUcX8eT3~5HJ&A z#KJ@;|2SgrZXCF3U|{-y!@`1n_gDzJL?>ZI-}#Vd+fve?`9Z1=4U3P-7WxR4MM0Hf zSYdj-78g5Le7gV+cBwCv+lD#kv|nTu4Jp|1^Bq_L@5_Zp?acg$s7Ml}s8huelG`H4pZzO+S(3#lYY8RtY)v9#}s@nB_IGW4H(-l2F- zUUX4%-~Md((A0z^o%?avI{gFB)Uyc^8d@%7Ro9h|px~9p~cjj0(JN z=`z8T5wa&Mx9WNER}i_MTIU?fWgv28mv%T+$q7dzjk0^{`EaLt+-qvjEHcN{Ve$$ePi{(br%V_KP12cd>%PCnSyhc+YpE^Cl-1R~ zHP;a;wGQg_CWdUWU;MaO)zUxf{ci5wy!wOnI5j>s3(p9%72}X)fCH3Bt%>+}sA*mPAKo*oZ(xa~}*6zelbj_IH^6DD&BakgBqd?%05esA>yUKC{f_aR!<)W`)mH zGwE8|Z5%IJCeMlxZ-2?{r$pa$Z+r6G+lGBU8X z3+uTaXN7D7_-Y(_mn({S>x!YrFP2-^pX+v3NXi+LNMGrir{O5S>DrPuRm2I;Ow2fp zRU8IO+kF?ATqluxO{pas5oYR0HgI+X$YX*I8X13!3P@iFCZ`FL(;%s*oY z=cr&nQ_~g_Y-1QJG~LDE4p&Qxzg!MhQ>(GlkNG_g(U|YfDw_1cm-1H5>y@ANkIG{a zdLIVSeV?k)bzM6HAOt8K=^Bu@iaK472GJ(d#|(XdcSz^|E3X)Q6@ArbMp^VDY@N$65D6KG~TOZW6a zt@;-E>Xffrge-dp;^I`eRV@^xhuVvWo}|xPgdbG)UUJUYQb`xmBAN%jwJcb zH%iqpl3dPbRU~nfRThbct^`t=!$*HVK*+)NT;2s%5A5qCQfha>MhyO#)sR$|hzE&X zWTgQLoHiF(Nl|WbXU2-O5eFu|u>~24DN^*@AF=c!S&!BG!jrs^t>pR%AqC$-&M)_> zAY}=bEu?YqoFVo}aG3j3zywNn$Td?HP8Hit*~fi#M-K6Vu!gX@B5%dF`K86~RZF`V z7ctJz?1w#5l0SFQu4j(8&{iFo_)?#OE@Wp!#*fvtHq|TVIg3sj*Ud>ewPN2+_s_*< zIb=~Q%E&DuUV=&vRy`6cyQC|%venhePgnXd z@u+HqLLR0S0edAN)vX^*>KAjdy-#S>iH1xWpN(G%Xm&xbU8`7q^^@AWsjyhAL#4!> zy=}cFd>JI54r~eYjD|j zwVgeUzqP7iy87|(Q;m8oy{SDwv;_(iRrN%>nJ1gD1@Z+jT7*+#11&M{gjq{tFGHi$XD9RHwZYn3DVq_sY1x9UIo(jGAVN96;M z3UWHwjyqFDgA8CJC^ob;hd%TX{iS%T_a{X(&&4`yTS83uN=gwYqY3Hi=!L_hJi^%I zRmJj}5)BbTU%ByD;#a*K!*XJTTQTo2hT7L!d9q&4`a}Ga;y&<**;dNaW2QAkRxA-E z=W;N2a>4iuZlG0poq(Hmm?Py~BfhZrHYNPY5`Yw2F$+TnuIg7{2Y6ew&h^}Nv85@w zzG6idK&`H|@NaXJ%Hw--WJjTxM=rJIM(oIF)4I}~6#UBZsdJm-FG*5J!i7h8Uqvhex9!CKlxK@$P*gfT5kr}lS@D=bAb+>pq^Ej*?R zl{n1fc1X=GfTzQ(t#zF2lEq!$jPS>^F<8xW37h3Knpq%o93w7=Sj7`**JXEd`pY}{ zV^#-`T3Ebu!)s#g$9AyQC~VbFAzK!nz+!?~iDtEARY|llLtiilD&f*y659_`m`Hk8 z+s^pwI$7SK1q|3)5!WbL);#mT*?Hd9ti`2V98xK_Q#IFFtr>cG^QO0wMl}h&9cWlo zhi9-qRNXuq7&R+O9iwlOZ!eeoUOszWV%YG2WugQMo5wvRdzw$?=oTwr*lJWWwXaTn8|NdKD&UugIH8$_;&SD^mBp;si5F0oXxPPmxvywU_gwMDPokTp9U zt)tC{?|3#ao9Wv&QdII+DWnZR7kc0P7sZlOiXFoC3eEKAIr)Mn;sGs7z_oOE;Qlhr z5J`<|VfZNy&1&v2Ud}Yk5BWox(jA-JNX$zu-Laq0Hmxxkj=}G& zTSR|V(>ge5`kp@(qC=M=!5VZy1yFhQE+!=`Rary(+>Y{YqYCf@r>}DDac5f}YD|f)O zSBP$`&n6FAt@LV1yNLb;jZ&05Is1aq8Zv{x&JKtWMEbydwYhgs+6&E&Nb8Hq2OC`_ z(xi?tM^*EVHLf(!P_`={T)f@hO>tVK2n)Xllc}M1&8#8))E)X^b*L5-n9@lvY8o0M zH}=<8UgUQHdE_16Nde5dC0LF^4>O$ z#JM;~OQkwXDyVAGl9+cO3W{v3rOi)*7(R3BTuzc_y&*%`KlC-Eexh=%f^1gg3s4pJ zKxAcWyKD>LzFyEtJE_to5Bjxx*Zu-)FK{R;791WXo&1I+WF+xzWN)hZ^`i$jQF%R* zdNWb69pyM4DrMLjCJNW1_pvd}CIHl_aUI0Ig0~MT2G0?XoaP=r>%iSIH8|)ou~vQ& zpmztUuKt?Vuro~X zU5c(MnXUce`RV4^V^F<Fw}?YdH$Sl`wx6$Nl~ zNnk;CQC3;sm@Y{?i`B!^&<&z^_%0dQ(*h%b=lq1QVy~C^qo0>X498+zE54>p3KV%L zS`fAkuo1b{ty!^KzI)@u;#S>D949D)us4u3=Cce*P4G~-Vw6jsiFk(rU~cfz{bY)% zY?>wh$INNsJ#LKP%~Pk=pYFEUtZ-@|+Syz2#S}w$ z59=?8ic6C}@3p^M?u>JSSsl)5WOlpC00v zBAd?emrPDKzIrwdUL(@hmopJ*1BKFG)uu^|dKu-swNhalhxePa*qzPC%~w{oLBTXg z%JA9Jcq0*p5jCjH+ks72cDAU@*R^Xck3HydRa8l(K=z!H%i+)cQ<2HZ$4?G#`=JGm zHMy2qoKVI}v8s-w5v;j+{Mm{y3QTT^cTw9e)~)yOXOE>oE0AC5N6$d>Z&(7}Uq1zN zZqV8Su=Rcw;N<2A!IUK%@wx5XJFIOkL56y4DfYF=#v^RDSZLo;vQN!c7?sN&OB|g= zwWLc2tDzo_hdvG0RWNyfMUmKI4{%_eUb)?A%397SGiwA9>><$mDQ?Rixo%S18O~{v zz^mrGK=T*W;WUYS~+XUfxRP0F|>OPb7Oxe-y0~L?I z`##;`gu!|?AI3gh2A-A9?TVrMtgXU5oTXTx%yH$&pbW4;gf3#X?44$=1d>T7V~y}X z-d6T}KUwFVrFk9@Fs0fN?25z-KeLTSQC&ov_AIYj@JF^b5=Q#L{;~Zra>dx$%VSVjidlPRQ+zo` zM)DE!OBy%3;=LO_(HZ}Y%0D1Xr0@v4CJ@0mFd3#WrVr`f&8bCL;|Mgd-^88)y&4%` zpy}POc??c_4ZL3wqsL9|65m{((dJHNE_Oh>Pbr>7Xo%TX|qoQHDRBXp0%9FJ%`oqlk@*_}D)ps`RIHy(VZ`X&L%BQ! zSzxl+S}^#L2*87``_P#NlgoZp5Vf|Mi;fSaR>bkS=`zapubB(xqMVQmFPUOwWgSN~H`6lJ`Sy`G1 zKqr$_F^MwIVx+;N{F*{qb_*S9m!uQQqJLJS)1!fH^x zdg&1^y7A;|@ic?YO5LGvG%8(!^fpqlCnAc&{g^5%7c8)3r*M|B>NfOb2#s}UIz`&yD>nX)_nGQqsPil6fPR*UN z9;|1RWQXiDEig6pZQNUw_R8=+QaNb2 zPs598JYf)+%C*6za7wr_tP`xhYp@DdtTNQK?&^hrpq$f3|;aLb4P> zNSnptiN6_N6b%wip3!^^U?(7~?U=0)er6D78}^Um9{;Vxt@z_78-#l)mbFsdy#{P= zgcpoTe|Jj-c6WbTmcf%JXSe_#E(C80Dy zG2HW$t)vE*MVE{xy(amvnVpVRtF-sv9}}r1i$!)dNlT#5D;ld}A3Q$@3aU31WN36N z8oeTf@AKMj!6r>sXn0R=J{w&s+sm+^ASbGI^J|v1SI(hL>rRVHw`|(-BEEQSxh*EvfDoel*uK)v=r*UqhdTkG^PyHt!OL zw~4cztL?batb(t$1+3@$3cg20_Sok}tUZ>|HK^!s-3m1*Aua+2t3bpqpUH0XMXQ?n z+_znnmA2#NAiT2+__r|_5u*QW*!i0b(qG%3Y5mk^?;W!2o0U->VuDVgJ zW*;z{!~xnvCgci(+Om#npr+IXQ9(>rb9e7) zM^>|tM@?S+Zl~t)=RPFK$IRteUw6Cr&kx5^Z0djA)~69zH=YO#12{9n*iRAe@Q~!% z>$8$22vV?JF(CL4lQ1>l?v+)PX-#Py;iKrBtu=J4Y0=XXRn+1=I5EtJ?avqnIS4Yb z>trl(IJ988FF>ENTf?DF>%NyBd$DC3$sfza325brAw6MMFrwgUA;EQ4#QPH>ZiEX0 z{A76LOT90BU9zqaxpd0ztL1&En1Mhl=^PBP8ilD6_0nQ_5_o@fBJ;b=MLVEWoj>SPAH({#TaQmoN; zS*;Et*Hu%Tt8AjB6~hb>YlL&)JcF~1aU}4Jkx}&$9AlY{@2^vqr64!RV3a%k^dZnQkVpu`ysPf5LhnzrxR{q*Fy&hv#mm23ycH8uV2e!RrV z_Detrdd0uZi!QmS#YjPcU$e8Js!=;76&3;N!MaI0L0DHU$_ z#3pZX6POMa>aNIB(v_&1_wQ%eK#958G0wq8*siG?Qp0pwIB+k*Vm?-PY{BljQL+;n zuvzG5!(n2ZjA(rbVwaspvzK{+`uIaiO;6P zDn0&8;SxxGdp*EuL*FHg}N+l2T4(Fe}F_o%$-myNnE8Ti3xkXNj=svrE>XRNb4 zgah4;71(6zxkwd2E+E5 z9Pj-CQ1l=Hd2^OVL9YR}%0r7MDKWdjJn!ctP^O^|%lyWbd)|)R@%fp?_F6gzYOhX} zwi-g-ci`n^AuY$-5EUaLScC*m^q<(!84mo(1gUt>ogbtr&|!3nsA(M;V=`Lkc6;!JoEqg=ZnbX4xu@>(CGb!uop}Ixbx~+IFj3Uv>uD z>cq@(>8Y7%=@;tPLqIL+QSz?0{iUZWuJ;2+gx1R;y?10o^F6npHsc!Tu^@6+FGLT3 zbsDThhi$No?WT-qRUBI^4nDlFIeuwCgA>q~)wcBD?x|CmEez8a99oaI+T4PB3Sv~8 z`U?ax3Fx^~gCu73%ZQZ^rNXXUzE#dtqzUUlv#QCv0K>+C7Gc;3)7?WT;TaM2n8RxU zdE_p!jCd>!l_jBo@mbGVgb&mDcd|qf%neYs(XY>apA3aeY5P)7qi{?xnJ-J)wG!U@ zuA`_oxCfg+vL8>Fsap+M(RB!%IRX_8e|6DZz7J=)VYf`^X6Ed6%$^hCH`5m+P#4PK+k{Gi};&NGtT(Sb$ z@F0IGD7YwT4pUsujo6avY2`Uz zY6Muii{ZPPz6#y|4a;=KKkNZ0PRKY8V z{7L+U=CO7hlZe4z@E39w8m@7+Lj5Y+`KBlZ>Z3F*#x}f*q807VzU+J7+Ym8^P6k=Z z7woL`Z#h+EV)kLS5%M(=vf6|Z@-m$~&;bgk^CUpj2p}WvNZp+mp|&by=2o35ckv1< z4`SpR4vZd-<7TVK@N``@vEIJRx8Gj2sqhrLr|y(g*ghBUT?2j-X2 za2)aK!b9}^c~gNm2DH0sehq_ozMZgDAB_MDV9>zJ$*@{04P7B$tSd?8zO^#-(kXzq zjY8r3s0AP4NKm?ir3j4u;zeOM-HR^vh=&)gbjltgRcq_(c@L_G$EZLN%rB}h6&Q72 zvnZ71%>}u?RbH-n{!IssWWOgjp6>>qF8MJJ;tl;HZi{v3XOAS2_TOM(t8&mN*QYs9C zl&s)vJ+PGhn95>MPR_${3G6|wXa%=i{2J)cdGS%InmIHUIIoadq8B=)v&|KX3H4Gi3jy|L=xp(#Tv+MTpT*WNKJz`Op6yN(vqD69V&;nf$+{j=;)W4Jf`y5n1m6K*IQxcWK}-b<)sqY~ba@M0DOo|TCO-v$ zc>qGdu(emXTbW4M+%gkCz6E_vq^x$$AA&62&uPgBV9J8O89zFT+^l1n-=1$Cm8xMp z&jN406RNzRV(^2?)d`xCdF{C4Zh_Gyhbn`Lxr$3Su&7NOyipz>%c4^qp7<%F1Y>EV z-4@5M&Yz&^GN1-|Gg*7Y2Juq{>3tqCk(QSJ`#-<}%xGvgp7s$LerSP<@v&Ja^UO`m zA4IRkHH(QqTN(*E8kl0e90cE|I& zcem6~ls8ck%(`y66)@|Z$U-`iJ14=c6LjO%#*+oJj(*~y_hBRG_6eXp?ppRTZAPLL z*Ok9(sM{?-(p#NGHCO(L-AzBs7)8qkZa3o1l-a{wL_vFZ`n&C|`)5Fna~NmxV#1GeE%7xA=&nyS$A0TLmZ1GaM?hwk~!#|B!DbI!0mAh`--hhSy+LN#n#Sc+sikQD-LqqWN~bdZv-&l}$ib z7`GB^(5Qro_$uVdz-zH=OeZ5>E@jdLL|7JR@uN2E3bndgD{y?z%kxFz`5Y@{*;{vZ zb90mznH^t`omMn_r}y^y+X%n7Vwo z1nk}KuRB?jLFrl1QqwwXrApxQXq`f+QvCfSE$WeE?Vkk%!>{Au)d4o3piEAazoZyv0~XN6u5u)fOkF4Z$OG9bP~ zw@86aAoLk_-$$!Z-t!f0q(31HzmQp;PbU`;Hhnf%E0LWl(6e>z|7g7A z!iLeAnXhZB5sG1yxh#mVRLU#GZ=YTTr}wEwpF_b3_kt+Acu1Wrl?=~L zu<=w7yT4^afmUfq%}{$%)hBvyJj=vMm!~<{ojI$;!=hO4eoiv?t5PFB&UNSnEqX$B zWcyRHo@|!MfOrr8?-4~vX$6>vjq`WeOe>Yzt2$}}N7~$jfxh`}^>&6-2OQT~tfQdl zC1xya#z#UDdt%&og?YbRN$pQg^S^bF)oiO*?$ zYO(Cm(&^7 z587O=3&Y~yepAchO^Z^h{ERJyHt*Of_|tapz{0pX{?OG=PC5D+&8IPONqUaz?QRr3 z%-xtugifW=L!hvK<5{J>KxwJOxlP_P?8nfryg)MLI;4MM-#GvuTW;sXE&*6{iEmD7 zQRf)jr}$Qi<^%4v#{7hmOu!-J2tCsMy@eDfR{jx+SLM$)|fX~gCVh4^6a zdYC+(C5!p^@?@a?RF})tPb=AZ_ZR^^n~Y(*V~oE<1~oTEL(yMn=4x))`O+>a#p(OV z#aF8cxc;C`*-mvTId(&Nbn(1vvQ>_OXLIH&uKtnR&{=xrKb+wu*|%T#%)WXLs5>&F zHSUk0@kf_3Zx@_l$TEPyBq;3t$XSVbgWCL4i|hpEh~04h=XuF%r&gD|Bt1^H?bbTt zF?Fu|E9a$hw38Bat_o`t6xr=*)TO>FwmNj&`_l0C29^lX!0srnK%e#@krcX2pqsEn zt`JF1vEg4fI_nc#&V`tbbZccOE^<-V=~c-!KYVQY0Tx|((3V%$)t`umDqv4EnaD;% zk8+tsM+m~f%38aq+9x*E!Axh09?U}3dhdBxt-y4?xRj)*$GL`GMG2Am#ZKlS7@%9C zZR6EgO^mirWjbK5Vdm2eE`&6R!MB3Z`jO$<^~uLvUnL5l)^m@_kqq6Klw76?yWpPE zHO`b6_1IN#pzHs^Q(iJ$4bN9%=gn$Lz`KzfT}&&r8oo}fIjBpkW4Wv!igdHiK(!pw z0vJS649k)`#(gwu*PY`py2y1+BMazu-_ra9O^d_Z&!kcp8dY^wkIwi9<+gR4^o)9Z zS0*vOo4`m4x1I5b6t_1A;D<<;NI2EFWr&3!7aDc`MYn5 z7e_{b0~;Fe#i-=T?#wYwM#+P>_a!2F8O1re&=7slw2zD+TV}Bdx|%jLtK7u^IieW1 z=#B`i2(tT@II4fs^66g^8CfR;ErIPU_3FZF@H3ca&uy_zaHM^)U8r zZnLjV?^UiB=LbanMzm%+OHks?@Hz~vGOiaME7aFk;a+f>Z#mt3v-UV(O=}MLRGWDa zf6CATXVW|usN>FBQc`&;h0%&5x(f7Hddj>a_GFvGA7P}b%z`h+C`@$F*Tveq!IdhM zG%|g^QhRrDCp*4Po3E_5Rju2dGU8NXH-tbHCo$CnDm^NXeUAPAQR(45fB#)gsqVOa zl~Y3c^7=9LZR)}VY8Y=7Cn&|`VM(lm`fl=#MVc5z0#y)XBxbl2I1;Jv6zlhI5A z<0*Gq!LEaHNm1Y$lBe9A^_E}Qu3iq>jG+sAO93_t`dN(p*7s#rV=A0Wprdc0gkpig z#`2V)wi{M;zf_iPX@11(?HVLCAS&NjV2pL>E>c;5HJj$h(u z-8t4hhowU9;?#5dkTqFjtn);(V&q))#)ndS*)(=Bs1YQB9jm%a%2Z~ZhDk}#4=)0c zTWe`)k_;z3w=t>duj<4E#IAHMUfDHoJU!`Qr}i#GBB$x*zs&;uLJls-g-+ap+4AB& z`C_}y*n{0Qs`B(B^qT7yzd*#G93RO3T)%v%0#r*N@Ds3zQ)!G!vfyq5LQU7G`ur;R z?(Xt1*@GuTKa`mRxz)+!%i9fVt4g%Uw~WZawZ&A2Yl7oUf10XyMi?!pQf@6f+>D8k0?L30w zgx*%)50(RPPt$4n#dgj@hQhLF7OnF?Rbm=f{h0{KB)1{i=*5hD0(kY(5}a3yc(4XO zx>Wnwisv9Iw9HX5?g$nbaf0rx;rS$rjsP|u0cJKv>|iOOX+K;c29bS4`5%9+_IQe1 z{03=N;Jbag;NoQDwqq5s*kvEV>ym`3fs8L&R2c+j{tW86HWk~Mulc3(@%klV;uZWs z#q{-5PM-aD+_D|!`jCiJBnn2)k7lT5=k@{S+OycElT&lgCH%vmCsxoO_xHBbmQe)k z850(DAeriV#U-p;aEIDfD!ox{68WPad3JYcJDZ~^o2x3-Wbt#Y58qk&1?P|&c@^2z zm2r9nQQ@;IaUFa4agfvFI`^ow;*eh+O}11BXfiHXU%X^TxI5^H_H)o!^fR7h2?DK~ z$Z~^k&-Tk{p3J0c+j!;K%E|q6)C6d4;nj+)!09E z5f=_jqP~jH-l{z-*CNX!aSy89TEZCJ+}LQBRVcMOr||7jryJV1On~G(X~H^%IhzEpM}B<^u?!->Qil>svj;Mj{cAU zXBTiv;fyaQ$!MDsqaVju)iJg3azO%Sac1}Sm%;@MI3cX`gukB`4A8vTA3Z!ij_(kB zONo%w9$@D9`b4bp2r zntdoeo%m?{VS_o~9L$Xfg9aA6Q;|OabEOxx7Mhajb08mZ?p#(qqh#~!-B%2MhvTu_ z7cs%ZBN~tP?S{u~^#j!)-UP{qxT({YLWL?m+Kn}RDIBl0U+k|bub7!lEWW}5;Igma zeCO1U{f!`Jp-2xfNGi{Qk>s9G98&5LOZkWH8r6jwD%LyU*>3*maT)=Yu88Hj(RkmP z9I(!*u)3i3kF4Iq^%cW(UQ}BP3kbwg(4)XI{NWr54SM&&^rh>sen(aw0Va5-W20CA z6Hg7}U%eCKyRB%N#@*Pg>sxv6O-VR_{*~Tu{ zI>VVRaio`*mrE!^kNBmKbevPC^kA2{p0=d;$56@mP?^~an5$n!+Wtb4lrMp3FyC`i z4KBk!xYJ{(D4R_K_gW~wGgndBcQQemNGPm$u3Nn}6n-+td2Be6n^fLZE`v+?q29*| z&vCfMV%jO5na^ICqeQD^`pLVwOWE-t2LglNN90;#R|=skB(D6b_%D3~n9D};y_)Gm zk*nu$wgS$yR*fbCc76xStEljfx2Vv;)-jX3pI>vutriWT#sa9V)kU`O4yNCkOQt`D+Qa;{ zt7bmE8$e*c_?&3%t)*j_I#7KK90^PL`U6c3ay56Qh3SyupT_mq>Q-2bwmoTDBNq?8 zw7V{yplR)zN(h(LD%Ti}W*Spyjnt_7wlmgAxRt!@Io;weN3+Ey=*e^~NmS^a60NI@ z0sErz^=OUZ+G_QPbtDb>DlxR@952UhvY7isM0nuB33=lA`&pS&{ONe5PwSu)R3`kw zOnY@FKdZAed3C}m6`Mf$X4KrR{fHr9Wni<=ML(Y1kf`tKOT}t@F@6|!)4^nqsH%f( z7L6G|sNw=k<~B?3>63kzq7jyW4%Q3xOPm+TkI^5RFz-!EQWZNz@Y#8g;H{6A@MdK|U4K{u88!cEL**Hx%YOz_3I{7Yl+x3RWrUl-PiB3;lZbnKkT=okqqF`lwX>ht^8P;>i)PwEF9WM2$lu)ZSY(%EJ&<<6jVt@NP`;K~?kq}SaAYDxQ8K~4QAm24yg>!<5;cKHPr$orz`8r>kAExQt`mlc15FFUZ3b$od7M)3q~|B(=SeB3U! zwzB0Hyr}0<)YCISs|+w_>rW9VwK%upW`L9r3vhhW5UFe9Uc0V0H}_3Hjqmt$hniVI zuYkQIfi1Pbko>mooO$f<>NJTynfUDdBWvRo^+?%4$1|Kku=3)y0Ox(^uTiiXqayYK z5kUxgp?U^-$Q6)hKmkGB<@}S-;mwniji2|glK1iFV)H^X-6&?>%s)Ppmsz^PoPO1w zk=TwmT5f7KIzU;~ktN&#RET!YBHpm9!>*o^fSgVip3Qw32mOfqDj!N$m1Eld5`B`? z>wO27dT9C++JW(y*VJAcAnB2Q6NW0q%DN(iRBC{Qe>nmje%+eohEr_=*k1WJ>y=_X z;@GQCrC{T`Or+c#NH48aEy|jDEUPfHp63&J*sC0AroS(0{|Z)-DWL?7_hf(Ab&g|I z^zTuubc^U5$hO!=YY`G84WIdDa?4uE>7!JN0ZR$r4Nc`@!W91F)&hjQ&dIG$nMzon z$)W}MXDIJqeP6lLfG>zS&eZ^8cM_;do|UixLW03XK};O0L2TNMFG(GxAI)q7$UKoj zCWn)=E?9S1-O|gyO0f9~K-oKvecdO|r9HPW;Etv|m-z+;vxA%tZ_5~eVfDD5wK5Bn z3J;BYk>a&5!$3opl`4?D@ z7(KE07he1OW&iSgF%gzLSj&A78Cr`(bj>oBv;zI-x}BSs9wl8;)8Xrgo1=jFlbwza`}bOT@Be6{XF#l9cVz9+c< zeDDAGSN`=yaVR)q30_G0o3+4mI;?k>M~!q@U;q5ixB9Oi@{em;z%kcD76xP>bpG8Q zuu14u*d7d^27jVw82|epH%sv#;$*Mnqv(ITH5G^u1{zNRb4Yz^);Mm*fBWtK%YPZc zE>A-*U_#OVat)X}wa-lW2o!gIEVszKciCXjUDyA_w@tX^uI1cq{5?uoUO@eSeb>$M z5PdcKC%f~~W};=vb!rhLT<|xMsz5;7ZZzq#_rI9rJJt(3yQ-H|9$WwN`rohl&*v~5 zsMAwJBIS>*@S*oaLK+PueVYC+CjQTB|M#A>VjzH}XhVJc(*OH@^4~2$iV6Q>KKRT3 z{*Ay0F+7?L2?V?+P@$Nx|M^*>%i4tF}?Y_h+a3jV!&`9EZi3W|U~3W<&p<3oO{ zoO7a2H^JSsdVmc#97!iwQBgO0Zsk3#3BijWr9K~TEyKu18kGb=_Y`8k$Dla^<%l8yB(@>Y$YcGep7`djs1Bx25UL98ZA>jF99MiJqiRHZ%kS70qsu z@G)@bG-U9i3}lzS=;J+d2r3!44(i$ca17xoXt!OS!btUeh%WpTqB_2VB4-1Z6d6=p zL_c^83CDa_oTW$CeHb*MFw}CW8&2s9-?qY%|8qxfH%w$=fD0C-I=KH3X0iAcro7n* zs~&Fge!&T0u>jEmof}pr^>uT4=an#Wa>Xmxg6HKqkOzk zLv{c2xl1n|)lgeZ60Um`EPxND1p#L@u*S1F`pL93aa}&B*dJ6_9!@tPT_pC5(RA>p zluFEec`>+hE}hlCJ10i?#uMg+b+Hq1crw+o5~VV4vt7aBu+a=HV$6LES;BfvOPjG> zSS=6$j*H`T`cri7pMsKBA_4mH4E*#ifX#tg?zs-wV(TV`$~b?P#5#8FVsm#+?vdgw=lNL+u6-j#F^XZ-ShO!Wp?j#CDrT4|Cd zMpyGOfai1b1E-^|c*~c=C+N@M_D+9w(9gL)m|)(R^HP?Vf@=CtOXz-Gvw_N7FpTUm z5pyIRq3ZFcrw+^TNLPrYGuofAo-O#fT+DwjT}l&AaaHIGAIRGaq1SPl(jm<$s&cA>D4*u9s93XY604Z6EzgaSFX5!qzAJm4>1i@@x+!BCx~%`T+9QlZ43~-JjKqr!Ss0W9hel zUd#b1KvSvE#>-5-Crewl`}0J%Jbe3QF=HwXPPU|%_4QokX%Eogv$!>|c{E4%q4Gq5 zI=#it1cLbRwE+I*zv=N{s=$JgV4qn87sF})-D&dx=|bQEB*QISkmp}A+`ne_|HZG+ zDd2G(!$Aia;{Ved|8}t(i0j#!Kf<5=``@JX>jS8uHk5b@zTD;Cdw6IB+KbzQ1r{y;Fw^(nyU_YR_d3gaH;krH17`)pP1*$@qmzSd@ z+Cp1{X%B7pX5&iB$|xj~ySuv$et99Xn9p;)3iCrFkSWq=h-|7foxuzv(5UuB#l@VE zuq{xn%AB25J7xK^52B1Ah&Ct@*whNIJ9<4GZMH@OYVCGKm6esrM))3}7gH?SE|m4K zPd9#vp}XND)j+xc3-oYtUQ;2xBlF-{@xg5I@NkEwC-p3GcQiTsCc>J{Ar3qj5`>p+ z@gAzjavCXd3|}AxDiwijCX=wAKYxx^TWT9kmA*={>V7l&LnPvB9)4a$%2(9juQC{4 z&uR)4+Vf5wmDi$5BR{N$XgVTSk7eZ>z37W=U0B!ajpbA0i9X*L{)u|JGZo?YjAXA? zJV1U5v~!zbe9a`u);+M&$Xr`$r`KVdP5sJ3$`HS3>Y+))d-(^;X}h4xV7lB;)b!Ff zLX&0A=6u%rnd{YNR^h9ud}Ze8ddD)l1~K}ENWFpNf@4%WnLGtrqlHH9)6EefjPDH> z%RvFRi!jn~B!{_wSg3=wzf6|m-BgNdhWwkEI6!!RYRW=a|zQo zUXXJiG(Rl2-`AMg$N?MbbP(AOx}MJo#l^){E(PQ2^~DP)YB>HZRrbO)%_l@vn|D|V zIk{K~tGim}bh~kQf85J#tE}O$Kc=(}o9_c`VlDE>Tv zl#qL@f5`8o&}>4k=8!7L&Q8rraf`n@PPs=f`mM@wmuWjJLN&yDe<@`&QJ5r;F4m+} zc&mhilx!(PI=%j0akA&114XfiuY`#+8N#F6inANr>ScM8bUq|R@-sL0t@hDqle@b> zyAPu95)7sQ89IAdp`p>=Gs=~I>Dw#Fa)Le)`ryaleO_DLdvcnWPhqm2!r2N?$ZZSjoyZ{X=B61W>4@594d~EHRK;> z8l3Hyecym?A^zs=M%?@-Lnk4hH<$5|Iz^kqI1=)0g=%1ck!&!BfBHa(eh>y@vF4~> zzpzjFhK1DX^&t6KNopw9?aReBF58Y!BAXUuszL8(JO

-L`-B)#{0-GlZULeW9SD zi#Fz!?Dl6k!uq;aQ#juL_lW`2)1j71RoDrv*&5<@dI)^k>*AGX-_vGK4_xTQ>Jji#MJVxak*4C} zZ%SfR2{HP|dv<$8?zegAjw5@uQTJis=&SmRyQ}S@z%Hsh2FcsK#%r_bTHCyl#afFn zt|O#|tF65EO`oWGj}xu?(*%NWq`e=Z@gsJezN7?E2i2D?z7gZCsoGL4PI$kEWd-q6LT^4M!x;_FB7xS&XYDOCX-;i4YPF$s zj4i$cUy|7MdbcVtHAlTwspxhx3ia+#fHZ#n{lw^h`f#I)!cjEaG*8=~S>lMst1apz zfqD=nYnS{8zPdNvT^S2-;H~%Q6-}iyrA^grj$~1bev*-q+3{*7fvS-$+VL*(J95wm zRStif`U)7fKRj1|@>yPdpWvl${oKSV`RAtgl0$m49wWJBJ?eJtcHxM?>Ir`ho9Xo6 zs~9ycly-2iNx$1vscyN;rPZ9RLYCVvMBLYz6=ZT5AH1;wkT0e>IEsSkYCG-Nm3yLT zqvm?^hrU&iws?YIa}nph`7^P_muJ^$MML;gW;L@kmEEra{3WPGUmS<*I`ae=nCrxPQwjM~}e3ypyc>*h_jnFcs><%Y7PJ6EUMo1Tw}kyJl3 zfAe@3%VH?gdRWs42DF-uV>K4BT6x8My+0{R-7N8OLiz%j4x}^_EA0g4Kh782c^Ca3 zq7TBuk>PRa`_N2trJs*^(Z7Xzq*$}zZHF36e+Qw5c3 z3o;_xQE3GCQ_^u`UL%eJf(2;0W1)3lgIY?AuErZNj^(AcR6LJqy77I5AJt^P;5so;kHuQ19mdVp zET>DbWB=O^O(qTf8K?Rs0h@_~(vAXSlyo-NgzxollDp3++9&9B>QC*r$0cyXrfxQ+ z+?gJ=Ir>0-@FE{9aPHSLx5%N>zCdWYsz!g$OJnDbxR-TBh(V9yuLfBY9u+BZ zao^k=52*UB9u~V^pVb~z7>&KJw_FkIIY{Gj++oVhq751h#b*g%9^jHy#0k9y0bRKr z17G}^{m$gjbFlG6#S@aSWRSdcQ{8S{yn;eKCS2*2Hq&TNSdf?jbzGk)-8y=o^k_IwiO&rbe_iwOrc(4?9 zkkyuZ={BLT5H1w_0wY6)RFxfh@p6%}?>qzr;Xn2QXq5_T>dGGQ0HZ3th2qR1rCw)O zV2IXeI#cQI>&t}|QCeF1`=(y5Ca1~nMyuvb`)z#(@q+ukt49w=0XvpvT@fzj1Gbc_ zt5@|j$D+#J;as$3nVI%;wu-3>VFc`Sbh-P2DMv=)-90_12hZ`XCAjQ&r)}S{va+JD zHJr~g`*K(;dhBsVeL~Vkp@`_nmCu8ma;!$m>b%A8#`N+~ylw7rRQ~!Fu>tr>ssrYh zR{koLaRBaRafF+_^)P)`I>5PBwtov3G+UH)DsWDDzv?DZa@$L~q7PQqXlkWAt(jRsI`k?vl zfM@~;^Q|J8QA(iGnq9hZ1Fg=Lb zz?4oQb@wBP89FS*(wltv0fIg9N6whv&Y^YZ4|6;{#GwZ+-!{!sX*94bjFYdrs8V9b z7kMuhA7TbdiOZuC*nE|EL{P+iHmRLiyo1qGLDB>H#k_~~@mlTN$yFoT@h8pd*Sbg1 zK8T@}Qd49>cH(d8=qT@G=NLN(Gl(C#DbG}V`iA+AUBz}4Rf_8nTA)_Lgf3{JhD5mX z09sFpZpyoO1dqz)cOWW95-tmXLq#&eKSi)SMYUFUJsk^-n`bp1|2B!1PSB}K!hkB(k zfM}C=C1Ip85P5m|^3T5@>NXmS;!;u*mv~a4oO|!^Sq#4Q&%{2})EU9$m(8IgOt5~o zHrZQus5|-kBR)-|QwTe`frLY`dk_xoXb+itTfbo2Y&{z(s^w$rUV)v-QN&*tnZG?F z7LXn0&b>TCE6t%|1QT#LrO`j7Q^4Gf2u@0Yr3Cm9FMD}Z{S%DOTzil5kwydie|_cp zXBQWNel6$zcHiB$jRX)Bv=FqGa+@Sa3HzU!ov2wfS_}Zg*NfM(@iPGGzI!_q>T79{JGvy_J0V z6JK17LHQ=<^S!{qntm)QQuJ2{UsM7dVoJtO_ZKvsjhuT;Wwsr@v~M_+$ZacmZw#h= zsL+V+xFOH8RcOhc-fmyn+A`Ap?WgczUtUpN0N~1c#_Zk8Oo;^X37;;ASnY*EXv;8@AHWp5VuCILvVWc>4+auck z*`z`!TFYs1qr_u(F0=jdDYPkp6)-#8UIX3N3dX#({7Wy*kgE^X*1Mf+c<6iTWumQG zO|`|od)19)y@;)c}k&1lU6c^%0T%Y#OKic5C%^ zTY(K&1b4yLyv=j>`x)=xc!BCe`6ZG28S^HIc3eN$13+hdpuzmfwp*iVfJ(j>rKP2n z!0k8c$dDGJGk$7Y>e-Ur5UJ@^Wjd4DLqFLyBE>5)Eub>q-NV`3Vu^;&!hmScWQX^= zm<{0_Is+US;2p~j%xw@Dsazm>I*=}&CejkWNDU@Me+p{3f0b-5q?$v(aSYZsP3D~# z&>K>Zb#Ic$a3LCG2Ue-M8fz-NiM>gFjn|4*X6pDHRzK(_0bsFRFHyN3!iG>;iZzD- zdNTi?(9VoN_51VL0-m<&`WF#a*4A%$d3i0^l?ZZ&qGDo3w#M=v&Z{6&nKzu`NW`~%en9)3$BUGSjv+gy_UJls3*4VK-49h;uGlz+5Ob^8gN8(5J&2&E5!+IO`2x=}kyh25U4oWE zf^WX^#omty$@J^Jx-bCF^K=CLYK4SJU%m1j9vOM)E?AIe*LC#c77*BBbz&Zu>b*oM zpoQ4DAtEByTGwIiN{X^RK`ryMqA7Mf9aEGQ5D*~UFw9C41k^xdadwl>&aP7A({-9Z zF-q)7vFQ5ggk}~<3>$PIFBtjZxAhsKan4Pl%YN<0Xqx!5n?M`p@Ln9{m#0u1?A^V0SIS z{OLOO6xB zymyta*~D}B=f0b|=*N1By9Oh5Gi5D+dVBybal>VeR({L%MC+#<>Vptt^sjyonqn0c z6(4_JB+|%Ab~0L2<7S`}<``p$kBSliLWdDkcM&+Rc36yl2psVMIeeFdtLegMLA>{C z&VHdFsD;wyFP{JA&CumBvboRT-m(twUX|dn+|Dz2A;@C-Jb1Hijy-sDh462smS#?X zH~?Q*nQP$hCS7Dl;3B!9486e&@fce>18Up*T)hZN37aK0v)OFUu#=a=6)D2UAGm6e zs&D@bPkhT`O=A&n-sM?+B|k3S=bo$GMmt-1Idedj;*#zpJ6CJF3BaLTqzoJ%pPbE@ zKQ5fJoltLOBDNq!4}wd%eR=sj;t>f|9`z8xXCFV^qkdh?sHws)9CN^5A&gpdU>+!?& zcMm)rC`f1OpD+;|j(cHd)T6(U2)=%ySg|re_dTl$XaoSWeQW`O&T-giEKEyE{oh0l~F7Y{MM^AQ?hdYqQA@Vv*{in)B!9ZsQuLwl1U6LcS>Jb#A}W z*f*|zgDRF7`@BO2op?2s+N1fTF4RVvuLZS0YaMR7zcoJ{J?UB50O2XOzC3}h*zie>71b>?1ia!U44XIqDm4j}!nR4kFl&M0Fs5Mvj<5*F%on z!Nh*U%sf&#onJ9zz#2xoDX@~Rd>ufk*&5~VfjeMZG_kH5r`Rp?)KLXXv;|tfA*&Iw zD)B^$=kmu@a-tJU-9VoI2g=%=OF?*aXT;8F7)rXY! z_QaCuur1fTYBQhu@}0+RUtKLa(LUt`Yuh2TI}BOC&8<;#<24;Z`XK2d69Q-1O#Naj zzoW^tAEJO`7PIlbZb5o}fb{lSZ`_Cdru)Wm)DBW~RsPShEv8LMo~7DKNR6CW>1;(8KZnm1yo0yGRnfLkmjy(tKhps&y+ z7LH5Jp|mV?e=o~?lxXgO2juidZs7+|Ezt)9{;|tKYsjE06v>cC6^jYQ+o&i8}8~kod9e42#@QEffo)!86eyNctBLbLb=33{+8aeoZ}p zb@2OHlx08%vAmK^TOfAe=N!?M^dP!)=?uY~EC6V#Uy=T<7x#S}hZY5OyM6h=L2-9?n4UCh`}8p(6>B4mYmbE+8rBsev-?KRbB9 zW;lSy!wL9Ddv|mKsFnyYpVF|83nP zak~=1O_rJ@2M1^WXVi57QMb0Tw3k}R zi9rQn2dWbrlM_r@miE25CNXA8;mCkexR90N>M%(bv`ei5aRnx(_5yf!~_| z5#=@%&t}XKD0G`EVZ51cw^!0CD)H&kk%+@Pi9;x{>hCh#S&Vtx%Pojgp!F_=&JrL0 zBFhKy?8Ekgq{-Kj5%`U#>oO`b<~rk3xyGP+ZUqJRl`EO zG||6B!}8a4RT#ca4GuS#{70Sg!%9b(A|>iu6t=k@e*N&;ogEXS>GB-ileNACSwQ## zs3;OU0H4o~9angGtg{9eG!3QXik5Jp-veN_wiSi7K!*8gT-2bju5a5y4+A0%O;mLB zBf8@rP)&XK&H|kbG%vWt5j+0DP~X{`tD}Z|&tGMWWsdimFvCH@8Q4-06%Fi4=mS&X z?i#_pKTOk4zKqZl6@xetE>L_K>g^TR&7w4Z!BFeGSL0t;y^InM==&n3@n<52&kuS> z9)DdVC4>x}a2H(*zB`2}{g;=6GbNB5xIvVabU1&Q#s{MWDXr6n{ZsTu@@8Wv6Xi`H z3XaR?<>!l2MP_LHVU^a%8#N-8_q82mkaWvXzP9|%)6yx`yuDsKv3>jx2fv}EI~3cn zO$lV>Q%Fvimtk@^_$)xd78BHj!3#=2HlTt4W1J|`lrlMtigGMT-6Sxt|3ck^@zv)c z`d3CzrM{lvF~45FaHRfQQgpSWmM~1xupea2b(9sM71<@3{6$2@?#`=GiZ)X_GgV=C zr$z3YH=x_SOSt*9$3(5IajrV&i3WsCWPsmAr2~=yunL-|uN?$a$9VdpSp@=A5Ve2WwAz->;cqrYScldp!|Cs3156nGTL^IK(m9Mss&J&@ zPeiCAaA{Kp%I7^6GO^Tjv5XoaP@K%|{3uAT7N>7I>nnt!l zPlxef8|%=}sHmysKhKa%df!U$%;AYM06Wk5WD1J2OY-3eRUMSbY%UJIth`F}_K<^W zG18Yo9>k$hzdbmPhZ*FM7KwSBM0mk_BsM{U85=tdbHB;ho9&4h$`Hb8f*Aw#<pnjbIS&Zohzu*up z1hl$#-9=hDJj0>RZkK$lz7F5B!r5pJNQ7buY^K9b;YPWz^&Af6dWVv=V1v!!pMhO( z#v-m4+Hf`V6Y*J52(5b=$trHw*4Bot2e@&Cy|LooS&|>pN#fl3Jw%{_rtdmyoTE|M z!e0E%-kzQR^kJQI#1a_n5!{C@a6kEdq-<_McE8=$jOtr^;DsPFtv%yVjis2^jp-cq^y_! z1EA?aa5+JBEk|jrI6d8~5Y(AqXS>rCvb_Ukk5Rwpst8>s`eqMspR4i6HLf30`U3wS zil$Y4-UO`o9Fl>8d=vB-C{aCLj2^v$4DYMgun}T^oCnejqoIEeF*1=<+77NGtC*;N zueU@|0H>p8&pSxJMMB}JEARa7_Sz6sAm8)WVP#YbXsg&AEjH2;%4%5H71ia&#+d@4w zUEeCOx*T;;+44`rtL`O~(sn96PWjazv2o;k;={xF*k>)gokZ&`5y0_AQ634$rJtSz zTj6}0LK1i~F$a2arP*JThonN2fw-HLD#6Expl&lHz_;RHX$eYlwK~Vg$F}S55I~D_ zbHrt2eW=w+`=gYB(ycPX9c5?HZUA^V~)!F$Ce%e5AFgxe| zg!ngP_5gtz;xJH>essSxhdhfta^HMNU+{x_*SXvu9a-9r><@i7EqR*H%eP%J%H(fo z&vaS~YkCr|XF1QUHHvDj_12O(*16BQ%91$z z9g#>=Avdmz_8u@3$TV5c;xBJ*YIvO~+2IG$xf^*7!$7%5*Y)wH)vR!gOYD5ikCX<; zbWq`Sb%yEtS&F$D0tt4LNe1%B&?|hD>5?Fx4nT9Do*xH#wBB%9U77xX zUEqE5!)KiUdM4$O<&=4`rlB2{Yt(o*PKm=v6%bsRw|2?~aJXyi9us+O0!VH(5a{^c zq1rr98xvT&)XzDBMC$PX^4E_$Ak*9=(Nn;iVUb2Txp_=$7T|{PQ5pDQO-Uj zOg2hjXb@KyvopxWgVD+Imz8O$fR&@PxOou`7`Cz~4OcN=05NP`OF;^KcJABym6s=; z6QgU_tC)9G>}e9^rZa3TT~rWdS_X?n7{_*g8KL(pvFJ|Vv8$kJZ@=a%_2dOZGxN5` z!>8XUCE^{{F!)I0J=iREMsp&>Q+Vdex`CQQdnLn{&gbrJI$a(P7-p-8R&OlhnS`AE zinx9nDSprhGfb0Z`Zl$kg-8!y)qn6%rU z7>?yeG1I&7)As^te>YA&SMH7BP`a-^F$R<$;x^GBHyZM3{_^7k1SmQT{^LNA6nouD zZT+HdKp4$a4*B4mOn%q|?4|^9S~vj}>E1C6QGix|I-PDwH*K6Zq+}Cl@d<)~+OPGB zt>rB;%<6H}t^M;s0tu1oTkiX)EleKJ7?iwj{GP@2r)U({)NA$Hax9?YR50%*Ti7?t z{1C<%a)&~hlu3t|M?E+$N6uu|!ZwMT2sKlath_=|)$d2vyRb^)+Es&;N_P_X=yO zTiZro5EMn4h$u*x4uTXB2t|=zmEJ)>x)kZ5s(^?{Zvvr%^xmtY0@7>fNG~Cwgb+G= z#8tj;9sK)W@3l|&i5CZ%F*DB`&$!z&@{!604H(c__)70)>E}wtY5VZM63bK zaA0W;iL(165>F1@(o~x?1@}is%dtNs;+k6vJSMFY6{`Wm(VGEl%*g#&q@z#On?UJ% zc(Lw$!v{&B|9rZM7^7zKjn*u&=lT0Y%dGA5q0Mf32_^WnBCq0r968*2QHn)$$oGwe z*a5Hzg+XRcx?@9wIm!9m1LpQ!SjMdv^3c(5jK8yyzvsy4%rJCn8PoUJs_0aLml~&e z*<=B?e zuE*t9_5&~p4<|Bssp1a`vmMD>*(6kp9TMRH zNA|xWL(&$wAX-mQ+yEZL#Bpk}1(obk5xn8da0+Xnjj(%b@AxsV!!{jc)Sg$pR6ROc zOJ4z+aH8nzRWEWk4W$hP%K}e2#ATpK-qJ0NH4nYuT z-N7JqVrcG386Y%6;b`Cs`~r`O<@xE+0r4GhU$4in)#P4Yvmeu773B}l`Z_?za)i7@2@x4j#*wiN4ZTGZLQ z>O&p{JWcGHD!s^}_pMg~41^8y6A$sH@?=6Y;1V5G~k8eBPxth{rFQqy9Uq#y!MdJ#3&NShfi+!<~<3#oL!*N z|J%`~qg=ycNaF#UT56ILpR?HYW>WcWQfruZo{+=Dpage;t>ewYi^Z#;ga80|uuOQh zb~X|b7)#S;W02Fjm&xe2I5)vZ;5+6s;Ls^ff}9^N-Egemwwbr^Ke)CY(WhYk{XIqW z%Qf-s!lKeWq1$6^Xy6RL1Ml{@Iv=4iD6-cv|BH;5*sm6BG4z|9H`OuubE9J>>OcrX zIv7A{#^zcurY4X1X58zeK-XBQq3NmAYj>ONd13J3)Bmi6(7)iK-oML=6{;C z_;TJ_^Ew7})~80=w5s-NJRpNp+hQ~w0W=CSSAbZ10QfNV0(TZwkkz1 zY}~#LzFw#1PU>UuM?1=N=Qptc)q&S0PI!L{$;Yg8XN?R_fvU$3f&BCAJU7^KIOE5rwq_M8J9 zTmeL~Ud-h@yp;y=b`5&yKVJ8+Mr0H)!uNA};lehNg`cq?(As%SH7@h-o{VeE&zalZ z)s&JZiSX}mf>!Zy#?ewN7+{5zgVDT;+Yk7HcO1BE|2%YbB8z^vUU4xfcEo~-`D-?p z1^}Dy6cqByxd(I#UJ&~8xW#MItcOEpP3&757SD{oNU&~3?Gq5ZVT|hX)^7%n8RehH z?DMVnEy8a$QltEp2&#AUVO>9Is$Fb=Z}k*PF&axiM=FwVIdQ>Bc9UTcaF(BqlR5B_ z`y(KVrm!(lkv(g_2wY%?^qlo_I`G~+lmQLO~#XIy0doPho^yS;H!g?M%JPXvknFz0a4AbmBUaXZxBbCEW_PtYYf( z-`qlX>Y@5JjW2$D*aK44BnAHLC#D0nr|L1XKv5IdnjD?e2;H{2M~zt)ZA|&!IQhx@S%}zd)*t z0|>uedZB=-;{?v?F$tUcENX|8DGyZke${ZRFbXL}L&A z?_-$BF<|iIfWEcg_V5P&<6Aq~pF&S;E;-RDnT_PB9;g-o zl0VyOuATMt>W~~9fRyVO*IWo*+)9%4^XWyfHD#GNL;{&#?Ij2k95~ZV)ju~){i0P6 zGkdAPr2g}@NQr}_^|q%^(5rn`o1+B>4}8HrR5~v_(QkAsnEj$X-!*C?iv^DN)0kCp zeXgiqjfo5{0`4qr)%u^L8?PljvQ~yy#_$oGl4tZEW-8DSB~d7>;-H@3$vZ6y1!^4B z>wz&cuo0d@)On5S%eSU#di;kZU_&8U@qB5hv!^{QXhCJhl0{0 zmfsCJB{(g-;-8)vyqC7VE8C`?5)s9mo=0HAZ>RLoY(rp1w%vo{$79CJcqLc>%fif6 zbsjUG5mvjbfE1VGh4RJBd7_M8Yzb&@2W=Ot0>S@oYd9!tSqfTPa?^H~D1IK{;fa|+ z(*B4lBa8xXgm-)SN{|Ms(xOVxs$*Zu*q{|9XU-~J+3 z1Y1tD*8hSp|AOiNmn#Vrz=e2I$u|lA?uUJUPx|eU0$1;UcEW$))c@Ce#m8Ov7gFWJ zoBt@pzlp{F`4d0Ak@3~$9@Q$-e|dBNeoH*ug@GPyvXB3xZT}Y^mig+SnNZ9%^ndZv z|9DBzU$_gU#bdMn{UQF(_w}DIi3U$LjbbpMr+@D@r2}ynW~3=T`;SlZU(_{70bl5L z1g@L@%_|wD!G$D){W%i7LnukQogt_NiWR$xoN0I~wrayq;fCjb@_4=@ZXhQR790EqR^-x0HT;`0=z?Gd z^CVIfbr@j(04(w$g7c#_257YZ^MEYQ2Ml-wGx$F0EH@t=gxouP$_OR!*q%e)Oh9Ob zz6m2Boawv?Q)YeN1t;&~$3uC(oFG;1*lLWVGm6YvAZ1o2vdj!Tq3NDY5YWDPqDOmH zMreJl)<^zAQs8jz%JXv8he3b$S`}nc=2z zy#^&#PUE~d=q&{t=bkAR{H99m1Jg*oIx}$?%87}QZTx?P!SNm$x&bsnt z$!6vJ`AM#4NH-n63aocC4J* zz`1vr*p7Q3iAVQq3ofgS8PA(0;{PD7B#sG)Yk^9bLcJaBE1u>JFUP_xy?WO@|yhS?2 zwIjTmwbfdMMnhP~C`NpuoW%CV8pO(g)WgZ-LW7nMaXjAAK2}jPIW6t|X-$@sF)JRD z%5f8>h4A8p18=ss;Qw_9A{l?(95Ie=eshg*%B&ijCb2SeA>NentKWk{y8TL!?REe& z++?mptw5&pVA%6_iDZt_4RGO&`jz9>z18oYTk9<9*W)`rJ-S!@UY!7+cQD^NAo*}F z#p4cx&#AxY-j~&&`+UJN?dm}bWS?R>>3zPb?bZ6XET@tpPgP*mtImZ^F(k*{Qt6Xhsn*rsJ$p41|rFSPINq4LY}?W!EEfJa|X+Vs7|`Htq+4WT-e zKhD&GePU2TZqxn?mUzDfaJEy zokxeH)^{kuU8n>xc}npt$w}B#ziMx(>*2JnCVBaoGyXx4m2yCAtid@ny;-)0Wx0S_ z{?7ERt1O_`R1kgi@~Va$w|D932v?HI4AYvkKKt7(!4Gty&ypu^QvYWA`U_tUx23;} ze+mvN_%E8)|GXqzrCZ}23LNe(77S|_x?vbf?;{EZ*ElabzS|tTeaC! zaHX7{;?4iLf&cC=1qM8tk9h;FjO>56*#=U8D{21!V*a0=tN$;>%qWl6QWRpvZ{sy& z0s#w*qalzsqV0qWj}uHxuJRflr|_8U6jh7n*4*UM;waNjvue9R{7@F7}LZ_cbf`rzVlOySgj%xSvzp9g-T<&VAiO0R_*NAA1N^^c~xICWc#(_SIPgmdSN?CkYkL~;%n$lY}d}P%RIq@uMTMJR8QD8xs_8E-HM$(2|F5wew zJ(Nq})xQt=@vT{%;7dKqV|Loh-(w>aKg5%&2yH;x>iniXj{Gn*la<~@TE#{~Qzk1A zxHS`gG=U0(y2^Nq!iqUPVBhVc+Kotxz1RY9YMWHoXl0BOfxu(NN&D*5)VunxW=D)F zZ3=iQP@?N+eKF$_ms{W(@Wyl(jki9o?RcyLL2n?;QStaQSGs@As%2&q9lFEpDotFB zX{^>Bh_eM*4xH>wD`a8o;is)S4~yrMO!p`fx9dz@5Y1JuzFUR$3+aba_EF9&s|R^) zYH>{mt=G1uVBYp#SY^&0EHf5;60$KeHK3nnUiG-U!KqeS`HT_fbE_<}pjuOZeu-BQ zam5fU*gMSup>yc>%@_Tlt??N#W4|qbMlu{Fn0P%2IK$L6pNwA-*jN#@=2C`%&QdK~AB_Qtrh3!?-BdS_`YeTpDA%83?e z9Yo~S8_2;RrAKzP&C$qZyE?LZ{V7vtooa37{&{=hU(ki*M5~G46j>NygTbF)j2n&b z&pK^#vD(DZMT8wm68Rp^bq*sxz+iFxlH1q4-S#iRyoZ>j{4T0y#;ekKDor%r-`W?M zIr0y_1gVAB71kdcN=uAke}TtV&O}JR~;i`@vzu~KRY?S89F#S7Vy_&2^xQSP`3HyW!2 zfeL&7tHM_V#9!hn%RHIO z?}qu$he)5&ic38>RrL0qxIMJ7c!2)Axx8&Q+> zi_9-BPLTjOwh!i)=A?)r321wpcQlqLh?L9PR!?S|iq$TB^U4tFI})H-98ze(5MY3+ zZ>qZ+@S;e|DhSE=J0-AnatyoLv^7V8R(6}6!nmR_epcVA$|gtW~Gcwpt~dwTv9((tF96nw(swLx11$DfMvQprud!8#Q}|Rz4=c1 z@thRL`@Kq40s&RVWP2gU>fwu8=yE(Y+lrwB(sncRgrTY|h~E3fm%Bb*RlaQ<*N7O+ zbOKKpU-~IZoDr&|=6^2MHA>98aN?SMoz&}Xnp1h?7b$LVg*3XB&UQ|X0j8Z&+!4Lk z+uVC=r%8=9i(=#G`8Mr#Tz6R;t+1#BtXirjTjj_yW}PUGfM8g^-BLm@x%TRa0%rER zhk_uQaQ)Q-G&a+Tb&$#D?3!5hD2oh3;a z-xJN8AH6b+j)ZPrhxe&HBMU0NG?C96M5s8m?0)j4P2Lz3rB0t`&HI^!+Q+;(%(Hi@ z3N7{=u%^M2A_%!o!!~s&8qL2stg38_c%B6Du-NE$F%r_VhF_zvUt<)vZ^5Z}S}-UDA1!`@@w$>mT2`1sQYHRs2*< zt8XUEksBOQpRi=ZLtzjmpeS@3Wp5~ORtVj4PkeUq&?rEgZ|I630RcJ3BRtgGo2e3p z_r35m*9zDWRPbEFi>9kjNMl5|iTT>XpfiZ^dj~U&5T~%8s^be!(ciJdz^@aFw^8hv z4)TRy1l28uhOLw)S(N7*-5g}%Ep6LeH6l~k zq%)HDIY8ZyNO>4{?MgzTHn&a>PthDnfOFjGQ72nI$9!)R?|>90%g7(Y+7H3LJLj$c zu^FKS-Ej^r9OLSVs54@ZND+>s(J?lx@+t8}k2&0Vxa^{C;zk5Bfn%Z**`=uA&T}Zo z2NBZs?eGz6R!Gjr^Vxv?py&m~Uq2E}ZLlHJ%TaO%AI1{B)tIj7#VM833u~KWz{ZH-+sp$d7KlGijC<#~fHYOa3 z4=2@5HyvEsP@O$~diV0vwj%fH$Jxiv+ZGZZg`z8BRiJS8Idvb#(E=PDQKh#JO1S7M*+b7hPI>8=DHfyvf6 z%q=j0SDv_#cxT4uk}@9Z>aeFV0MLIXEZ2pOUZDMNJh6hU(&?ZcR+7D zj^+g2oG_h&=;ZUj(@4Lhun>MF**j96|B-Hv$~`tYi0@FV=6gf~j8 zD=d30r!tW#rQ4avoiOQoAZT^S3UJJA9%fVaBTu${*N;j`zZQwu#geE;`b(CYz+)z1 zt+T^L10>#TnG!!|M*_a^80-{P!5$LPNG#fJ_*tu4(^C@V6t$<6wWJlT4Up1JPqWX; zez@sUIXl?MI38e~?wOYw;KM#tmoBsL^&})5W^?m?>WTV!tU4sI2tNB-tU4CTaF>x) zX^TPWiBcjbF9{7iQF8#&y(wH@l>eZXwC5~W8?I&pO|6ooB)Z3y;lw@Xf;JBGH%BIQ_6JXvE{^I3dKuAm#Vksh$p zZJ=7@GS&Xo^LX9--*zd0o`EcV$c`{dj7U{>2pO? z-5fbF8M;`chluEqDWhe%k=FTVY@SJ~za_~Mij>;Otu!CJTk`o_XA7`063!UWenp-G z3g*9-$y|jOeMdJpqF;HcbGnOG-mO(u~W%jVSVtrDFWyp4f zT{*w!Jo0CjH(n)AVPN-C*hV9DjRuc2V+6sYe5LHP$2tNA8I0QumO^)v<=Zo)1>iQH zOSWI0{q9aPnXab|Z=}{KcFVZ#eGmJdZ05ByH|S}b1odPwV)nIh=lK3<|Jr@GIl%6_ z$eT2 zXr6e|-Te`)lsEzeSDxNS4tIy0Jjf&w;%i>^^*$=GUKvT=?pCcc-WDy`v7oKUdsgH@ zGDJ@~Yktm0Jo_w;IecdIppB@tfMk{J#WJH{)}sCZ*7NkQ-e^jXLQW9$oFb_A1#2H4N3 z;vmx`AEd=*rHDRou1L4@1zDC2B^XEqhhRad8cpc^Qhi%=-8lr+?gy#6JVKPC29Q%h z)l20KPP0YGm{!YTqmV>^OmbWifVwkQ!rqb4+0Pi1sv=cUY^>I0P4`FfR%V6;6gyRF zKVp~kK(L>A ztZmMCOKQJD3PiyDX3-^+pF;Q>%o$GRsB=@@i%iH5F#&!wT2T3s&HOsw}?%&-~ zI19B^N8}QNL@-g3BrK^I+8vj&%?zyJ{(|SMeI9SvqDrx3Z&p>j$rZHPBkil_bxXQy zj>Ir_E1#()vrQ)anrrOGgUg=W#Ur>*?9A|;pT9f5Ca?gl6^r?R7_L57%=KG6eY8zX zHM}o@+ChK&<>K~;;+I8MI-nX^-iy!;&fnH)AL`y!RA1sWsQ$t}$(vH^wV2tps(OKW zVoTNcqPqUQ##@Pg3Cqe@^ zc*}jpPxrI=APA!|q<}gkS>c4ZzcMs_*ko*&^IaPzMx{qG;$LS0sCl#S0b`xxB}wjs zv(Ct^%M(*Xh8Jnh!!AQQ*xvaH$0l}q$>y5-MlUh@3#8yEqy(DrC~U>MMRRw^)U7l#i6h>^l`;8TbNue9pdf{!$)Z97KH~wR{v1U#r{$k=V zf1U7+_~>=}B&5l$&*LV|k&d$hH*2JH&uf2ClF8y^Nb9EHV&a`E@-ytmb%BA>0{^6j z0xPRcK=TZIU8tp4m6mZe2XXc`LsXNV;0D#s5>1I^!6jEm$pomIc-{7nFW)}nuwX0^ zKGAKe-;(d2ncPr+{cqfBd!DU?-a2xbqvp!mfX*ynk=(i^cc-d%)nnZ!yjCZrl?YFZ zHFqr`;~tDwzY-E$S1u3|ZvE6XJb zD99CZ1G{}*3OKfXd{MIP?-=R!hEwBF(vK{yA%dU!GP8{`+(i`1Z+~KDtMEn7&bAmN zR)#5=&g7*TX9<<`ItCDJUf;$`0=oCCxFMzmPz^eDBbNMh_YbGd`)y3yHRfp#X(Cv8 z^qYID3})ZjLBYQDUS2x1)N3n~9b(#mXr0b=OFleT3l;hP5rdB-crZtM%f4-U35mC_ z^y--4V%kO8-NJms9|1D+d0A#qF$KQzxz9qZLbd@gS2;sl9|em62s6MgW_w;QDp9Nk zCas~;T!dhA$$dEe1Sf7P1P7YJZEBdCE%RGO;H8>^b`Pr8*swkC8einNWwEv6gLIiJ zXiKg9F+}+EplN(SJSP*5enfC8woUB#XBp7T73GwF{n9OVcW(($veQz{-S^)|NSA+K z#-z%nt4r*rx~)&l2{I#>I(~Mk!TG_D@6tP1%hIZm(H};|mas(H-KP`xbpGF6Y3jS? z{MpQXTS8^V+Jwhnxn7SaysQzvK058xyvEs<-V*L}LoK`OL^H2DblY#M`x)Y4CvMj# zifO-ug)DM@I&k)KE1KzWwcidtLAIq^&+0KFN2i3t7dLi-1*nV~NuD_n6sH4cb`HVFEFg!YvT^|_UB|EJN3 zQM4@g^ODB8uj{|4qM^4*0vZ;+SbT~?>NtJXWD8(K`@@{1gwmWaH%m#HXhj@3REvM+ z$X7p784V8%yb69a1!yC%FxO}{e1CxVSL!%6{&<%+a{|-hHey2N07F}rjwuM;r1vw+ zN%i$H9D-6se=DLiNt{n8+vl-!KOKTSq97pW=WF*PNB?wSIw4RQ=Pp&Q+Qu%+hn+#T ziB~#7hjO=XW_e4eSJCWiCUNE5Yty>YZpBW&WzUn-D5qSgT1?pv;D4doe%VN=iukQ2 zr@gnOpNl(~q$C9m+n=U=(7re4!I)_5UNK^(MH>VgJHT()Z0te8ED2S{1!Ha54PRlgOC{T=ZGnhD zoN)FkB2ZmQ(4*P<(d^qzcs%9Q&xg(KY{Tn@d;=1gtdB&98a2LD=!}ms+pRDNOH@8^ zi<(w89xSc=STn&C!#664bDyt#fO&U6{rS;L=>Rp0xc98az>20=lt$${H~+$lFKPO{*U#2*$J$ZklNB#J z))$JDeCrtJHMEkQkQut9M9+v+M{)phmZr6FeNggPy~7r%mXO)89UU`%OaQ_V@3FY0 zb@YsWSenrox3u6AGOfTc4cfZ3gQihsEo;aO-*2w1ds_$>shba3ZfhiZlG+$wbir23JO$ z_zDq!^@csH-&kyt`Wh@5EYZi(^MTj4zm|KprQ91bxAdqF=^X5%288u`!=Y#V4V~;E z;{XTOaZeHYv1D9p&-(GJBICLz0KsB{xVX8q}K67_n6^;P_2ULah)+?H~U1QslT zcKf!ykURK=ccJ8Cl+apGOaDkHwyO2K-(OlEn=$1o(49dBC&DEHuOs{-g#APn=15Tmwf!>a=D(gKyHnfH3y}j<|7M1F=s#xL6 zx+u1x6Uy>3CYg}UEKBW&LDJLdAjk{h9QyO$9Nxn;F>|xeG;ErtZtn*zV3pp=T?vY!6%B72#nL31kg+hEIs?1~o#*L>|UKPPfE&s&xsiWi4L zgw5mPc(jKTU2QIHnany-BR@&p#3?tW&8x#2aq0w%H*PymOTa8f7Ub!<(#o1SN`+>) zn*vf$M-?jDRRfYor5^haHia8j zkzf`yIp4`;T;j_o8>cT~JS`_lrQEho`7KV(nscegu5xYH7jGto#R2&r2fm-b`6@@Q zkonh|7U%buntu#2m#c?<5qY$5`w{>_oTSWB56;=u{P)c-4)aZ27Z)gfwq2JsJGMe& z7kb_UOVtiXplGaXV~hd2(T2nrCH~B&>UCF5#pW*3dwP*&SD@$|&0deV`z>de%_-J* zT$|Nt99_`s>GLU|rF^T`@Lr0DV{N*uu!GS& z_yx^s8wzpSDYlMT+aU?Eed;qwydbS>NyXM|BL^cKL&qp;M{)wR$urL5y4|==$+J^fy;CAF$Sy{AmWTTSUS-SC z_cii*W#}I1kE5rIIzipBd0p%ylAb#7%h6b}Ch-zl@y(sPsB-U-1S8N1d)$*yuXOk0 zc2e}F)&(J_HJGv;eGjA1qx!wi{d+5(48NMvTkn^(3Yz%qoyLFkJ&raa(GED@p!T+3 z6sH(5XTuM>cIXFD6k_lt3(6pJR-=lOP!z@W!QWStn&r4(MvU)FBwfkXLDD?vh^Ud% z7+jFKO2nw)4D@}@N6^~4MgUxZ`IS;-i}-6OpS=DPmHDOdc~qX12-M@_CxJZdz`WN5 zYqBU-FnM=Les459d-m-=7^ux>5xza!$m;p_S<7t(E8*)% zPjvGvvt17!4Mg7Egn^tT;K{2Y+~JKzdL-kWx{1-3)p3 zE$9e*DgQ{&NZR&`8%nZ%Sn+|J#5sJBi)~xxtVW}X?S|m*mknomc(CHugGE|_l|F3lq`7L2*Y0BF z%RVuEGZI<`-njP_q0d^@C6D$W4#iNM89J}jB>u7>lUOLO@)@JSIk2|=%bEQUFf70p zwYj8N?aR*)aPuDm>g~(IQEg>uLFhR?Mk2|)WaWs0geS>udfMAdTuo7~{Q%bBt0okDnXN2cg za1cW2V%PkleQsmW^jOK0Eeou4S<4`HEA7HxY@?bIxu!RzFhw^9g%$$iM-W?_luz79 z+L`)HMP&nU-lzOCNrj;*q6bk?N zC&L_S5DVH2MAYq$^LM zqW2id%i7r){Mn{z+JLIDpP9)!gI-D-r1-o^G&CKuTGtD%zsX#Zd^OOnOX(xu?Z)w-3cgzYOGutm1kFnD1SZDDtge$TV%cw;Bx}rtf15|sh2>MRH%Ej z>tg&lU1KLPMRHrY6Ts|9KVVFDz3~oDr7Ok`#9k%T4vp;=oEK&>YgAiFv>!oAS6gI# zy~N;JnVGE?Qo01I7UO|>brDV0yiTn09s4kr#nNiWGNL%%29R;sR6@tm4N9VAmOuDI zJpILwmI*iD`^oq+k>f`j$`EAP+l)A1e3)7w6a8?1#Y~*t3^ftQK0HoOVmWf$dhoh!o zRAG$dkAIi7ohO{v%`WM?c2m*G@(G*~S$zu$_r$*VYFs;d@kYZc(@UkY4G{9o&8NpA?$f`z-7hB4^WkeT31`J_&qzG(nAa7SbF zCv;6c8`xpI(c_Mv;xFr5qQjDZZR<@==lzkNs?RoZ+Vl>~SaFC00l^Iluir5FW$c)}geflwBifV=41N1{ z6fObI`8yn|^0z9=-#DO7wJOsb+w_BOR`rPuktX>?c<))@f()vDl~=jM-_$j!dOFfN?s^%G^1b*Rah@btZdhw30TL z_iBl8)g0vjcUDWZztoAmb-b0b|LeSDX~sJ^9$rugG{WF*=6?&eeH5G zQswM!aMZxanqOgZDYhOlT7;aO1aTm9sS~A_H(RvE&DB@Kaw4R;HnmKGg@M%uwtEN* z(!J&UT9c*#(p@Wbhcx)bkMQmi&;2+H!@4+(J-jFCafjvH4B^3xz9nuxl@GQ;EgC+) z%)kFpT;%7y!F)7( z>6`V+o%D;ICqn_5&WF*V=Ww%XgB6^r$!BD76;T{Mv7_;8vq7*;qsoKiPwG(Ek5-lT z^JZU`#>7h!sirQ&hS%PiJf) zXDRe(C3%G?Cmf*pjY{^eE4WCV^t~W0qikM)-KsVcfRIy~2q89(R>1Nlo;H5GoPD2# zje>&+=Z(J;o4(!{n2zC!0oHi5R;7~(SO1BYu<^D_BDd*y)d)DllvmtyE6yhSqqj~! z`-?`>wJPsh)KSufpdLR2CrK^yg?oM5=H90E!K=l18#3WF9;YdHs!~nl-G9okpkf6F zMEgwg2BJ)BH(e7bX#bAUX9s9fzUDEz2^nd|G{Wkk#POw;3&e_MN``G&aK$?V z9hrxo??)N?TOdL$030;zWxdMkmg5b;zMU`| z6~pxx$T03sC3>wfX&|#~%Lcfw&@_`JJ2`&$Xp!?@rOF+$l_IbYXvr~Qzep&MXROc{ zA)VuDv{NvjCo1CZ$ZZgDATBX#eIjZ~6CN`BI^bA8baq1^%>=)oP;*Ghv5~H8y%QFf zJ5CJEv-=0ntK}4X`*phc z4cJff5VzunDHfbnVmwrLQm2;x%+bJ-H{e<=HCEIT9j29B({CddRA) ztL0trt8#^s7$fx;0-`a9hh8t`8x=6cX0i{RrW}NCwdl-5Z0%>AWOi;Rg04won)V3a zmR+O9fJz;_I@d3{KPt#Z$_T7Ktw&>t3Irn*G(J7G*NO zPtk3c7aPqsn;5!55MjNT((%Pc%nD4l2H{Pc?sMx#w-Vk+XRKQhRKFpX)6L*2NDAmQ zz3u%IsJ)pX4G4rKV!SQ&?su_k?+$Dz{r#5dyT2+%T?HyHHxA>Cp%?}6DBkCrG=%rM z7Z|?X&;rb?v!vyXhh$s$)?7;`XMbEIyHW=mNvc;|D{6Ax_( zblWWfPG@~W`!VF(AKug?S8jYXT{y^G;Hg1pJtgt1%-@*3P$FUzV+;aZX@C0S7J&HB ztp>BcjB7xB2!!g;LR*A4=+wK#Oue7!Fucd`CYi4GqLXZNM{p6J4D+SqhDvNt>d^z? z+r}U`zSa~<=klJNo_?~zFVqd``_MpO0_&PM;+?w9r9Kf;=(J#ywvh^*3RcADcW!>-FCP*8f5iKtBgW#uTM0*1#!);WNOLGjsPBF zdwv6vU$Q=jo?+IJfg4MTVq+~^lSOet|62ycBlt%6+&$tTrO0})LF()VEaLqxur9q! z@C^$ICB*3I{ zw!&|F;r*>pQS>+VeZ1x0O$-OUBY`uO6ZseR@vyaNRP1k}dj@6o_euJl z+6Q8C7MuCvOk=V2krgmRBke~?;bZR&=LON{nJHy=XJ1kI5iI!MUNGhk0oEj3-TgV zfmhiKd`ADWh71&=mDtCvz=|vOHfu2>-lE-Fv@+drTc1sY1@vj|po`~vzg>8F>vXYK zI-kF2@NB>2ieZKK*z>jCpRRKpSfkyj&~xHfhh9*MW@8#)^wUC~D`Y%;Lbp{T&z|1j zyG3-7)?ei_xjt+RR1WT9&4(KU6nLI7z2Nk&TNc3wuR; z(e!mDYp@?efX583O!YvHQqdho8~!V*2T{~8LY-x9dyeCj;-AAPK$|&jUn|)II+=F? z;?2G1tKu7DXC50~2UZO<;SVcAer}8}6+%YrUjMr4y8Nt90&%^X1|}(vA{>B3Y|T`R z%#9b9wPARg&R6Y!_HWw zB)OZjB*8-qcZ#De+u)@gciibElVvlcE4of6!~;CgBCV2pbbK4Xlk+L0V8JKPLSI{b zyaQZsmFi1ZSSeRY2zN%K1N1q-0E}RD*HP-xn1i(ebbrO5 zf5D8InDB-~Yf|yHemyOR>rFgVwQNxL=p>&(gzUvJ!b$xAm%Cx7SAR;IX6))uy=HdN zT|KY@Q1?g{)8zg@TY^2`c68CBT>y`i`mZ(uUQ3U)XjRb$Wj}^kURxw)*iQ7eeHsN;t0OK<+jT7o%1}QE8LWXS%6- zH!;4Sd@YNcA^us`Fu8rEPm(!dB1TY)#4jyUKT~cYSfbtsK0GKk&d;-1+RzxZfD>Aq zFtew5{E18*fs@@iz$x1M3;3w zqsbRTN}Xok!ta?h`bAr&)?toWxfsFKQr2Mi!_L|`jYEI$uZG!1ktg68d4pruUyrLW zX4ejlaZ7)7`yQpT)h0dGT>dVzuE9Wa`W?YJ^Pv!1N5k4!<+4V-&-B9zzv6f;(segq z*I?(52KD)bT-GwC@A=+FK1~lkF%FgSUZxDP!S}Db7t!Lh|<*L^bDjOKF_9cu17%RCMZC1YTgtmfID z%vzyvO&)7{#q*d}yY1{J^r&>Iv9ChFS7nQj(}pa2*2f6vO$^5#tfi|^pHcJTaz&R( z8#MQ0ac6V@RHQZ5qF`kKrV9R*H3!fXXj`A~>t=>J=*OKuVPfn#)-j{s6X(h=f>&iv ze4W4h9rU;iB?y8Mz<(#-wr=b+fu!VTx$YH${7oqec8$Yrk${bOD}v0`gcZa`svfeEdl+P_-Wbt&X2P zZg@@1f6wn#?8h|p`1d+1kRpejdTy-JeTG-@hbVwh%j**DmX|d~XaVyO6C#x(udLas z$85`q5N7flk8->*2N|y;Wsn!^ow5=Q(woUJJ`WsP2)fS zrqhVAC5cYN|D6-b{hC(0vaEq-h!~2#q$b*PU~X0x;li^3{QTtztmzHMRFM7R`vXP4 zK)vJBt^Vwn{~!a@gU}o5YPUJ5-Ng7_kO>dD=CH_rq`jz5b(Nd8DdGoT^{@g+6WCS@ zb}5+=#1!ZTJ-nUk5Y%fLYqVEQKKb3GqS!{eU@*b5dy&4F(P9nsav ziC+T4JQx?ehfJP0wVwU-oqj%Z^82p>XEX0LgxDqm#5eYZZjA-kVL%%CwHW(jHgcl( z(pjB}TBF!Na?OwT2-nSp6x-d7kC)3~{rgX=$9Cn}<7NF)mScMO;^4(~=Os;z9s@yQ zadD=Z;OShGE)agsZ%%w`Hp6Wx>70<^VU!)boHO8f^TvWO_II}CH#18ve(1uWm~e^3KBqP8UY#EGPUeE-lO8a~`Wc5uxezAR0s;&{j5s^y zU5mn<(_8%R7wy}=G4%M4sfpUyh$H2>+9Xx(9;t7{I(fbah@{LVT-rst z(cDcQwINi<@5JR4!AW7#UqLD==QkElRDlniMWd-YT4Whte_vIHAKk{_52L0ZD0`QO za?u5;#E-YJQLkSeffQgkDH#L(zP?LU$LFrJOHO@g?_`F7ZzC0?p-^%@uRo_M3~-0C zO$9C2v@|DZ7aBd2JY3{$sJqbcUaF~kT~Y{llcB$JZ|W-n@W_bY!Hagh1dgt%b3^E1 z9-z8zh1&~O%=A1#e#MvN8w~H>;aQ?PJL&ojWhk%P*;&H&G+cPrel3ZvHpMr5=oFnM z?%5qg#eEomZ-d~e+0;pTS4|?eT=_M|JyF>d@h#lUL>iI(kjD5JA%GieDKN<{9F?g5 zG;8C?P~;-Q^a#JsXkK?N_3)E%hT^tb6fP(DC|KECao>FT)b!S`EW+^Ia?Gtt$?{@* zjI=>tp?Fvb_}K#G%qPJwP*;V9kn0b;P^m|ibDOe8P7tK-ivZifbu-c5fj7d~k}2j9 zBUNo@I)yI};_K4uXgjQ*{uJbi#f{QznuJhLD|m!TJyN2Fu3v>tM}DMXUURe2Wjr#= zCd#R5PY-a2VWFjSvfg#Je^dg`YmVxlWOX9SNs}HClkdhU-PGaMr-#p`RwA$G)&X?dAlZ-w{L3OFCTWOSG1L--Ng-^ezV>E z@y}TK6{{jJc{+1Pz-M_RvTFmWMyfOv!zEw1sr@`zrGqcPyu~a)wlg49biDts%cQL= zB(D_{X7EA!)Lz1%!Z&x;_GxsMY)-lRz`2?IBS%L)8g+$b{YUPP=ogm9>0GfLI9JDm zd8&tph2q}Y2E6W1(5!q~`4(S%<;t&#fztg(we+-HL%o+0x>5xzZP;#g$cC5q=8tC^ z39@^X8bMD#@!{oW=$Yf=hJzv{`G;ZR2L(gJH9c)>b>>BP!ar}!4e5C()@Kmx^YY;1 z(7W`5BKCpfal*2579ca7;~x(`qn$pU{H+t(rYJjT!QWz;|3pl@&ivTxMHb3x4VE?f zuf8YkOt3Et=hEh$zIda_rSwL6hp?XP#;KpmRZYIALZA5KmxIz;%kM|8$O)=rDgFF^2o^JLss1wif7pBPcrO3H?>{1v{A!&+B{M&fj%izi)qDx7+oP+vkHLypH2|?&sq! zM;A63m{AdW6VuwqbL3Lm-|oC(fba>(D2y+w$^D6O|9HWF^MebtSr#Vf{(4y!OBop* z9ZG+Pi^Nh$5&Yl%umkYO33dY$odEz!Gd9!C?Vc}~l2%&l0U9>$ilylO@8^ZlL0lHli{Izh+lGg9s} z#CCGrH`blhU$2SplSR>==Nf2~18KpD)7`5uTuqG$y_7M}1oKr-cDAVO=HM4>8s$+bXU$m*@w(`Ig8+TPobFWeU zSPfjew_;cE-+G~5G$-;8wNA7Yk1`fM>yw)t4!?tSMB=jkI(jND+jQ{!&XQwF_g9U+ zs@nVKeZ282c2^EJPReQfKUHjPwAvX5GGOXI3Gfp;!t^@bjvjvQ?sXUlRGz3f>-j5Q z*F4Vpu#=Y*_F3gYg^We7?A?X3)fM)r4|AY>kXa29a#P<>rzhyUB`4qQBtlklQIXpq zJaGooMj0iOrg&U`^L}(@cBllQAkUdk$++0pa~_XkUb$mefZhj2p)62p@L1U_DIjBD zyk*|WxIg~VYtaq_Hc??91N-sO`IZI@NMGoe8S~;{NszrThD~}t@Zyk713p4_LVbS1s>MOBo?33Y~ z0rOjK2N|aIAg@fdMyw`;^_!x<QL%JqDCy5qqj$4lpsnaJ>ii0LZq zfDo}POg8l_+pBLuw>P4SI`>B|4JO7V&HQHxaOKDq(ipQ>9P!B&F2%2&rM5R{Z$V-Oh12TDn7o|ZAxFMGTS1^)`E_4DD z{*R^oTnbEcOo-CuAv1RWwmn8%+7#mHj$Ud0OONi~backZ`Wd$ePPpe^zBOWWqTmK* zm}~9+`obDyjA%kIbO}+^_^ZEjAF+|5^ih9(;gy?kJ{QJ(j3Mj#JL^!7c=Y}Axv&QD zmlxW?`8?etU_AV*KM}qeHpXGqTKUcNUtj2p59d?kOgHeS|Nb{lc7p_Zq^NuBe|;gB zE}YMm%Cik3e|IaPAdv)*-X_Tt{=dF(0`oPu(5dkl4)Q8Bf!63XkjRd>_Zn|MG?~vI zC<6@vWNgNu6Q^4E>`{hFim&tQoc^QzO{uB~gMMIc8CMDBuk<)0U(e?w1721@@I^)5PqK2dK^Oj?kBPW8{vpyg*7XdohvVgr#1$0Ae zRkBnd(?eTfat^J$S|yDTW+2N&o+T7&SK1waNmvxZ;$}%t7y1?Rh`Oy;ShvTseazH zj*Yde`m2*|F}G__X`hz)uikma2nvU9;$43GOAiL8){W9p#Iiq5rgntO@l!wZXL8z1;aR8~Y(t{}pD!ej3c9mY22dsWe9Nl-7d3GT_EPdBOy<_8k1 zM=R*mZ;qT}+hbnq)v@6}T!fF?oQA3aI^!Tx>98a9xf0OjEZ-au9pQL#4(%y9=68uG z?TZEWD}axPrxtXojpSuxX0k=Jmk%dlE?bE`?81S1UR`KKz_X2hib_4v1tXXDA4?hv zB!*=D+JH3MxxehSKOH?E`l))#5JYPE35B;F1o!NX-F7-nAXL=hJeaCj`+@&Fa;VI^ z#TArn?c^d^BTWZE&AMrRsMscWyZY9*EiHatAD^>00w*L)3U8L>vm&{CEcXQyx!SS# zjPHayFWd(*Jh8&->{w#iY1g<6E-Ya^^v!O2Ul2;hMt^?Sc0F|I_FH#?o6YR;M}Vb> zH_9I6V@gf(Hh@RL)CIKR`|IAcZ=39;k9HMb4T7aV^WVPXW!%&*B~vfWGKE za#uM<5>yY#NKr!>@s`^Lkaq4?rBhm$(1+Us?2jfAkg!a)fDm@0U7@YO-U zv_m!bMM3xa$#$hV;qjB?deEjO7tUz6Zp0TrFlctYTBZ#^aHE%@^>h$7^;>2LCH5Jz zE>|otR8F1%_CpkiH~-vrM2nS-zrZ(cjptVdVpd6IQzPs$D^N+Gj<+>#doI~We-}=$ z^mIrk6S3l2UbDJHnCMM_pIvkE)4h_{^T6RyNZCMW5G?k!A9J@qLJXPANmfhJo2A8| zZqvU3?GiaK{FD2vi`PQK5Mx-pLEJKZX!8M2-AmHcINUgVL!2B(i;k5SPE2L zj(fiVXW0Z|r%%*7S9JjrvH?o(0~ZxH53cnkxX$M^zw-8XOWx{tU(;g}EjLEyZcgiEgk z#MIqS{Af{gS(G9l+XxFJlEFj0=DZf{%?U$pK2KYx?DJjlZSvyRXI^jTBYF~NE0UO% z;?2fBbR@=Q(La8BhrDOmEG5@0h!C;nh>rs?>3-qMzvT7Pm8cJcj6K-~j#oj{{M}}& zpfWh87`FSKBr~9NPwZ*r=PlNb4RW(xC6w-EiZHjzT?*RN!}&|*R#@NAL^?H&0e7an z-(I~V*BHgIsAU62>i)Ln!u_|h>M*>1;eJmp=U}T`k%+Xs#%!E|ATqCW8FMhjGNC;}8n?!_N=CtJxkO&mYN(yI(BX>Xf&KU*5hN5PGVW$2&SmYikN< zXW^F^q-A5dOoSTDY*M3j?fgd$y-hJLVPP^=kF9$zKM05@5ED--zOHwMSd1IUf@1}p^{uT_8w=>|y=-W@-4h0FNONa;(bty@orou zGnkD1O)#b5*4QhU;!WPa^XSjby!f9tb7Q#e;fTL8XWR+pB^#8cDEdEM6QEG$oK4?d ztU6o~ECmwXGyoW*Kq!15rO{WteGk4&6yPQzx<5tY{)&s*RUzpt5z#IB$6sh-JglSM zyv(p;3a6-yxQ^K?I94v`(`($nZ zE!VkCF!woU(U@HO*4Dg-eU5-?Zci!Z7X1m3*HG`Kh?I5N zE5|7ep{dhevVE|@D3?IHZ7h1AD)*mI-p+5J;%Dn8(?ISYMukFwk62>r6jTeYd^hKr>|H(+#mm+HmQ;s;Cd zOKX#Rhv3U++>}1l#A_!AO+Cre7a*cOcgG`k=NTD??Z_2H-~9~YwUk6NeZ^j~h5KJn z7r&1vfyxG}(I2b3ZWMB`&G#duB2@YS7j%)q>V9p#CVBe$9-Q*Mh}QJryXKCXQ?-T5 zGA@^eTyofTOWI?t&?Vyv+&U3h64z#$wEMU)-^+~NuP z`fvVfX1O&8BpNDQH-!xP3}?5cA0W(!O|?rMzJ%cJa!u6)wpcKHKH!P_3it!4fvG+cMh=f%t*T|3-61YEmo-WXsr z`48q&gg1x7VB#sTM=i!Z3gN1h(hM%(={mc`yh(jfz3P^%#fO+mmLo$f2epR8U6#NG zdr>t<=GYPQ-!_t5`xE8!*T-cNxVFpt2=sc=hy5J~D~8GbSWzxKp2o&Dgk zNCk7cWMGx^sy-0!Q)d^!-xpG*xIIm~+X+PVHu65i??=xky25tzi)zsJTk;*(rjQV) zJO#yZCeomJd+=eUz4^ZJZFe|5Ru#*>{-+jzGytW9@u<{hW+tw`wQ>N+^eH?^} zOJh+xuf>Y+OP&)>^7&a-$<-S$;ki6N?P=^?i*68fm?Mgik>UTMk1jJp%cfjjM|X)! z*5Y7rHu^ISHRavoU!V9<|68JDeK=i6U--~f%PE}ytG=f+V+488`801zlg6gS(aX<9 zF^Zp0O3Mh5<`(DP07q_Cv|nsL+p3=Maz|PK!q_^yYq%x!<0kxTd86zd74ETZl3hOz zO0PpjgZ+AQ>vK$bKt5pmB(bvHja~`*rZG2j|w#nU~;HdNw4a4$Slc$UT z&1RmOSW;UQFgkW_d_~JGR^_(}uzhP@0fU>Hcr1mjM>JXwG#)}5r7e9I>~3Lk=~f51U#{Xe-jfpB$XBfnJX{4>c#FQ+ z@?(LknpqEk=fc0;twx=R{W;ZvKc~8vT)#0=4yP*lpQl>*-%hm> zB0W{u=)qTZm$*m>nO`&{D%$zW2^WKCMfF8a_KYlI38dO=4#{yyZsNIU5maRH$-k>^ z@?d&T-rL536xMq8oj8g~n3Wus79rk_2JQFZ+nm&4z_R-BydTON0U}#OMDgvkD(L=hR}87WRF-F&m#4zR=g5N!oV#Zj4J-AB@?M%`I=%f`*?&nRJLM!S#Q>X zSs@6bCAS)Pm2fVFT_9h}%X*;1wf95TzH`KFrOVP~dui5W;6AsgU)w#5;>LG$vqrV8eHmp} z!OY%_HFMJxb%yrMd%@Yko_JrrW+9+eOSymO!sDT3TggplJXf5o_0{=iC6r zhUI5he>m@et!w}Y<3x<-uPdf^K^CFj0mCZGb-A!l!1q)b5VccYr-ufrTJVtTCjJ`T z9dS;&^HwZ1A()cyrt;ZjNGxQuBVBPNNpXJ*Gd{Uz)a+0$G)Oo*UAbImcoii2-FBAD z38KlhWyVS@!J*!GcEX{V^WohMl^+KbqZXfyOmrlx&M3$6w(!bKK{dp#)}Jo40XgT5 z0fuGdoUQRoToFq!Tp3mGxq>^l`(y0`C*3cS-{x?(tuBmO%p_P9>}>BM6iP5(L`(A} z9r8H;^N7B1R+m*_!Y|QEtEgPD0dRDn8iBt@aexPGEJbyrLz z3__k*!drm%bI$rCY-(P^2ZKM1Yf}-*jQ}^YHpy_9@8o(?i_4bSEEz&0OcmD>$EOVV zF4lnp^9Z9#=>nCaW!>IIw))zT$g8CADOOc314p>RFexB=Jt97C-gP zduaC-%D#TY!asgCSgi0_&xWxRRNP!=q~7PAo*W83g%7L2F07X-ozs%*iu7@@a^WGb zxZ4$*uUriGVOC%05Ic9txK)ek6AJI2`%q~5?3e;?@cYXs1FOaLjV4aNr}9uonSx>@ zxmEiPIEMxRqGg%D%tSPxFtnaC5!vcgFWQC5x}mf9@Kr@fZX1p?Q`~sBuSJK0!k=qb zIq-cetm{1Dm`od<`y(^El2!C*?PUd2z@Drw2vseoDJNO)Jh)mJ%==Ua$ZDmco|`{u zwmwb+iYOAv=}ftU#H8`rb&5dMZc|Byrr@>{cth$<;vlotR49d(RX}3bnU~XZ?BSbgDBWEsoW=8) zwTn5(y1CC1(92pieGHOvynBF5tUy6WA`q$Ec9uKaAWw%rR8k3kbR`74!?~|p~%`R9CsrX&n`*I81w8D(c`U%IlnK$NAMCQqkT}DEqu=%RlZ%kQlO$+ zA?}xZ1%PZw{4%PjN$B&IsAR&Il4?2w2%>hAL40ZPPSmVoW4K+k#)Y??Z$N&fng}J> z2RvcqZku?s01T zg{U37Rk)xy(GiWMdJ*VY(NHXhA7tdZo+nNVSn<6ld+FCXfRllWNWXTqt zF*B(mpiH?awaxKl8T+9-S&w1<@&wqeNPiOMLDN!RJ@Cge3|M_m&A~*nDb=tpZNuAp z{<=7Ad)?bBBpNA;?Y#TP9H3NHvG+T0Ce9eN*$jAHH;m1J4Co0Oe6^&w2f~a`&%Gt9 zU=IP4l2`#t@)MW`J8MwmxpCkT`K+Dd?ai{eYY5b}knQ-AD_`B zD$D_%U%EQ%iVDB=IEi2bU1L#ZEQfd3+}k?>plAj_ftFbnnO-?`w1Vh*1qU&5E>qplCyh-w^b<>&kd76jMKvrRtmea=W-Y zVGltmB&OJ#SCo>v?gdF94GdoPixURGcC#BaPk)^ZwpP-{Pi6^G}m(qj%+ zc+4IQBg^%yL|g8%S@}~Yu3L6vxf>t_n|vNAF{)#^BE>xJD~YxLs06f7q&P|Tmtv+6J*W73WPL5n%~en+PNW0 zX)w#MUN<8@2VEiitn5qFh3t{lyR8qm`g;lgoT@*qsX$5H+CizZ{mRdL1ENMH+cs7f z`E>7S6V@AjXkMls`uRHuc*l}3T**E-K?hTu1;@#eZ?Wax;l(E+66TXOJ9w!gl`=bn zq4_s3!`%G3XuS%;o)&z#*a5^grrj~Gs=|OB$pctZZi4Q#MN1%0N0F6I2fgKwmjotX zB%X3=7s!HzzKK8rkD#;M`|Rww3!%i<<>~<&Y|&zb>7zi}p6I@fFLT38qN9AJjEz)r zKbx0W7;1bbV5fsU&Ce!65F!t>yq3w+y~&W7I1|8&-tOz-!~qoGEq8wOWUwIB5HJGY zdynmgn$62PRYp&DGXnTOPqoAVFvveH=fr|P}-&Y-GaK)7a zB!!h)@e1vx)$O_WhKkL*eo-|tk1OZ^>T>rfVM zkmp1*%Fwu`l5*+~FR47FB$mR@*g_&0voFnr;bLV?c+-rs8WL`uzo+e!X6iFds~n;e zw&FR0-=D|SkIJCa-34SPpTW-g1bizml}~bH++QTR7tLZ0EYrIVHlQ9xZQRRmVApW{ zqvBqmm>ec5`mE?;%hD~>Y!}Kh&0Yhs)b1-0t^1Xphg3t)veke9jW*IxeIs&97_#I% za8eezB0{y#ZfSqh<4bNm@vnUi;^tE~UR(~V%R5p-VRG3bR}Quf>} zpQ~FY7j7M+7Q@cBCN;`airRr0ruD93T2ao_6;Pkk4k! z%!srIz5xucEqpplSX-5j)&7rH#Y`2^-P%2 zra>t`zx;on}Uv^-JJ1H!xHt7{`aL3V#O0P#p!Nz0dW3GQbH zom~>Ad|O6V(DxJg+=5QUMs)-T`_u?DM7X5HC^ZmVdGOS;;g*l?NJ)Mlj44%}%T$<5 z77nTolcBS~T+LqdH$(;kst=<7m~!<>&0rXvU*487AVT=4M(dE~ znAC^T2fOB8v#D73g#h$t?)ysIE%4#;tipA7oshLi#8J+WR8RaFObjRg#|Yg|y2>j_ z$AG;h+@T-F&9YJQWRL^1+v{iP|UFA+~yMLO)HD&41be6XOT%1+gEP^XxbuYs<83$OldL#^@kGQLN3^p>txZT51(7mB04^ zg1z9c4w`!|km4V}P>g2gVotiXFL&!*i7lDyhdI0gOTrjnZc3o9U;?<~;8edd2IuAi zDb&fnsg8p2%5dq7?Sqf8+9(L#HXs_IJe8q#eArC$lI`JVx|cYFPD`_pv66b)D^P}# z5Y;Qw)$fKLbuRz0@%z*MCb+f4W@DA;fu?O7T93zH1*`^n=10nl$=YmwgJb6NBJG@5 zkq}=sIxtd!@{mM6yvM(g<6ch>y?_kH@Ux9?3$s^5i$hF4ARqL0Hq=|)P6Q~4Gfi^{ z+sy;s)I68HIJu&PfBx&;2 z#}Z*~S_HQmOxuP+MNuayU-E0Zg(2Gb@C_q%-!XKlmu_?k3>rgnD&R=x4e*)q5R5&F)#6nKGgIk--dQaPBK;Mhyopy}e4SX#yj zB?Uu6=@_y0ReHmQJ1P26b$|^&Q0--y4s^w>W%$td8U20k$7}q_0fm|7Sfd?MBd?|s zNq^$xnypv6cWL_wv|s=Y&&4$)M>HRs=a3N}nTMf!iG5Upgfiz0=oqPeNU;khaS!Va z2JtnZv_MrQXgyY`hdNy5s|zDWQdA`z>aYuBYh_6KzxBhGe*h)4YaV{=v9npWRN{fx zuNQTG?@`*1fuph&C=Mb`)*=FE|9aZ*xh8zPcwYh?{7jcJz`$eur-1p%N;W48kM*#o zdX}bPYaEX`@kMFH^C$N5)!BBT25u2w`I>B!CYoM^wyp78Y7;?bamM(!g=An1!-2{O z0(zKh)P~tf?SaTrjbn7mnV`-QOOK;BOp6Od~oYNu5=o=Y=u$1e0+`{l7? zvNI12{aDep!LH9}Y$BpJ*NIL_X$#zCQ@7l;XrQc;T5*e$4|@xn8h}w-$-$K$UA|*e z)V}^GdT!ul<#j5yxy{j7(?k}5SDrWKLhodwcitmZ-;Y{HQgm!cL=(^@%Y;NU@uX{$ zNJPW$Y93~ebq(*8Gvh}#7(9{v6Rvw&>9%KSakC|kVL@@K%TNnR?A32D$9h~ZiE|Q# zi2LWf$BupeQb}gpXr4-SHWyh_Yvy7lG?jzTjWbBUC0f4!@TF&}{IcYYZcUzdz2Vp8 z8!z$=KfZIThk^nZ*M%k2l7Fpyl>O05v9XitnOp# z^ffkF~z^NQOn|fu6cyVTkz=t3Ymmn!xahPp4aLE zd*d%9BH-cqA{M$z-$4xal^t--oF>BBqcKU8m|-GG+pa8$=TZ<>Ivut9Qf_dGaI)~Z zB>PtWWP^wBuhLgch08DH$4JvzU{zGe7+)m5@`wM2o9a!AgmAc18O$oUsd{4WWN3Ky zOkhSzU3+Uc85xENkcoqX<+(Crnj)455kh#7hQu@1jmc7Fwaf2JtBjkJo;6w~$YY`+ zGb9y8%2o=UA6SiNeDv7gK!zh^xRw}{pV=gb-TTbf+0La%i2+y%Vip`!Pn>Jms^rG} z&-DS*9Ymzhz4cQze<{>grh(>O6b$#_8dAVPd@?8%KVL=f3OtQueIy4^_?4JwIZ3a_ zWk(xLU;Sd*E0y@C-?cM(U8=wA0~J}M+C!pb;}TMK@E~yvwS|8KC6Z^xB!Uf0TN{s-tUj_FL$QCz}WlqL4!# zyj5yFQieDiwi=i?;T^j`;1yP_{XM4LHwo&u1#RKbci{j1@lB;9GHVo28EH=Fd9W1* zmBtQ{Ky$1-x#~rCNp-`pD@bj_N9_FGpK8BYf*tWoF*>X*!ATfsicjytIS$r_Ov{eG=**!)QQwcX-m}vn{&hQ*CMqA)*}Kw=v-FJ((AS`2^-*f!_|{v8feVnX2L;G$d@Q{T7I7{ zWWnFXDf=fh>l9{8+{z=s9H1jQ5!!-4A_iS!14Za3JLETY&30DuphaX+I?9=Rku5vu z-k8UxqJ>^U$#j$sM-AtT3(Ban#N*)`g7!1bVKCOxvcx_QUY__XJ_8+* zF$Ui^clQ$qFEHJHcNQVfieCIy-WuS?^!|= z`mj|waP<4)%zZ~ns#4!8_e~5K_TsjeFeF{u{NcxK1u=rdVlT%xNhlF3I%qDheXx;(=3zILl3?dccOOjbDJaPic?x<% zd=!T5U}kuXdCRPw&H|e{S>GLhx&7nIIf@rRcy0I(csLi>Y^^+F|H5xdb3?{E%hdRw z86{Pn?JoPShMUqSo}|5Pe!eSR(CvyRK)9wMZ{>4;)2BaHIIHbhbO4cXOX8%2MBA&J zEtxKc;JepI3r|G0+9kN$w#?XQ-s1{W`u<)&0ntj+O0-d3IqiSpLCa>Jsp3!VOG;q~ zy@_!=qVE@NP-Gdq@BSp!7wcmIwL`o3Myudzt)(nv=%JUr^6ytKLpQ4-Et7$J^LJl< z(vXSiSf!^O#6eBqR}}VG)Y9s~X}Dez;pVK#QY;+m^Ss*);Y~CB+;*v=J66xZV|PD^ zkK5Z)Ne}ufJ zjwFkW^swwy#JprX>CZv_XkuajIPu~e+Xg7{D)u=d>}_ySHg`bgM+Q0e#IxLIeW=LV zDa`cU=C5y>Bni9kXyU;{Y9x$5wt>05I}>BHA+-6rx2F*;YY2~X*wear=axHtU?tOt z+5YFmcr1Oj4MFVlMA_?@)_8MGF9*XUvTSE!O@EC{oQ06rJWq%Oik?WHpaL(Cvjrt| zSgPk#(3Jup_Vz2We?lpV%SRVqdBZ`vq%HJ*wb;4*#Y9sOuakfY(AJkxt^Bj~P%kvO z*_)S~5?g3Fpb9ftMo#Q$<~<*RWWjn`DQ(hXTI2r5X&(J^n%4dKAxor>K|nw#yK;JA z!jCoz=9y%_zrdapNE6ZZINt74@#7A%V7{#`u$YzKj6>pHJb#_~or?&+(t>f+OV&l)4T<7O2lFJqYdrHh9 z=*Ee+dS8XI?GjNvqmN|vp{#4yxkS&y9mflbwC3nA1{pc%24qR$Q`~$9vG(B79g%L$ zhZGC4F9jHdF)*sI!dY{A!y@u1Y9&6&6@^*Y&YA|?`Zz{1o@*w0nlT0k4;E&YTIVHEx;iz# zjVF_x6=sZdHxOcGuQ+;LH#s0v;P_*gj3%Urdh?Q5&zDI4DHze04I!q zh>1LW9$dnYJ#J65*+K0f6=wXBgDw_?iO+X_O0kS} z-r*bbh;U}IheYV6&qoQrfD?IMWK{I&s{O%!&Xb>bNVsPV9kH3Uvj^lcJFMCjF6+k5 zZBEc5wB4j3UO#i%>Toz3J(laB^=0~Y)vQQi0-V#y!XlkK``EfBU?sQ0}9Jp3N z`VZ5s!93=jlFzz7ht0Men3eWLU%-Vi`j|6Glm4!`PAlj4;2yaDfDM@`Y}HaEE!$YJ z^!J7ral{LsjMdnQ^&^8iBiDNslSJZET-{_h?u6EwTVk~LTIWjo=&(+K0XFqUpHC*{ z;8!r>NlObN16L50)ekmVb3>;GA>J^!Gc0iwK64lMi1`CripOSj_-J=bH|<}ZooG!7 zTzf^nNgFPfl(&li3p*xRnRe_OY~rF4scjcApuZb z|B}16v|D`XsCjY(?cPvBxWh-N_05AH;jkw8&1%ASX{_ZI-?F@M zsh=9Nlq2W)BoiWq>i;0aPKrgh#Mb&{=wbY&{sQxSM3$)8=BFH4s%+Rb{s?ET0I@GX z;a#-Db({M4 zVEFPe0Sy+NrMKCJE|v}Zp_)s)I~tF^T<@ZU*I$($mHj{|fGoxO7BeSSG#6xM*84^^ z()Kpj<)T9KOqxmxOr2hy!4xNhh1E>Uv6HsdgkhN%Zc#51zIl3`uq^gwag%6)<{8LJ z-^(FUSO6Qph+Skg!lV=Ab@i9f9qn)E0|A3g-TJ#QbCwT;=4Q5#R^GUO=764P**2^jYKQfLEeAE?JI)mRj(+@OwePl}n-E=u_Z1hz6pv-ocifnmrF1zq zt0EoeaIco^*iXETkl};1%H8xM^NQPFeO7g?8}qWWh;NG;L^@j>mJ7LtRk<1gK2 z)wt)gK@vXB2{ROIkIQ#gipNl@{c|So;OnIfk53c)Rx?WNN70Qe+p78vc?NJ$)t-zc zvrI;+zjnjk@;_oE_0heQ(>MKzq0QWC;Li9u&eTz0+kKfzekM%Z62@E0=#W&kL!D-{ zs=$&S3A~7*+>(x&A(U`3w|MlN-Bvb7S1n18A#1qX86c#JEtzYEMZbR?9 zqjKFcS=qhhoq`FW?Jch%4gRbO4FZdN#Gh zrY}d3s#lQT%~|Qlv#(RKzk3iXsj!EARyA zVICg}u^LIg<>TJ2?U$J@$X9Ckb|IXy4LgecL{FRcux)kfs}jHXFJaLin?`nLz_8Pa z`WS`;zpKL`XtN(xcsD)1FuUIelj|0QJpw>-dj23#pTZ8H0BefNKH zDPjjmiSfi<4DT7dts&gmEkBtm=RHJ^no>=fVm?2Fk7jf4{nj zf(rAG%{$npHgIr^Ei+N6$MQ^R>{3 zpdjJ9YT%_9ci6}qPrMj2p2!YF{9CGY%kd|rV*%A-eX9(MBM#M6TUGN>Fq-T-WxC`i zd9o;-63wX7%6qI2*=<(xl}65{qH4-i(=204c8=r1iV~!?jb0z`3@O9RChMU_HWT=t zH-!RF@^0_ zAlZPT>DuJz7u2U9_l!}s#Vfl@=YkgHVJUP(f$Q*IHF1onsn62krZSs;zWPN3j!qVG z=leFL{F~`gIx+tG*ZB%G$3$x|JR9O+djTs?%6qofl0RbVtdAE~MHaX0?56FPq({26 z&_WvTwg1wa-Ey?VrUkI_EX^X-BAYS#NLCFVl>3;0l0e}n5T)c#oRZY7Ybh&#gY)>Q zI)3Vg)`E;@RJ2LmSCY7A)IK`%k43)&YE0p0#7pCBzOU{FD{-{OC&LEK0u|?^J?$m| zsM-aw^8lCj`rW6KAXS5JLyT+WZZ`eqHhYdhnq6521S{k+!bEg-WJ(zjazNXILny;nXP3MBcd9cML0T6lXf zF^|vKFRYBX8#k||Pw$`SYyUA@-vTFX}Q%FWK_G0|wu z3{;exR?;-g^lrZ`(~=2&rrj^1iJ1|VAzONX+pSBwPnqBfoAL#k>jKm?1R_gfS9rGQ z&D(Em6ZrqwRipx;zYHOZPvtZMJzQ5>7<)Z9S4(q)TQ-`>p%C{psVXL;uUOn|#3^ZD z(=;ox@0%8LvE%%8lDaBmUfef5-ZSZ|UQzC4qIC98+X4$Mp#GTsmP$aK3B6Xf<#mL? zzu!$_WKFV)caWA>>6&qxcKM=@wCMV~;hyyJ0-+44it@+8SX}q$21BIRnNZGD!n?jZ zg@(tp8}&63Q<1X<6dXoaSTAl6wH0P8daUpFJ!Rfe`AL(EVPCT1>H(N*-Bv7ycHnwe6%t=p6^2Efk*uUamN zl9L|r{z_YVu=2UF>=fWtqY|7j=F7xJ-n89KTh^pUTArN^x^kHMLg@>#Pxvl#E~5Ar z>!3(ccj9tjx*?UBZSrRQ+N!x}Lip-I*{vIn&bLV#@qEZ^C-b{A57;@IlvtT-TLfM_ z?EGHzozyl(VZeT-cf0p^rrc!1Vzu9SIi~lp!8N@&f`t~%Dv->Mfsu_>;nHiz@b&#S zmlrxFT!}_f+I5-k)~QCfIm@x;d~cVxrZeyfNP5&YS-;K9{p?v|6C|DHJnW%S8B|+v zINNk{>9U5A%3m(w&W)8#TqErKLKsZc`%QmV{%RU} zH>^Np#nRRFx5Yk+U@!bK!3!}FOfuG}kg|tR zEMF7?WbMvd3;lB4F2yYw98Am%ye+TCIdk%o4OrjCjJx}phOlnn3^++QgwSH5gp2pX zD1v>&c=6ZbV_Hn3E2hJ}qaDvJ#H`FtJQT^an-`$VUO(@$9sh;1z`#!5FD6K8Ib#@Y zQ9SH1vOLqiy|X{jCH}1(Uzu%v9Is?uYe5RJVa$>F6He6PtmcM8kMT-W((XY;%I(m0 zfjRhBGB8MQ%LJeoPXwn*NP-EdTeCE}l$18`1(K60c_hW?R4}p=3yBqF3br>h{Rpfk z)nhL#bH-Nb58FrF#~CtCJR|(}=_r37+s{F2uV@SFrY;x6&2$zX26Zk~GpRzVrIl;> z2@v1Tps0sCuvf=qF=o&bf4%LkyZ)eod+8lco{xM-LqUnMg{Iu@pY>DT4|^P5>}(ug*b~XKYA#pr%cC#_5+WGgtDqE}OmV}x>`jd6= z>*fyw@`mKsfz(ZfLt8omj?pZ%O5s3I7mT_jPN#c!b^2&SWdR&hK}{q$;m$TNCKpdn0#mI)ARVKza( z_zoyr+C*Cldxy%MleW<5RTZ1-7ReGlemGT&D?dV%g#@vN57@VpMVd16A{t}u%Y*46 z3v#S(VGpyay%*%mF3wdBF0L-%_^H#SA9l6yOWdY_Q>ENWWlBg-X+yJe)w@#BHol=~ zP0C}zj|@k89XnUeQ4iEy=Q<9jK1jUx13asb5T?Gc#l8xmRomM*lg`R9$#gU%Q18$6 zwkaS}4>PWTu?u|qu58MyNmfG%rlX&sRBdTBR%>RPcG_AxemeDoo0H7yCoSGfnyc3i zK5Bms-Q>D*IQ2>h?VLDN6eM(aPM<|9PtweuMdw=c&&T zwhAuk->&&TD-}^)2j}`Z`kbSyP7a!K-=kFK2Fl~&J-=(g#yCqI7Z5wIwI@&C_Rm_; z+yy^rTic{@gLyENOGxO>M(nH6R>Con(GrU3>G_Vf>G>Y#`p@EaJNu^E4V@l*W)It_ zPz#-#*-_U6qqLhG=0aPGbuD-ntj&;VN*C;{i^t{G<(#8En^|?+ael9^@ZW$3lt zlp68cVkyTb$-LW`xG}KTD?2D zEARgKQF+>1)YF)O`x0iAPW`rtDyzL2#~i`>QjV|CyJN98O^f*N6nxq4TP+h@tX41v z`eV{3{cclaF-zD-zxxtn9C;CbnzxRLoc-+1cG2U}Y;x^7B@NIuc1){$_hI`-c9oFU z&eDLzx82O=ryT?hOX3F7c#d0KraM9@s0oy-#R1#SBWUI)duz05qq-&hxTc)IknnXN zhJ-}%?4s#6SnoGoDQ)7L<`y~rk$ZGz5`Xsic!d}S)(`Qf8>1sf2}jJ6Om^tXx-Ywl z2)*JP%+R;@a@`9%qyif_^l+%^i;!i73Ng*h6`SM|vS1s>__th@529Hgb>=oDdBsn@ zfdxA??B1_>t7fw5sb-fnC<7PHR;izP%||X=XT~UC^~<<)>r0?H`}+(E<)~EmZW3jF zJ3Fnz)1MYw$Mulco2 z9q3C)GyTTzns`<@{->a-So-l^(+rAjE&Zqj41P`*tOTw zjlt2{>#`&VnBPxtRz1-7u&?kq+Ag=V2)P-|SsYg6@0>_6ZIaueX6C~AWBFUjjWNb~ z^drCh9Gn$*r;BNmWHDA1k5kYO`fE;lCac0Vd>LX@C0XURetEP1OlQO}#~()|;2qn@eE`< z&v*1H@3~sI^?e)Zh3!?YtDz$N#&7Q^51GZ=s^TMy9_zR+>se%N&ujYb7tjnYb6rVH zl>CjnO#C8?Pm7?~!F_lnrGPFn8#}bw9sfx7n>w*5s$;H`#=!;Oy}e{*j0OTfI(ME0 zhm;(8yJ+^tA$Rlcph?{iHCuxbd1*`VHv(H|8s& z$2F5}c%1KC?Z1TVyv{KvIx#{_f1=DPV_xs^vnTou!Q{zyymIQ;xX27XzJQ0X&iBlX zyk|cbgzjhQR5>NOPH->yjqLsqW&6>`94uDiO`y?{8JF)j56msGVBwRwwMQwxN>M4D zIdpyyAn`loHGHwa)E6x-je6kpN-P);0;Ck?L`&1JNPrX&^=;OWPb4#MCp98B6}!Ss zsmFI{g44YiJyj7i-#vEEv4Gw^hO(DlzI}0J$=p=H1=HFq%J)<{;n|-6DK>xN5w$p% zxnUdxNU)1mG71AzzrBa@%mW!{0xU(wQR$HG zlJ4%>=X5>yv!C&f{qDW*`|~>nAJ!VnHE~|&bsl~Ef7M3$M$HFH1qlSO%qcM-LHi~C z#jLkrR^lPu7nr!G_OBL&?2#qgTABvfHZ_)tf3;OAEKIFXuOBEjf%;teerx98Z(;LE zvFr?5z~*2sYuk8!wY-_Z_t{>4y^zG-bEPSCfIHZJqISO-MK3M^L_@1GHiG9>a{{h> zHE7e9<)N=#K{@Nc~m5 zUg+;(dC6puiGCp+|MkI zj{n>THBRWwR1KL68V@LZdv+$>*uf^?!ZAWr%*%InAfj1Ph`YB1DI zp)V^&Ud{1UTOmK9gk0xwnRbRMpEGvURMn#*{YIg|!@`M4;lU4osG8jY$FQCApNnrQX1LM4 zbLcSS-&wcK?cYf;T>y%t+_dh{_}`T*R2nTzps_7C-VYve#il`H-;o=8zn!Q18np(ZS0RMm+TC zUQN{CM#=QUBr>=-K$@(-SP8^I6@DN*hP*CE;hva#a;33zg z_ohQHtJ(!Hdk{2N+*#>ZOCc|eN>mDh`1LzXVcfF^?Ja~+zvFL1}fVY%Dy^w>N3Wapot+_jlD6I zuR}92`+I)J{Ul~HqgOzZ)czTXle8n>L6k2)(Qrq03ilm{O7l3e@I{w*Hfb0bBN6vE z-pXobi0Qtn_a2nzCViKX1Q-14J6^N=21G{~6~jH$(lZV!j9L@)PK67!ia#0YRYbq} z(a{aWDlDw^nkBg=h*CCZm~lH`tUua4x`tW1YG>^OoZ*{h#{LJjVpsDV6mMO8uAR%} z+>bMYFLEMaFCRv3uxia`e<;?j>8U7tXa+~q9V^(NXK>DPmM8xQlZi)n=P&%m(66TMx!SN`^`Xr_rCVZm@DQ&q3o&@LmuX=AS4U2S#XSz zN1idsgFE@-Cfc|y(53ACv~cgfZI?0Rrc4}SVf3WAoK>@Gv|MplqD*^`CBnU1vgLXW znNZj?+vu2)U(ZrVgV$6tqz}|Y2~U1U3Ch;m0CGk=6NjhBD3AV*sHduzlbdjXc58$; z_dl^2*8rL1a(5Sxz7M0df(;`gnXd>u-NNQc;_&oIzjU2*Y0Zh&OUJuQ%hIQ+LLPoI zh#mtHX>slFOB5Q~vR@ZrzME4r5!M1O4GhJ$3?90A)2XgOLs(A9V;kBWr5J6kmq!A} zp`OhTo%?urPFsio8wfHSG;@`k<|iTuxf(ZiV|_8VU$1&rN_LABpmDu=6;Au}NCyj4 z4cj&8B0+H--nC3koLf(rrNokH>*q&oO%HCqDEeglLl8QP{|xw%C(pIS&vxdGc0I<` z7rGDW$*g|m$QNlZlMMLOT9Xgegm4LuH8g5mo4J=EHQoH*C->e!pX&>YFpydR8UQm! zlKEoru3ULX)X7;y>&wg0j!eX0!Dpa$@T7TvcI4LoHx*x-+tkw948Jzmjqc+7d#_kx zx(OP1Y={5XMx#P z&ih3%#wAGz)n`1&6{P>!{gBNs&k==qi>kkgF>h8 z@h~%9rz|Z$OwphIk$^2l@wu%TcE#T_0g&QU>N?&sUsRXVJO3XHqBy*S$Twt+i;&`% z;{4DDhUs~zX+mv?1Da+Le0$QJp^La33b9tUI~4p!vQ?R_(LzFYbw>8zUm}qnic{N+ z&e3|vjbnl=_iv&KmU!KCrLc+$14n777A2C7t~?mEioTv~%*nI7b4`CCc*|1(lGY0M z5z|V&I+;pIQ|(YHiAO2u(`rBF5i}S4n-HOOe}xial$J(KEO?5xq#uKeXWtO!=@WRfe}qQ5ZDD*~GKcNC zIc73bV>2S`{{As@=JoweBj!q4q<&1of{&Vy#>EG8->q^%VZGa%5gm8O;TiX{@95Ca zc)xk@et71!G>J7<`CgvnG(F++ASB?oj2bObX0&M;l4Na_BJ~sp3$1T$(ByoaKY4NP zDx-<&qr17;k*gEkI^Jp($*q(DXBDg{qzUg-0^QX0pXkE=G~@`FNH_%?2UqTsy@L#x zs8d{!c)!Q@R+xa1-aSGv+Ts-qb_X4UOK|21L5 z$E-b)A94=t-icOQA^YmY>3HTF*bsjm^21qbv9POxk~v#Wsb;k5b&V>!@*KxfN7fr* zzNIhtdX*KvP(fVj8nWq9c0${Z=7#kSPp|P_FL*G0$LGX_+}fxAjGojv&r9&OGWOil zb`W`Wo7I%>%+WicaP?!(W9L3c0eE}F;Z~JrozQY@cezX$dV>RmWTKib*4*k>BZybb zmoZkTBl!Ro`z&vEq&A-fIn0p5@e2?w; zCJf0_v^JC+!>@a-N^!5oxse_{iQ)(+ucB+_crkcKzRNSZgEyp=gbG!{a=TsahJ&iD zF!@ztd^F`<4|1CNGAcy|cn`fa8;mt)rwRa2h1aV=<7-G#KH?7 z@p&`$ji;vwnCRS|ADadU+CTY;aQee5jvd0>nxUuthSqFbl8aZ)V$t5l4dBX+3{q^u zubhKAJez`@uaF?{Y@;{%kewY>eqw~r==Q`jGQ#$o<@cIS^*Sl_-S3=}jygOYerNUP zYLvgN@Ho}dWIKP_$$uu@P2)-6T)8)pPL{_Z)wT_`)VC^!bzNSXO>XnQ|80Mf!uQXL zvTcl>*AIe{yVl3;9zf;Of z!kG_MDhPi3GN~$qsdTUY%!n;*4(`DAj<>t!?#2owNJ=~=26WkdcEsJ^9z2_>H_ERM z=YCmVIpb}L7%%6621J@!)$SIT0%dZ>j377v{@@hNfAEIa1K} zpndg($MeeCRbobPYTxOYYxp+4aTPS^YT{4#kH3;w&f*fL&r=z;(;W;UXMB7dZiERk zOmhvw>l@>|dY_m#Mk()bGf9kR2x*1kk%tWmh&+hblXQTX3q0~*%?NJ3EBdtd!wcGk zRk!i^c-1QBu-~fVne18FkFOb(JUyt)BOMRGTxzWM^_{!B&boS$3S|}VUaQT-+%Inf zHS{o%*TS;P#e=WZr^+)#O1DvVI^XhNK=cv{ob+v{yER&`$;%Z2BdCbKl@eJw-5i9CN7$){GRR9z9~ZBzG! zT|CGr_ze|*>XumK{JzGwa6|eq&w2PRQawecFvW*+9HMy?MbbwE!rW70{aRxZjjWM) zAf{ms53BvTBf3BXvgdCi7r(ba-e%b16t(3uH#y&XbTm|@Xc93bTMB~@OKtbodbcQ8 z4Mr)pvMDhHoi=@IhGJo%JcV;}Pxgi;UR;BTByrzGduqGehO90jc3nS*sQ=amc@Db9 zHtDVo;UI7f87FkR6x=yAQxzmD3h+yTOh0g1{EeJ@%skPtV=N)yM0mWzHuUD#P8J9Y z&dmEZoyO;W$&oN#P+_eM4!dnv4!?0}SByw-hAwqdk-dX2-8Pi+fW2pnWz%DE^SQ%- z$XT-pr}GX*TN&XTL|jj!p@YiPsZNeut9-Km-h9?V0vAZZHOUE)W!|%dxbrRf_opA{ z7@l3YQ+!}St+IAaZ?ZTM%%*scBYhxLPHnLO)|)to7Qi2HW~Nyz|(_*d;x5l5ApPEp1-ec_R{>(J;C} zDbV`5#7nb2U(t2bWjSr!Qs3*Yn(5yA!>C|p5LYBB^x>Je9GC}L_IsvX5@xQrGt|y{ z{FER6wqcQ{^J*+?eqpe2FrsXW*A52p*;{(MPRNF9wJ-IX~1B z!t}h=Tt%b~7NLmrwA`+i*yPllTQA`9%RJ{Y-qe0?qv6czQ5BXQuL00b*bq>l}5HZyzRmC zcm(EYhH9fa7mC+E%EWoSEw2~mp}&;qp*Ig)#&^w~%^FS&I0%l<^+^QZsC1lPcjtoIRP zxal_mfdxy3_Yr)Q$`~G40KqJ2F32X<$NYh|!gWJop~7Lqo)_3Cs3foR<7XH4OJ1u~ zbvA&_Twb52vw-%?#Kj*_7My3#O@N^^mj=6)yJn0+&B#3*HJci1Vi7nqn#yl3qAnv3 z-ZVf1OVq>8&AZ$|Nv-+Uh~qN6Z5tvRc{XW%7wR0XeW!y9oPqLy9{2ssYHJJ3y=SxdPtQQI>b+oCuv!U7jB?7q4wNdF zx|o+=suRbS=+}xee86zr)!1C}GFoJ!8VRg?BmXmcpeO^e#dn~5!HNSHp z2!nP*(O?f|SARv!f_+mFb=Q6!6I#5Vz6pcw#7tL~5onLRka;G^R#s~+Wk|F`9bWV-%5OHdH!zTaj0|snYV`-tQp?aINk{V_(qdwVWQ3(=;m?FLEj^tvDX)^aehC@qB zBiQ(N4P8Jw(6E__A0J4-EJCIkH}Iai#DdXi&F5)HOEeb4P33rz+}L;H_HO5Y8qywH z=ZKBopV}firt!cX6ll_I8n+l8X6Ht%I&TROXZ-z~pf-)=U`5a0lD+GClN~~gyCZt> z2}gK6-Js}OB)C;Zb_n$z;b=nN_D=Nk%Yf*#`dB&^Wj=cS_`Qdz^i%kNsR%z(op7FV zh7tARlY@o<{hKhjlpx%TTEhP5cRERS= zusd%_Q&#KgqawEDJ;Ns+=49FJeYmG*wx&j4x%ZYZj~=ysJ&|0aIBKgh_{mZ+M=Mel z40XdNw7-b8g?nr`#de#ki8icD%2FQ(z~@01GjalzC%Y3vn+s>h2@H^jwQ8I=~-&CXG!nyL=#_Zz16MO)v zE%ulR6(F@)d!7ayScgz!v1GE6HeGJbL`=c+{lyj$4^2HMf`SEWSir!gMad6pE#tdX zq=<(L4SHYJ{mf@TD37!?Dj;GCrpcDV&bC}KypERE$6fN%oIjmx`qtvH>eQ8S^`Bb3 zVT=9&Dsd!$%s>jS#JvWUxPO$^K0o<*qU{gL26`~eYtVF{d7eVKSnua@WOu%MF2Eo2 zOs6g?)%TF^l1$*Ev+E^Y;)is!3s5q3_Eu|p-5sCicD8RqDk|7%mS%u*p_zA6@?&={ zr_@}G69Yaw(Rl6Tnh7Lm7u1^EP}jefW+8#iAq@Fs-}@S8trZOsC_)Bw_FM4dVHm}; zbDazY%HTukOh2h>lUj=>zf)ZM5iNO@GUt4PZT2FNYCg7DpRN@uzjG9iu&|E>t)vYs zZe4-%DBIRcO3vP!889Z39qu)Z&FMR5Q^%nwq+KOQN7}uVh*C#qr!&bVXtTmmAymY( zco5KhGF#{#lpAsKqs(Ta`Whpm-^1?0KAZ)xvC;v3my;UfZ-_&Yw5t-a2(}*k@ zRLMYktWvB-yX1@4?qNPb?!I$f&bofpziSmXdrvLGxA^MsbZ{3Z{4{pMqV=FkxZrC1 zm#0vQF#+rP*uNjs3Vl{T`4}kb=;vH#TEprjWJyIhc>>Xy3OVELJLbMax)%o7)>(^T%qUs4{OC*GjFp zI@?mU%WVpDd-#!Ni-ON8Q(3656=-1If?FVPst%&W-9dHd93Q zzOwlDEVZtiwAW|1JUi7TlSeD~*b}X_>jCUwSf8X9K8-Z%5L~&GWP}t+1v+#i4&x!~ z0vKU6t`I3L#@JR{t}M;L@ESk4WX?_+FpclA%WMoybMRSX??wjl>ghnnZu=`MHNiN2L3}3q|Ti*WGv%k4YQN!C;hA%PLW4(}8r<)?WLFq#zHpsbuiemkr6OOE4Gb zelYLvu)C@YA2!6jy@hEVe2bu%K1|G*f%xd!YDHYl-GBVx{qs$54;Y>@2217`?|$?T zW?YBflp7kEN1vX9@{HyP{}rvX8c>(Dz_M#3o8D$m>biYKdOpjU`ttE+)&u58fC!OIw5`Z*&!>uWX7_X9HN()z4scw@ur z`uER+PRZWTL4`*W#jVqI62zr6MErBJLp$XR(=K4FM2Ww;m3Hv0x3WWS^j)rOe8}T+ zY%;JfE={Ff~mWrMI6xhaH5LXH3wPsC&W zOtJaExFy`?rlc`6UTtl?#*h3|6Kq`W%55H@320#~2qm8t6lvAdEhu~jlM7wDjKQ<( z^oyORHP+kQ&#l7V7M@dCFJ3NEpNG^`qhOxttyUXWKQCC=A*Bp$ergru3;6(3fHC< z^2z(L;4Ab1aJ;wgTf3nT0ei$7p^x~{{b2A}7o}E1*mY5xyr-(3P3tzkV_oD_M<@x9 z`81OASaLqrH`l9b4G4l6b1z2Jq+SBq8c@W`Ww{=B0x#mFJ|7N&FLNyBKRA~Hw-z6A zi3T6vlFDmZ-yc+hwj-Umw$|$T@_4zVUxd(;vx-Un+9Ub34~liDCJ%vOchk*Sb(tDc zP-^#-8I909B32zK8iUFG@93ogRD3v)+tnO!d?rQZpGpye6o#e0hD zSRNneE*UuqYQljc11))C= zeKRdE^N#PP%NY0ggFts_6F%sl|H~9Zj>JUwLe|oKna9DToHH73AzXoONdr&e)A`x+dtBh!?NL@ zi@ouGU;9s-YF?T=AF5E!liL(*EOZwM7d&n^b9*C&<%9kQB@lX3H(!S0w6Za`-D{PBU<@yV z>ATg4wv{rTjniNNq>4Pm%X}TXEtoFu_D1hS_RO!7oUJ!2ei^hdB;;SgJ5F=n4ZWTj z&4w*Qgw7K96j6a9n`Pv$ek@13be|H-d~mOaPqS8YDEQnykzTFuA9QkHf>!8K+|;uP zl0CnlC8sW6!}LEv1?Peldqj3O^@I!vg-Z=cBJiXuGpaDL^7q+pb11O;?x%1cGrP}h z5J(6}%cx7|?y=}vtkvy33iNde>O2-m2FA(5i>D{ztjNv%`aX$zbJMx>aNA}lO8>0L z1WYYz2$)fAL)&{_y9ufV1voUT7OP3j-|xuRFaB|)qSM?4GhQZPpsj|7oxp+XFOKxt zd;F*YK1-FfbK6x4Mn=}dPXk!XGKQ?L7kucBOXgv@3jv7zVnSCxe>nKW5{G11^yC`t z&NoZNv)t*z%c$JevrJr8stFz#+pnr`3ni`Pl4XTfxa|D_WRxatdf}~&Tk+7xvWZ() znikv|=~wXG%8GX&fvK_!dBPnO$ZE_hK|Z2Zs%11=o6vK#piUs> zAN#qX1C8PeN7^S`=C-dkn!8*XS95b1er6An?5p(N2p^-kC}430*LQX7Lgl;m!= zcjCT$wbm(4^v}&A75Py13KqcZHeZE>EQfzEF{@EC`)d6nU3@GhmWV&%HW{;k>&}zn zXUUh#ClB(XXpl7hu8pDwBU~-CLB?Grc*A-8;HqMyGN!}BmF+Aw>aQz04^)s)D;3$?QvR3Hdy6H&ayA9VD2>R$cTIzP5vb*j5cN1|h!!V+B$E^Tfg?KmU z(6K(a3swhuBJMg$&wuu)GSIH>f+;SgknZ;It8;gijxYk-H960+{!{FU_YTds<%T&M zY}29mEb$-dt<$7aZYBq_ z5d9U(cwcwWJ)OP?f;MDQ-q5mSk2C4Ld&?lQdLRJ3ovI6gPsBWbqv_hc`48j3&OgE^UC*4;(4o5d>elTG>p7EBQ zAQ*H79@D7(mAdNU12&A$wWr&z4ZdfQ*@S$v64O+Lwqbh%g*u3~{p-7Dyn+W}ah&DW zXOEz(9wU!~t5}k*R(6`lyEMHZHu2pm`%++LU>Vn-SPm|7A}46|h4BA#-|c@{0+bd3 zo*C5uXeEKX`2XK7;y-^bDg@ps_*p5L{BQ3ELg8BZ5AX}A{;gjKfA-HnzWjf_=Kp>T z^Z%UY|NAQiUfK`CNdV-R%t$L6CufGk%MIxB&kmN)D~n-Fx)pLlZs%g~DO{8ElcMy)IBWbHTcJ9d+{I z(UTen_i4i7T>BVFHGtm6>Js#q2#xO&ux+-YyD9sD8MmxWmf)U5@e2j2u^4ck-~0Zu zHSu{3N7$;d^Six0?#1j9P{0`*O*)+#L~f zB;;qff8(^AW$Sg6JHwLJF<1v3O`DUEwIHB09Tz&Sn|Prx3GpYvC*bo6+u;bxW!0<* z)bX~yCW)um2H@(_HlSB*a#Ls4H)b{d?fJUCn?h4c1sSNS`?vKG>i4)-w(vMQsHJdx z@d@yNuii(o_^3SX~bOKpYG74`%e;t95{GyJoskMX<;p2r>7&GC-s* zRM!uJ2~i!Ul_5d+m<~=R&H5hZ0B~mh&B}qZ$T+sNxB(FERSOvE2q!w8ibf#=j6@^| zU_&H~8JtW^ZnT1tD=Ns{L-P`vne zUuF3CcE_)uul~#L9o+Avq26Gq5_e`qc`uxMFHD=id7i6MRlvinBp&;ULG~pTwo=VY znyj=h&t7UVWd#~9oHJ{?I4>N_^>cE#rg-kBPVMb`3H7C#3>?86_xSEP2HnZ99w7XC zKL@M`C}CH%`>!4Uz5nlj{+=@N2aue2HcD(1zX$)}&qcxmsO0U8>sP-2@B_ea)}VUe z|3)@33I!_i;6MDikpQ=I*SttdYB`PkC|9|>s`1$#wf^X95 z9h@BSpMM4|1dE_L5H;jKeltYcTA}QhwOe1^IUy|D#f%2kSKNzW%E}02gp1f)rADv3Z!+6M+5!Ae=l| z?TV(h@t}F&??lmNWCkETJ+>l1EFtdD(5z9tNJA6|IgeBTVhz*kV21wj@Q)%{a2HM2x?3z;Erht-g0u*?a!5@8Y)DiY(V^s2)I|_a%^YQlDjyY7>IMd5h!vaM+GFAvL`$ycdg2 zB;@6`0T_&!s7p4*tZqec+_Msy8m@dA|5%i8f<2%pYkHGxex^A9>iL$lPnor1jalZs zcYxmFqp@Obbt|JU3-0#MC{6$iIuKx;;*`W9$Y4Z79Hw#`z~K(};J!?kS?zWPYNm=G z*Sn;h*VxiC0)LxJzw@D0c{ms+7W^1vbs$Z^az51;s9C*)1IDFM3g}{6^>J!}xTV!S zoVu7`d7YQ85CMZd>7(@^I@b<=UTqKPCCz1S zP+qAAR^;m%GnzQ1L+2s;X!oV z-kJaLb3Y`h?EJPVICn2gK3#C?8V{dK&~Z)99AMQc?^<%lVZ*A=Y*6G=d0!N8*PpWF z^XJo#X%Leq0~FIZy`tb@U^pK&W1L@@>(7!~_ax6_%WG=gxpZ0TvFB;+;E=wAd-@39 zt|^f!v!5vpq{E2cPNf0eeSM?CjCGdgli?^{rX`-yH`|O)TFrM?Y+dd4aHE>XKfkb1 zv1C71>n-z$>mRa&MX}gJYHhiqy3rM$dblA{;i=LOo9Ua zV(F)PZj`9Bgr6j`G~n#ZP@r3_@F$8Nb?HJwrka;$VO3?#sTiRSl@mc=0lV>h-DIy{ zV6H%IT`7P2!_f|X)K=o>YS$wm|NjQ4am%1YNWSCrjl})b@{tSj!{2WXYFt`v98SDO z!LcvS*SnTz?j1F+58TXF>77Gv&AU79&$!d`8~(^-6!xDPm7IQ-0m7}MrpC@(BhT9= zQvcT5Gh!}_;lQ8Wasm}%WE6{h5Fml6EH{TDAMz7op->0z39hZzcYi-!C|su2<9fH& zh>V*p;g@ghC*`J}zB-+b@BXCU|7KP%Awlh@rKw8BO%aeCE9dtWdiH&duZZ7gE!}hP z4OHC-_G1@-4-jqv`K_ zdL`bIDC{d#eh`jF$@cg8o`}0q6#Jr19!7LcW+c?}>`U5eCdtlyecDr%A;MtvYJ9#M zFwOej*xJ7$fuCc`-(O*mF!y9_WXNQ21X@+$N*1srX2F zqOesL5l^a5qTMe#cnuIG02RX83LwKWC6%Ao08NMxG1S`VxGKE z))ozH7Nacrai#|EYZqo_v??dSN5y_J3^AH5E5pnf;A{j=HHOVZ3~1u#aaoNMWM9(s z0Tp6dus6@R+6|DV$e1V>No~L=z5*l=-_a4hT-gM*D#8jn$?n*^g)DUwQi{n}cw>9qK3tOw_4e}PyF|Dc+%qXkTHhimzHnMO)fW0=0$DFOk zZFV}SRli;zxh~S&o5YdNUaIT98V8U_LVtuKq$0`Wh3{`FzswEVQ2sgkvIKy8{v(EoZj@5+N^M*yT!$R^nwxzbjuH z-`wouivSslF<0@myQZRM_d?j_iH>Epp7gpkq#*2hk|#YA!M6qwQ!spz>xq4y19je+ zd=vfDFqd`5bTGZROQ&qjy7C${JMj>d<1v=@-@+m1e5f}R1{4&Y#k7vIU41hTy5CE2 zOQETXe5Sy{YJU>ZFyHK1>Fv0h@%wDmeCipK5JGv>#8vQh0U#|<+B?wfh*HMBm8K8W ze<>D(jw;)U@CS4KHGEoPlLPK&^8gC->9>9i3v&0B=#QHx{x2OZz5;$i(0bh+@MKLw zgnP7DSf=q@d+maufjCjQuS61u9;1q~h1XtbGvFU(0cZXpa9o0`z*?Q`e(kU@zE?eK z>~~E?)Ei^S%a5SVm{rV`koUz>Ft#=&=S&cYeMsVcK-!5tG67gO zOvPG~Ul<49sLcV5@%9db%cE7XmzB;M_&})9Oxm0qqp%mG#Xp7}{CIV+wLr)^yYXNAaVilMRx+T} z1?6I^nPV*M2mXmE z5h&wv_>_Fs7y1yL;2=Y4w!Gsvz?TrVYiOnQcW)<4Me zPBKL?X=;^-`u0b` zc?xm$e;Zp8X;XlhI_}+&@mP&By~PGEVa^MqjzEi5V~k-nQmb#K%xP=&y<5kY4IpCT zeTQDr{n*|>R6bkxR|i|U45s5OU=9}L4lR5PcW6$yprOy`Fy76LPQ73&{h3l7N7qiP z)PjP>jga4&{tDMI%?)T6BoO18Dtk2UL4 zf*LndyX*}1&Zf9#bKoY)rhUciu?rIU26q4co^ntT=YMN0B zU_~O~C8S*JcBg5WjR*x1Jet;A4w|9n*f(#={?NNdP^=S(912qQfU%6{)j|CROnOZk zy<(|ESpe>5<#QUO8j2Qf_D#LMf^bPAq8}~gDX}T#u*RTv$g=S4luGS<`KCTE77D~) zG~nUT)>t8iJP@2lD=#BTX^URcSk$up?AQBD^;>LMc9ahEtL5NqK%7z7KUv;bMpW<@UxN8lz*v(l8- zeoK{Hyf>a%xzBY2@k0lKR*P4?&_|6jPQZ@~Vyg_U*RlVF7i8cl6eR67ob0D;G>A^< z7?Ol?BMOgu(HIn~7$>1+XQ(ds+0NgoqPDai9~+g;BUjPEP&IB%Gl!$AgF_Z@zk--XKsv@E`JXHfQL6kzw;T+!ok3jNZ>5bWb9HPf^ATo4H!CXDvdzR{L_O}%HNWMAlQ6jD9|ja zJWm2gIfds^6gttAfmOgVQ?!_4L&G&c5MmO(ciEe@I4Dd9v21FfyXSPHD=Vt=&Ci!{ zrp&39)8Zvf7u;wC088W+u|E9gS19r4q-wuMF=s!g8&CI!Wm(5ObLNk2{pSz!I7m>$ zsVF>mBp7fVWoQ^s&=?SSIy7EuX6t^Z2a)K(H32jFLj-|wheYy#LzYauP5HJh9>hEv zN4TxqyamE)9{8Cr(1u38srD)zuJ$JIdUrC9fZ9mScP|)zQFEPGVtV{{8Fy zvP(DTng=CfSV&f#15(J$^Na0Tj z8(sI+Dyvs5h34MM=5;}#I&^Hn+<;IiK@CkWHj-R8y${WU*&l-k5&1Dq^J`+O7bX}8 zb`((z{(|mDfhLBlwKJ(fiVhMEUcCPOGnFoi0(1U|$Xz;gRy7gpKpS2TK%h8jW-3fV zzuV#;8RM$2$(#N_7`$SKNBW;z_OoJ`*Kf}Nt>D>sG`RBlIv~b2l~v`n z{}V`=riUFM)E@3S6~U*w@NgD6(l2fQW7uD(QQ^rkRIlRV9o5PK-62qBKUX+fA5{GC z2yfV+j9aBrz~_e7da_IuZJf~uPR6vDZ*tcnNy+Z4eU5=_^A-!9qVd@nuaP2Z{(Lq? zLXH2aO$5}a%qbeV4keS2zH8LcPIq_rr|=LzDzkj7U0S2lPmvtN%Oy0l4ZvR*kx!nz z_H!FTM27g#u~`h$h@I}&EkAAmG))Xc7mvxX2PymGjUYP2$-oLg^jAR@$Eo17D&bD} zPXDoAtAKjd$Y(%7*s6$IV7$!wSKM>uj;KB+HV;79#=bu+z7ud*K}k}yB4YpT?$)%W z3koJDdb0t3k`%*<(R^oY3FaI2F?7&WG9!!kc>UM*}FneE&_e{85n)Usp1f>Px1y{ z2j7u{g0n52u8M>3C!f-qrt&lhdhc;UuR7|6|EauJK8gK9osR|$x~9tsOmEJ{I4 zMK=JA9ydc{4Oa&a0)0!D{vk)x?K&^TCL3UehCo`9q@t|JRtjwU!{f=?c@M@p00Wd+ z4Gsnw69u5;7(xc&xrJ#%dusYEh_5haoZ#ptYT7vbc;KtcKvp#QDU_bA$K7l&UH95- zoQS9xeg0Vr%(I5_3QnuKSVBOE^NTUU1P+*V!ij2eg+P|R#N3Jv)nFI5&qt#0y;$1- zh^RUjo1U7>TwzO}q#)pat_KD_=EM{EcJvQ`(qIi{d^0LmoX~L$-{R6q^GeuF1uoZ* zo(+0)6wx)Ck}#Fj-IvS09niFjTjnU&Jul$?Cxo7p;`)3)Ka{26qXq|qucs-aPZ5`j zv-8$ymMlxX_+dra#`gf^IM14R9)PZqjW-`_vP1GWM$ypyuh1w+hJ8?DHBoNfts?`T z@ecY_R*%E^jonkZ!#tPMyL8)WoVChGY-PwWYgIf!U$IIsn z@7B_Om^4TSQ*&ES8y9rF{q>1(803iki;Ckve9JNfTopW1xzf1-^8sGj?1L8{5@q{V zmK2{t6vE4$mO@_A{)J%=F?61(LJiz-aH21m;ZeDRO3K781#?9V#zK1w!L&uU<@yj7M z0&CdwLSXWlqwfnM@L~^celDbgZ)wDp2sfkPu2JL1!?PU=fD6&m0nhQ{>Z?T&5qy8+ z39XRv&aFBbPJIPd4%@<0`%gJzR}^WkD8@$$s;_Z*P|EBE0G-+TSp#@SUnsuc7>hg( zP~*z7*TNSP@V??)U{LTa4N58n1g1Bfbjd&J_dw$-6F_O>E>=S453^0$15s4~(%GAB zPqfVyI;m53=fuvF3Ho*qDfi98lLg%?*oB1dZybkH@IS_2H%P=yL`14Bq&&D@9GRrn z0{r8Pc$e2h23VkzGcM$YC(=d;VTb$Yv`v9n2*d?PdM(8t*E8_~R$iG9txg4n7oW9j z-n}7P`Z446(F@>xPv$-M;Ur;n&GMickTDXs_mR=hn-DrS)RP`ugL&_H?9$zS?afQ$ zH6_MNs&H3SH$yqv23L0bKjPms%Bjj_JenPF;@Ew)O%300OAWGj#IlRMg?M+i1HyZR6_0v?E%R2Lve6niT7A$SzLt?^7j!0FWwXJ*t|Q%{(^`wh_fin))%4Z|20C< zrQD9xid47!?FU0a$7E3Yz_D7;1!X)rQ9Fg}Nux-@^iuUwGT7b4G6@ngqy`6U94Jy& zEYhHH7TQdd=yG~p*o_gptnzJ*CokoRt zfHY&nVv!8Gf)fSz@6T!nqXba8D5Cm!5b4i=d#B2&jn2n|fY%`(+;cvgVQ?nHCz~T7 zm`$Ay3QfK2YbP4kFH`SPF2J^5&wSHZx`HW7bu!PedyLk4V{V;lQ0 zHBbo6cpx|I*&6v}EK2cxVs*^n+m8JhPoQdr{*rU}0hJM!9oUJEIbyQwXg7Q}os1-* zIYODCVO*SD)UCb`hqxwlT6`9IwsQv-Fx0s#M^Ww&{i-pK*YHT-UzJuC{|a=<<3>W& zjx)HeiFCbB$JAnG(j%%fwn(i+xzGZ(mO&Jz;5zS459+AOrTJ+ z?v(MmQsE_a+Br%cEKg}p!0-c25iAzZ!5jGB9#_`8n%qaB09K)f&7|55s!X>|KQMT{ z>gkBKVM9Vns{g1mjM?Kd`X;$|GEBTmog*oZ4dI-JcNq82!cUUR>%#zr%Y3yg(;TY` z6?nE-o5tN5wvlNqG4Q>FsO*3}Et@_v#X$=KFz0QRFVfh|LZD3ZiO*Io%|__gO*pb&9XmMJKs7x%>*PO)+XZ!rBKFv?=jZmzmd% z1klN_-AvH{^yiaJ%qiEW$&G-Z*Up*cKw6LSuEh;lVNTG91NMUiFv0Y-PW8P{ReRE5 z0;fAu%4k0@P%gqnzKmQlWp=zA8X`ek0yVg3c#)6RQ2q-@Vsk)|K?hCQRzvyuy=V?U z{*zsRg4+6p2YpB1_A;Ai-i>;9u5luY2MwkN`ci<(O}m<=0}@_`8^b*zwh7sKiMbmw-5Hgf94pDu@>vOpZkhA=QXb$ zRLd7I2_Okj!)prg^TMQm-2B?vm&GDh_E z>Mti;!$8oHxLg?kES_W;k%%L$clKahRfyYnm4Vgn>kNzv@UtXu9-RS8R}!`M8rBwm zu5BjtbJoAg*5%MH)u8p3Xyp||5UB+uogfJ2)~Y`|IvzcKZ!rWgyEzxnuYwrA;G#`z znmG)c1I+Es=xc$IBl z=#M!qK7eYB?hBwwO#^yJqs1V`d=VtW^HME>Axha+f|HEpUCFq7Wf!-nI9&d_417K> z57k?+cX}c^&>LZUG?Oq<1`+IL4IqB-yjm_4{Y@Rx>4(f)#H3Sa5|gL!ZRB!2jA3S> z=g9HbH%_7TsKMZbYY_E>lC<55iFx?Wz`w<$2x+3v*1udl^}Mt$6YE;<36$N*4kBRQ z%pM)}U|@L)`~?_6p<4i|25jLn=t65K?nb=oW8@HI<`aSoOAqkY)w4Z;RgmqLK}po~ z<_CK8CkhNtidfZD+5b2~*UGOjrN6MZ2Zb?a`PaxkR64w`q^@Gh%K zq+&=$Z%znMzjeU>)CTjuQiZ`srC|&(oUPzP8Yfwy)M5m&}pL{K8zR;M-KU)qyN+2GZ7L31gw?2+xI>+l&Q;gdtJ$ED_g2 zT!JN=UsY+=cgAxOk;4f209ct!{DuJK7#)V~L!~n)$MdC9)@g!_q`otm$l}%h^LGI5CoT+;ID@gB7_7*d@Id&m`AFD+Dn$i z0sE^aWoKd_;LHV~Akx&t$-noExZ5S9m4K1 z`PyZ&o&J$O-2*#8@J*6obl&=qD@klW!DWRz^QHfBZ z_?gYiRb2<6sBz>p1&ytfeVqcu#I+y4M=_~)-Sc`WpH8r@oL^8f{WrWV?A%(^78L*$ zbJR7ZyN%h(8bO`!r&QZz=9aB2(^kJlEmMkuu)ZS=Q~S;e^hY!>+2y@*7vM>~yRYUJ z#HFIsY?PDiGPXOf05Dt~;IKSVij8^R_;N*!374IrhM;GMhFGyi@85MkM~0?fqHv!v z2XnYmjRtz2+nrp1GE#a!-m?IjxVwcF0EB=CM5soopNwQL(01Lz5czolQbD10$3r5U z^@@~-I5B|DCDK#bEG0n8l+SE5DAyl}kEq;w+=YPns7jEnrg)Q^I-*q^*-!OUJ>m+e zQDxYgK7wtuQXU@%0ZJk$tc<(x3f34T$K;SsWLUfZx^?l3-aqjn2V~IzGD~q!UXr=b zn44exIb;Rfg@X#ePjC*_@tUXsL@>?md?)*68dvS8=73YHoUh`M93|elXQ1sy*!|;l z6U+E%^oka&RU{SVrOe!$Hm_f3INBIn24&(0yM>uEYqdl;-VEk*Rm5PjA&txVuN2_% zO3@f-UrYeqVG-zK)ZIXl$_znMGZz`s`Ik%UF3HfXUiBLe_I~(Z6-r$yA|A(Bv@-xx zcLH&(82x)s;cAZpO-f;NTg4e$08y3dr*K)z#?{tYq$Wf`mON9}h5RrMAK8lFl8lIPvXa-mPLP z*u+o>AeA6f6t=g|07OVcWB|fs$TJ)PQ%U|lY)Oq|6PE>05>i^W5Fk2-_@F~C5xpaQ z=nG#mxV+mD{uzRTg}3(^Kt*#w?F$N&1tP{~z3#wZ<%ARZk4%MP7H?oZg@}TWf)Cnm z47u3B5u`AmgaZS1JVlv<^l!YLsQ^MO{828K4eJQF)stGH;u6RB=c+~L-cNU#X#I%{ zYBm)>CPX$oPNN65_<~=Kfp*QH468D$9y1v84_YgxD-O)>Vw$cwX}y9<$lN_i>P5>C zqowEe($&sVcZLL;X znDk9eJZ--(O&ID9ts?y>)C3BjRK5kos8z7~Gi1&`VQ+4;Kb}f)CzPf_&JXB~h9HK} zHYf3fR1XIw8nijja8u!6aR9)^^0pBMrXP+2+UD%2I_D5U)fM=wZ_e)`HQ%*dGEN8n zz7i()EtP^Tv+?c7EDpuG#+^xbUNJ2e$1E8Ccy`6eD%00;(DSWghVZjDns|_N&oA)g z-R2{~lehc#b4%5$%Rx{6E;jMr@&HBH*I<8#{ipKcDIe;{NdEOWMyn5A7Rj*`9f0;o@*7 zMtSd~m>lsd@r@6g0T>i{1-CB1&jo{dzsV`&s8X+V7Si|T9W!>{SM9jM)Xb!c)=yKF zI($M2s>|No{en5Pv;en2qN8ef02F*KC=5*X8ex2e;qoTuB>fRUiqL{E_8pLfcud5A zT5or5-^9KdrSm?So+rZTHXZtu4UhEJuX>KGQ~37yydOQ9HzGg2N!WU5LL-;J?jmso zHtz;-kz}oX@W=GHjG$APfFo;R&$I|QNS2JF!yV0k1ygt_qbjHLq!qR$evbL<;uT?u zN;wWHuZEv8J`KUfkvmxKEbnI{Z2*0$=PpK(xGAptizvG*rjapwK2^FX02*ez9ao@n z5zGMHAo1Zw6kosU!oNM7vYQh|p92I3X43?Wy3gvZmeo=ha@#NO!Bux-x0qvWzFKj=5CA2&Kip4j&}^dZ z!IB|yz`Qe{UHdu4jUor9d*}x=-%F%xJI!qUqDYm06itV zV7_@ItpXAKT%l=Nq+WNV!@Ocdx$Pjdv37#*>){5c@Gd`%mxC5E4LHB1g@yj0=8D*{$vQyJV@_slhCj4N2sr+&Wn(2v?O}g}fevREk zV&l~aC2Rd~Y^z@OH?U@%U!$jJUK1$0_vk8!)+Axu+%9q`q*F~CSL;mCBk(w5K@C$G zkYeq2d0+}RnDT-0WKLNNl)t6pcV^DP?LD=nd3(|&|i zo!}Pt-BAs?eRDfv;}xN=gWJDD%Kv=$O;WM=Rfc0EK(hx2+Tu<&`f)e;=2$#qNJQS~ zUSNI!10Y7hiac%sBEW8)_k@oFMHhNm8u9*}lBP4wFRDnDy{{~f6wyCcr^OD-(R9x` zXs3#u%eKNtbjUWioDNJ}ECyo9r|L{68TF)PdcNix*Iq$2L!bx5Jr$qn{wfT*S*SD( zr;gFit2MX@pagLvIHSHklyJwYO?xG!ZyfdX>Hx2T(=}(mH$?@IHBL`pj1kxf+(P2K zP_2XgUqSH5#;Wv$CemX6#0Ai)*JYWcipqZD5WwehfO9XrX-I#1e---C<(TLJv>=1m z8z*KnP9teQKyenKriJ!P@1F+r+{d)X!IN zKn7tuWC#sq_RHnJ#AIwXZ#!S4`8bo&eI1#Hp?bmvIj}Q4q;&U@jHSzuxB$ib7 ziv4*-KpB)9Ae-%Xp|*u>{lfv^7Qe)Ccyx*}0D)kH=~(oJmD`*&6a%@gS~a8~^{=v| zL3S;ahPlA0a7jYsScCD+gPUt(_x`)Y*;}*y_;8nSTcq-#{JmW?Qxe6C7sxMSed2&x zO00`w=SBm|9J1xCQF$-&Vt7zq$@%`;wl|e-G!BZVf4ym`*WSMr!P3g3=*NJ(g{_%Y z|JOFYlOZ{av50ihn)LTU5w@AS_`Cd(Y>fJ;XY;qo{H}U3U?Iq)M9r5{I#8~q-UXp- zzJbri7iu?K?P~@kK3|Vx=r#ap;%qXuPyMS*#P7vLX<&b|p`k1s*FB;M>auNzmkmQQ zydPa7>TlAcwnu^2)(&(O=u1MKj>%V+DS_ls87>BHlL#LaiK%1;^}3%6LIkpoAlJ#& zbX=?>Pen5tWGW#9xW13b=Q8fuwo-6~FNe`{Eq2|jWM6hr+ySkf?NHI@%8dN`bJzZ6 z9k9fGiMcH!3uI3m^z63j{pkylhb0_$V-CRT2p?P_C`K<_qvASTV)@{aq8$%nhpBQx z0#Q+L*A_T_yJG*p>6@g8#{cP?oC~^@0kM|i0Y!QEP0m2Yx|(oqV59L4rdZCewk;h05g3}lA#%-9FfO}6jC{qd9T zlt9D&20kpo)`-@s?Lq}ex`*LgOicj)U}Z2nh!r|k*KDix zdx@0D{=YN>Y3`)sH{ujxS1Xa{)bo}%pOW*iKB$Z>_k^IE0uCV1_!Pch3BQ!{5j0Xo zpkPm|MeE85y>U6||GxDFP1@ybgdIH%e!Ns#mIXQQ6$}HK%T~hYo%|&298mqKmk?w+ zVIl++ZCleCeFa^J@3TyqO~q!Y9A)OE<#e_2K0pD+4;xmkz(DiF36&)8r-7%_?U8+T z9yvz19r(QNxyc5sE21?Pm~f`TkX-Kn8V%wD<=sw@wjlmwj8B>M%VC8XD3CXCBJEf> zVgYXoE_&W<4rV1q>le^rxf?M=uTHBX9of-x1wZ$lf+uTcAHBOWmJS74x-bpY6noR9 zu9#;KG8A39Y|q>C5E=s}yG?Ow^|~SzKcHS1YqZ^XE9ovcC5iB5-<;{l2E+EqK6nQV>^bYApH?IrFsp1+918U$o~I`8{kcT7y9Y@omPj4uIPi*oeg zR|I-Fsl#7s?6OSc-yD1kUa7G*k12=&)``?sFk$;wdMT~w+&u#x93Jhl5$x`@0E%+S4Kns+1%_rVhrA;s^Bw2yagE-_4hVOQJZB z47W);Y{I9!{F`LgxK(!c3|X(0V68E$&?$C{UC*{; zjtS9L0E{Lh{7(HtwaP4D3|`h|6Ie`xKOZBz4oq**z?!#m_I6D)*=?QbTFU1e!XFcA za_iK95B-$PD#1o0%hx{?17JHSb!Bq4Ju>(kb|u`XJO|*}XNJoGySQ6f!gYuSvMLK= z3Hpe29@GixjGtsbYrE!0nH^=8+}a-81XZ}R8bxe&(Y7hT=*4FI?RfIKqK?}pHwbsV zdcgrI5e!;Yx#BEFN?|~hU0E5)e?U1V@&)AF1SrD5eJ%uWol8|tuGhKU(a+LF zAe1QviBtl{qA8S*q#HCi0DigVIT`&pRj$DfRpBe>fv*CH&tB$WSnBJ^$sU|3m!sIH z#P?wSN4Z>GSq-6jOuYnLDt(CBa|_Uw#Rma>B)|?;_1_WIAHn~eOZ#&Ja)RQ^k&yOR z7+EhJ{w}HAy`+90X6(1YE(MP$jU~Gs~lJ`^ZhD+(>jpRS+_|p~>z9 zh8^_bjIk`e4E7 zhP!5C$-Vlo=u(LaqDwu2fG0mc{eL}|7~rn+zz16fQv$lr!Hy@v!!$n1j2}ugsG9&p$(# z6f|@TaOi6PgONTgDB&K~_Lb{@@EH-%$wE>o!x#VgjHEUgyOaMvFUNns3;+LlIlf_` z{Tc{$JerTVz1%H)t5z%_LJhna<9c&xL9j@JzqdU`DSz~gh&4Zn?Jd@9VrEP7Ip8O1 zgD=x)$+$fFn8jb@sYeF}A-}ji7ucT+>;!wrOjcJOB;3uljVOAUPkY?a%uWeCJ67@la zT>RH4N!1&LO5hn}BC&J#!xxiXSw5P>(Kt%>=`t-+OSP=uMA4BDuuW6~INB(rYS+7A zP|Fc{bLsP^S(4aXkLjsjESixw05>EBV6ty9_--DIFuNpl8Z$*5+&U8!tF%#XF+`7+ zDvPvnORRyAets5fByk9EBgtuVl`|7*c{&s@G)mmLdUEYf(89E_QXocPb!ak)kA;0W zlZ$ZS?&^~pA*}bGU-UH$*xdfeJht)$E_M9mJm0K1m#$*kPGX zHU9R7X#}5kjPFbMhc2rEUq@-rQzz_6{w^}=wgRz;5zZxrKl5+p#PDuy0(B$~U%c5= zU#b;U2Q4#-r_8yL}oJ z$Zm1tyeQ(yoiRdUMJJBVG^5~j(nTW)V{Hgrz|p02UVLWBo-CA61RFt2_R5ea-@RA> zQ`fpNo$Wk##k(U{G{p?X?!<`ic*fS_#9zV^p+I0sgCg@oHXhYJv~pKbYAqh`5wRdx z+JtnjzmVKbxg4d*U!6X1+MBz8Pcr+e4_=UOB*)vh){}P|d@7;)?APSY%>q1yp`x5H z#8vip^kzVd}KMYvArY#c{7J0d;{uW^ zQ8bsY$IY2(3ZQB64b*deaZ##*_`h4I)rVgj&~8futt_kzk^7~EIs(Gl@~rCGOGp{Y z&y){AiGcHHP5#J&1CZP&IKz?KRU8`o?K38sMT~P?{pvZ+?gmE6JobLSX}=>$JgT;am-|m;vI0E+nyyDiab)lK>&14H2g8U z$@JDx)PqdIi;5X|KuU;84&QyfUMczAi3sQ8BMGtFeH69Jea-mk=%r&k{C^(3k)UbW zuS)*yf1Xks1wcx*k`qh$AKVfmkPf-va7F$HsSFpY<@`Sn@{GW&M|F~%UfAcVY zZtAMNA0&na3#4-q&Djj#0R9m8#ILPxR7hN@Xis6uv>&%q?Lp zkz+lnbV!Nq*?>oappom&;&p8ex{0}ZEwnoZovv8F-7xR}o<4Y|BNJX;vn4^ZLEC_B z&Af!ewEgShA}P^vK!NR<#dy~>~{&J${tJ&&T;5w`g6z`=Z564C)*veJiIqao@`#le2h z%lNsC)jHd-Q%eKCxih}=em!dW8teoNcV@6in0O^^;M>i8;|*Ed+Ga>xo%<{Sm)j;b z2EDqfN8$TaDK?kBKMuV5{I?(+q4Xlto1pMr=4jpkp-g^$SQ6C<-#vF&Xc%IGQhK^; zCa-7U3IS8(3gHve;c_j5{))T6L5%837t+`$I4UrV>T|3cLCWFq=FPH~bDYU1+Mu&n+K?AX|}zH0ugT^iCQJn z`?&v}_MRx37Utuu;K^R}VB`4&9>95YxC4W)*rAkKZe6Xf2U{)tGyvo#gx@5AfDHfqalZJKIRv{oly*>(9l zoKnkTw}X9q?l)TWgxgju(0zMC5Q6J1WkSC*zqQr4n?oMmORnw;2Yy}3K2t+~Y=`^H zdgRxDJhKqc3vlxB!GJBUh%1ZtWr3FBaCyuGOnmAF;YvihuVB>NnaAV#RLjqH->=fE zrcIxXwic=j?neW~smczp)$@8cWA zSzNJO8huh#TCHz==G|=hrPr{HmpkR|vF#PQ_qllAzy-HCUsTt}HhE z%~HAIvjSg~IXWu%^Sp_&MV_R~<^KMlvLk=L(nOF@HZ}IR_V&~k89+drps!I-XjzpW zZb_^nm4{kN+!-FBx|#B6_AO!OYM^WxKu6Kl_Vb%++dsgCZ4abrRF}<`&)PfmDkquM z-K4?Nu!L^S-O>@N0RGhw2s$xq6rz;+$p0J#aPn`n^U=JC>3f1?=}n1wUQc7y?r7|HCl?fvJq<_NWthsyxrC>qhP;R zkR^)a$*<%r>lfpC(48fDbk{jwgkQuOb zCvuiVW;r2^i8YiUCb)_kT~BbWD0P(N< z)ElU0256lpaByV&{FfQw>KL^z)6GEUp0QAy-f5gITi5Vesh;n-jN&$!`||txb;6*0 zkW7@~MXGYKZZ>R>dz56)RW$MQ)jT0@yqTJ7(;o_M)(3aza{I69s#MvBcXyq%NK!Sn z{b z*erWVkUe|ZeVBo}ae=u0@QEmaiQK-8WxL#&HH#Mar1wC!NVs@iLbcS1ZYSW6_QczF zbk|;}&@gV~S-^9Cgs9PN=`~BLl&iwJcefCxwEq=no9x(+UkXec1?Xd5S{qVtD2%Kk z*aObP%M8~Rkrx_#i=tNF-DnHqaG{%~4!7QH4ynpJx6vpUkVSY|C)dE_2A&xYA3C!Z zd`IRn8lxV>;s5e0S%{t0s)!=ObxV-}zl9t11ERU;!Q_UrdWTbPVkCQ}D3W^+SKF8X zRQ~Pa3vK_pW!jM6u|}sZ*~Y2xup#JF+qpLX-rnD7|ENBZD6LV?!A3duw|GW>vNjg8 zpyM5zi_+XcB>{tTcF-crY%B(qda{|0YFtIfiMrA6QFR2RD-g-4x*S`pUngZi41WhcK27pHD!G7;yO12%Y_R{Wi?IMPNAJnKvT2whe zm-?yY;+ICzuJl!mm#m%cDsd02YYKDk<8ZHfWrkZFSRQb-3I@5myHK^<9P%ifSEK+V zz|!+0E(5Zs<*7D-_;fpD{buyki-*P|c+0RAgn0v0F->leNdC8gQ|XOz#0Og(NtLpn zUtcjkH+JQZ=fDC>rpNbm5)V^n#e+fj+qx9J zG$v~}%OF}JGpG3gpI^x#q(uq|uLFxv1+zwEoVQp%kG&_ZO8xK!N$7hAosJPy-la`+ z9L89i=iCq~E!9?4#9IN1bFjX~#xk?Ccc4-{66?&~pOd1kcSD7Ql+(f3b+ub~P8I6U#1W6?gtgRBP3`UPN#o!xwtW z?#3e=Vu+J0vqSx(^WR7Obotth04Q8$Ed&e#E}`3Wi_7BHw<25s5Y+>h#|08HMIUal zKqnHc)8Rmo<9x@F&u>QeZQ2w?G|VlJKi(a3Bic=gn6D|^F7`#;z*m25y*ZVOYS+3D z2<=TSPiuGwM!qoZ*1DU)1lNjAI*&Kb^6a`d3MQ+NA0HKLZeWLjOJuoo??e~PKB%J2 zcV4=n0U2wv?DoGHU^E;7?}Qb5_pi8`aSN&cPRQ#ZOsaa9le*C6)*Y%e7*A8~W|}+T zMk5xE$JLkB85|%5W=g1G)XcM8whFoEIlUQ%VVFB^dJTfF*MNMi2>C8H0Pn~(i^}u* zriI1&A)b_Nf2;qxunU$F zij;klIhwg|mw{>`8)bBB$KIOp{#Y$@wWt+$Wato9KNnbh4lV7#Jr70r8Rg3_e>izDLH^-Visap-oHd3+5Y#f60aN}Qs-e` zAVk3Ua)8p5vUh-B9AtQS6mdsrhJVHu`;|WwP%HyQ$_ou&I^0S%!$O|d7dOhi7q)0u zI(l^WR@65_tH$&G{L^ZoVO#?x?ED~^eKkK~pB*=6fpB6Vh3Lu=C9n~QbjRr!t|3{-+NWoS!Clr+}b!^tnE|2AJ;HYyvV ze<-7pKk||lBoce_$ODEN<<74z30b}sjrad1fGiPJx&O(i(jQWOkB(`KRHWo#dF05Tj++R+r(Dn7;*ps?oUEwsuV@(Vb_ybMB)#`FE8~t`9)V^OLIK`C zdh4ealV=B~so$+PScI03bVaMW%qjgi!;e^?``dmb(|kWDIr1s@*2p$QoruIknt$7E zc%*uBUq)c-xfp0~=xjR2h){wBJ0l$x;>%E>SRfJ1$3D{Sen>VcCmfjo|2D|a_-F0< z;}E3Q64#x|RzmwrowEWn7mP{mFdVD)L^KJa^T;QPh;5+LVn{vy^(xThyW5Pw`!(S7 z^btOsfcIB4zV|BiTOlH~8IWja#}JRn_`&yO9`FcO9tJJ)uv3g_iSL2QE?}0hpAXbD zxbF#gJf^@nx){<^VE_xj7R;*bj1R20dN1%~S*u$d7`%(}3wrwl+tX7rjm`1gc8@2sv$f`N3s{Q$ zo%=qy*3fOhi)bL0@$)%U$WG(@tLmVvwV_ZpgPFUKegV<9t5$EBLP)M0!8{g2z@av< zmvlWU#M3=u;pLlm-RLIJ0y&UkAAgyo?rQI!R4|^=+ql<=b)df_y(|Bs2*T;C` zSc6=*|9`81$}ce{Zopq+AhGR61wfJb4)-+G(*NZJ(CNJ(^ts}i=gk4UUb@}T_EUWR zPQp9S+uusJpdveb$0nj9=ke$3KfkZ?0R05({zj!UXLqFNn-N({(u!RHB1JbA^m2!= zPS9s`h0<&1sH&>w`Jm;8;(m3uS~WUe{H^xgp@|q*n+Pg2JbvxVj0!2cAv2n{%g!hKmwEKoy!56;PTR%i+MyGQ#kzCK=XOO;b2qeQQhJloi4xsJU~f+QqQHJE zx)UY1Zl=b51QT1%1P;`)y0f5`MTOR~RpOl$0Q4Cru_1d76NS!bK1P+8)A^pAJ&2KS z^a;(jKl0&|MuP=Aw?exAzG&f!Kio>s0=)tgwXkF(&yP(S3JN--RUwa7`qXsZ9hk4OVevompF!v+&)o z*W?}4zXZ11O?%TGRcEPpX#3Sg7J*J=s_aH9m5`lCH^8T?Uaqd@5HCqH@V_q0I5r7) zGNkO2D5Un#B;h%PIe9A1KE^vVB3-=8czZAYz3E%*N z*q!g5VhvYbQD5~(KEA^Yd3B#cby5D<;Sg(k;y*qv$FBx6z;oOgN(+-am z?99gp?^U#%;Fx*bNdxVzGH5g~_yu)Pq32^jd-Ix?I*kpQekNX$;iP_t9B5MJ(+RWS zK`cmg|G4zrn<`{vyNs||bB=>AKzR(3bFHm*R%w*E5O^$8mK>^MQ8-E-3qrWe-c@z1Z! z{hh>l`8C^&l}m{IKS7C|6KzC1I91{*upRYwy|(hrrww3d(ev+q1y=afXd>YTX8Sxl{suKEo5P{xn(z|pFn{n%8UJvSSs<1WJ#%`0Ah*j z556EPRn_z3Y;URQGn~CG0S5IN&Eo9<8aPI1(IM_&6fW8D=UK|?JqeSrstusUSsFd} zLlQ;Mcnt*DXpFrPFsMvSo?COz|Gs7!zOFu+WQ+@Be>B)G{zFik3%3aa2FP8`LsX`+ zEh*_nK?0t%m*tjaKtA}?3ZLg(Ydp;K?O>fl`=^fU2*$190#Dl8v5%!VqQG+?8?J2c z#i}Yc9tbcLYxT|!Q&rXHvpU|UBedE%MO(=5aVrnOiS*h9JVS*-?i~wSXOwmutn?vM z_@?QUYrPk$)CL2tU*Cfp4$juNR^@pfEX)&Hm3{iA&THV}?BeIM4BEzA_FIhlXFwvI z%(`>F`)Jm4;4*4=`{xM`YpU%Y4d>q2d)+fmX&Fq;?54}3?>w9k;?^`*_VOs@ndpIu zF;1s*Y)f|hqR|u@%4IKu59@BJ_-VbH-Um3WR^pXNyFyLEZX77VBDzaE@IdOqN%yID zaBCN!nG^RhR)!s*-Y1n8Q)L0I++9urI?dAFzdCU_1J{5_-EU!Fus>Jv{dA~9cZXGJo}qcYgq81aMZ+7^>&iMA)j#wb@Xu6oaKwRdXiY#7p(hZ zbi{*?WV8d-q++IFjB4G5K{t8fD<`IZW<}3)?Jx%S`~XOjK{R%KDavBrXJOf$K+~I6 zDT(=zL%ThX$CkN$Ycx=~&{v>tyE3;K;KAZ7@kPdG(r_e^RehrB7*EjTe3&q(-lpxi z2s0Qs9>n-0t;iD?9JTWdjOA>X|5bdHV}lMi^}(DC`~vu8f#FD_Ylx)(Hpx!|$dYPQ zuVD0`YG?w(I)5j$wX?8lK-}^JIGIyroZ$?Hs~`Uu{7>wEc}7B!NZ-+8-S-v>ZQHQAFF=!>K=)W7ECsn*7!&f6}&_3!jPezRqls zq#}`Z{)l2*ODL%m-Onf7Q73$zM3lTn3~0q{3b*@$*HCso0rL>5kF%1MVck4JFWWyE z0F6{z&RYyd1V1F~Q)cC2l@L{#T=&8iv$qNKim@3U1?k)Uh;XXpMDBEXl|f~;yO(%t z_10c)+l^OP&>FI|!$WOxsG`UKMIV>X7_HvxUQd6-Y#c8QQ@3`}Es04H{Jnb*0IMSXf_T3|WCI+a$2;{Z--n&d>v)$chhmYUuiWsTrCGm* zt34s@LsG#!-R77cHJgib)MVg5iI(o9IeJgXvUBb7d1AIuFyC(Kbo><^@y>h`^6tpC zrsf}5vz1eA$x%^>yh^dWYgIk$mg*e{`>)*#RhHP|$b6F<52*RNdKCmo- zX_@_0vlrqaK!gt_7Qhis{vHv)^^@r=6X-W{SIhdl+uuC)EkH+|i}9|uYH(t&{Tj2= zOiR_TPsS|{fLj>#)L`o%eAuMRljyBF?k_e2_>_RlvFSR7}9Y+pR< zel5taj-qR+e0vpmwSLst@@eV&kQV^M`wn5ru9KjLMq-1$OnScH)u?E1Kq3`0qxIiM zrdDH|{fmc7VbZt(;p|+}tHD{N^W; z2QoFm+uNTwN!rQJTTe%1M{%znNuz&wD~m=vi26*Y>@UZ3i*@@8T%1^_WG#2&|GXC( zN@Hi%^o70IJ?eJA)40OPhT4wVoa@{k8b6(`SFa-AtiBPg1zf0>7!1J6 zya!wjxeu#G3FO418MTv(YS|6%@PZj81sWWa+zA|?`_4hGk2t&kHAI4O?b3ViB<887>GR+JZg8c_GXB?7t5$B!r zM*ZIk50CYx-Nt@36^p4Wf}gcCWIgddqlgECnMUG@XmEoPI6Dzm@6q;L$S%pi6>_W5 z_^Mp2Vbv0Ix;1KUGWC&kHMaua9qlP=Z#m~ivS{8CdHm}U)~;WIyBcgRO;Tl!m7-`0y;(2NgA8t-xmRd%BBs=gZJp|<;B>`alxfuZR+PJ=kEERm4by_zT`X*) z>>&04-S@L?y1&0JoV{Z<%u4g02$TIH^}6)V{2`E9C9|#v%q$A5)kYPL`#!v4#Y_pP zd4*Y5z5dMwTp;z`kaKBL_vJ4EWvifPW@A^^i)1;413ow@8ebgS487=6Go{BfInSb` z#Qt(O?FOu*=aP$X!1a)WnK1EfC_S+jVC`l55f`3Y_8G-4&#K5{rEW&6YmwyWm!kbi z9>j>7&4*Ew+o{SB>7RxpsR>|8Ps}{vCdq_8NCYvLR{Ck3W;Dp?4EXO(0)Y&MJu`xr zs&J+kqp}I+gN7u#+4aYsLZDVbK!UUi>ke&OPUWa+a1|~3uisXG+IPEPD6H8U8Y3o- z|E>;~;B_+phB(+$rkIyuMdy%5bzFaL`y?oV@&2E#(-#UDM^Q>MaZhFGdG1X;mk-WGYn>pO6Afq2T=N7Tbj+6%EtT!bB7oqMBzy~uM5|PH;+Me~ zkO+`&p%utBj89~GWk1{@ne5&?w`mE=P+-wbVcjnZAu*-b{~Z{}&MFt@uhn#)Xi1ot z?YAOUuErVe5mMCs*-~NKP2kF8U4%(oZV-@y<8O= z^)$cKiy9kfV#1W{arN_=G#LDK5sUFP(~Cnnx%5dkTLMe3OFR7x&=!J(GS~4la5s@7 zY2Otd?{5HPE)jY;u5r{lFL5az7YMCSu#1QPUBN?ZFVpM2mo)<@*ocM=Zk6)RtM0Gp zFvCKq);IH4(m8A=s(#9CTn8nNbbtGJ6}5&N)c@f_E^pdLS^9Owkr>VDb&-S!D(yUw zc)zMI&qlgTiaFAH!dy%*lQtj%&3KSh~kM-8NpTt2X=B{7l4t56OhjfzmNZGXIdypBR7gl|l|@4v>) zeTib88T_aHVf$n6rY|pkTF&PYmz{3(Hs?6W97hqiBNQ7uo&syf5`#~fSil$y+}Vox z)+H-&V`GVl?9=0=JdfYN<$Zvehmsat9a<*t&1j8RKUz#DS_sD%ruSJPw8AK(Co7(0 zwU>o589C1VQaQi<#Ku3J@c7~rCLb-Fh-@*ospV4jXMQ-eO{7+gZPY%|q#9Lv-q&NIFWT7H`}F6g3un0V$Gf>x?&cI;W>-*lLp}R<+S*Oy+dj$2P7! zS)~-jL*k`8d|en)AKd+F zrT$piv7A&Q{+d|}{x`4!z~M9fEt!f50^CmuB6n~_2$@3WQF{B{3-32;Yrv4W@@tBaEoJ^)J){Z#Hs9 zw;qlQl_qUZOP(#CCSt}wv27@0^wLE*=D<;hyQlas>a8c=(%&W~0_&b3N?<&}+`yP} z5zC2On;9U2Ovsf3Ipc!%YSb!pgY|=a9 zqr%l)%|SKx7u|p?aVU)e+0h%=FMG6>CR+TC3NsDDwkDTM8tb|)XyjuGD{Nro)ahbIv>EKSq#H<-UaWIp%47E> z5A!O}@z&P!8N00xdh_T)kODe_Wta2GkJ^{b447IUFX8;E2*up>)@u-F0$RD~n?4(jz{q%=Av`Xoc+vTHn%rJA?bFd~eSdqgH!EHGO7gZg z>)lgw@n3(&n;K;cl3guUc-Y);-n|lAZ(!|t zSE4_`h+1CW;&h-QRhhD}>ak&IFDLbcW1H;zRQz|H3+tYeagPzJ9;xwVkk0}1xTrYl z#$*3)iofriXsw5fibp&r0fli<*0=WoXa~+4&nQ;I2 z(5J!|k}T!9c#I~hI?Y?(MJ4b?!w3;-FA(>aCETwL6ua7Q^)dfhSjdtCV0rVxmI|Pi z4vTy8#Gf#p7?U~+8t+h(i-`AJqK+*}hutvSHGh^W&p|KVN) zjL#r}KIr>-ot=V0{QS)S9=g~-dhmfjCk(747DF!SQ!0s30P^gzOGLiH)!#=kaeisd zEL|^Y)%X{Te}3IYKHlD0lc!FLS8XxTRD{ckW;@3q)!Tc_pBkMHyw#fqFopt<@V)sR z!`;&R7`>A4VJknX+w&GhSCapJWlwA9JMvyz;$I`E>V?ixK%K%)e;HKR>7e*uKu%&} z7hMlxJVLY4m*T9}4i$9)Ep46JdtwExBN(1wjMAz}jX{I3>XNFWRJIm5w9C%!5fhsy zJqcilYD1I-TxaMudm1;Lu8okYfDoqr$%c>?!25_Q-Y;`lTWzlS+?`=~Me5mY?kVB0 z-^0BpCnBbr^VQO9Y5SfNj34~^eQ0Pv=%Df&;Wj4&-B-F_@q>%rQ%D1O}oa@PLJ@=;|jlH*p_z%}iDLrG`u;<*mr9VW9ce-A9~ zR4L6+;=^`dswG0!?M0H)$@wfFvNr28YnpZ$AL&#!C(*-IS>Ai8L^>^seIpY(hi}up zZO&tSuU}gdcj6Lkz;VFJz0@wR*Gqw8hVT z;nA1r3r&;7qF>vBTw8SSmrDHIwg=ig(jp(s;#pohkRECL&J5A<(q2nTKM*OZ7^_p8 zq#hMr^>D1in|*k{dO1K*Af5g@t?iS!_m<+SAO*GP8gFR~`cYOw6f*Dqv!k-IK52rq z_!hN8Sd~z5qJuB~wyp$LB^Kc(4Jgv^@WQqo@!>)mr8BQ{f7u-Wz8j*?(5CzHz_O@D zR=!y0(BDQ{zWCjavOZtl^NSYnmU1Q_O`y{~)&5y$L6rn^(|-2BQsTN7Bdg2A_929a z=>KBxEyJR0zqaoS1SJJTIt2-7K|nx45flkUWa#de?h+7@lx~#n7@C1WO1fd_2I~?<)rErgqFv!96e#I#QpZQE)w>Y3gW()PVqBw`Wo|h z-a>1oS9X0}%teXaHTxO%vpYkxDqS!Gv5(c8RdcDld!M_f*D3kUYnBviYQ8;wZ;Otc z4G-|-Ql@@J!DWUvd}_%QD!E9shxO7nZ8<1R5rgnGBVilzH8S4#PAQh}J;Gc-@3Mp& znZQVBSZaZwCSVXS2nKF-vNP&93zT_lxUp$5yGtXt{dddrd4$(tbe;>=-xd=xvm2CF z@AB$hzI687R=8Xv4vPRI?_r0Z7$uGp`iekUd@?`mzY`Cr*iXktUB@UC8c|E}>i6C;PIP{B_amQNocAeC!LH9iew6 zyxDDZ>*5R|`|#e5L;h!fgnB_NEG+EdvrsA|DUaee^z5C0^rzgim3SSeDLPWL-$`QE z%}HW)pi~BER&Zp5^{ejHG_Muem0GgYN1;hi4tjA7Uf4Lfr8x$<4=?%GE^$qXWLz@5 zbMOuG#YwS&3uJc*-IL3lEhM0H3AU;7vYIpM?)5lyNVmEWfVL=1o&OfU-Wyi#!`;QP zX*^CXA(7TXST#1(7x08hn4)KgD}qN=#u7XWl2sbclR3}l1=F6x%526>Gm>vcaVtI+ zd4F(k+2<16j(iKB$bN=#&_Ks$%M}yITAO&om>IgoOZEL}~rjat4+N52;Cf`o4*|onfFUlA)eE5;rC}XMJ#tht63Q~@Z%Ou_> zdEX8O$KTGVd2%;OP+E@sOhFLY5X68MR9kgVLcI8?iOGi%bNK>Wp4%Ap=*)jsjGrJ2 za(nrruOHS(&aH#S781J)=L|~b)IKWkIN2u~y4aFkUh`9kv}r`n#_imNoC=tvdgt3{ z{~%URupZg8>Gp!rZd^30wiMYoGQJ2Hd=W4qCg+>tG}Aj8}%(T57VQP82aU`Rk8RSA5w zT7HQce`Rece6JT*%)7Lo&7gmntRN+dWWk=Lv&Xa~00| z;nuKDfhnu?6ebldw}xjfyBTm-;r;e5@h#{3O(yxPnl7J9^k2r{Cn%nvyUB^|@A1Js zXpN7*tX`E)Rr3U6D}jAzO?p(U>EDQmSZ{JBMlo^0JZ(%x19>UphS&PCAZ%xV~te5t@4$;=RW8?R4_H_msb}c^W59Ovk7;58#aC8GGUQ6lhsX{ z)Cp!+t#tSQ>b8H|$m)H)pHrTAbpjvo#Oo38rGgB*sBR_2P=@OJ&TmCH=CP%GrlxvHA;8eMtP> zLoU1d8O9q~37CiYN(k|auREbrE_+g1=LR<4livAtye0I26S;V zb5cjOgl%R!XT&C6r=c}$e`c%iF0Q?lc6C0w13k3X%If?TDy7o}-|u3@c5|~&z$CTC@J;)fu5a4l;}oVOvgy=?Ltt!m;|Rw z5;5`Wl4g>N2@N7?hq znd0|SV-B&9GPrx8?!P-;E6IbEO;n00H;pUx+yaW%QTfd7fa*j{#`YuqTL2j$dOL|593}+yHWhOpljX_#XgYth@G{D7FzdRwjx2Rm6VvVnmmIj zJHe8fCg}&*@m|uV8?H58JcO1^isC-6$8ZyZppP?WBzTM-5{_O)M)F&fmDQj3+1ssp zf1WBT*lZW+ynXdluitx{*zFoSv5yeHGhVF6~f19|?q%giz>J zFRFf9GZa2S=QkgTxS)k8g*`hd&D62a>hh_gblYWgeqDWaB3JoUL(K+L8SxWqucOQP%vv2Z7ZpN&nTo%4IqF!+(~e z0W8M}q%cTA6Z7a9?a90q^rq>;Fr^ih^vz!N#VjDoTRup}aSBESrqXhq3z}B>K}dxc zq79bOHfcU(vnM@gk3HG2uUe~`z)PyK;OuX__BYKS#f#zOv1^q*IK0&Hz!L3y)+wmf z;U5I9dUb?ob5$0-?xO1%fM}{PZGzCPZj|c$IS)EWb!Q1HSK`B|EM;8PJ+}p>Rii>a z+wFPw%%;!Bj>PT*vs$m7AwQKsbGg>sU3ZgbMQ&&)Z&V_Eg4}F%gbjZ7)riA1Qo7ez0atGsbjXt=({ zJyFqqq=Ls`I+~qz%yVQb^t+WYOMVLSh%UXVx-V|Cuox@VWdZGiyp;_I%TakHX61w$3s!*1y{&zmuODB-m-M)kZ(^h-K7$4;D%wYMsRrW08%p zo*PG@oqr}1*VpboR;qW?x}j_K-y`!YW5h0BLMWv*tK@EjS}Ha2yJ`1(FtGb=BAVxQ zaX;59I5IlfqLMXBD>0j@YR*=})plgqt|h`3DpNrENIZ6oYukCk^OdXX5g+e>IkH?~ zMtJ>G5mZhiXm~Y$GNlY+EHCh+S{*TDy~Dq~ZJ*nk$ieYDCh%vl@9e>m(M1@T;$Z$; zjvACDKX%f;-sNyfBTE(tjoP5l8}DzaG-_Ld(HRMbrQ2>CnVF;|KX%XV`V~S{{-P5H zqyNXeLw*QIqoUca=Py5p3PcLd(me{iy7)fN;I2NWB!1b`U(6Sw9K8P3*%4moK22n&-!qy52GQwH_N;fl{y>U97i0 z7iD`KOXQJG3n*PxcvgMvw#&#}Z*DqPBm5Ge{=2^edceE+mqV5j{FwTGfDVEqUV5Fs`U>VtoWhB|M~K zI>mL1Pq_(ur124Ourn#Kyp5xTRBv0Tle`wJtm7Ib%L%{(h_{@c_>-vP9{ak>I+q|z zOVL2fN;6rrks7_OuFTqB)efES6P_-(Biqb{-s|KRH|z)x6}M!+hn>{b-flY=!oq{^ z2v9g(%&lIf+|V#w(3U#i)lm7BpB>g~$KgEZZIYbqaWpqWoi@ItT5JC9rt9hJ4x{(J zrKy3NLb)pgpQ-p?Q@+hk0C$PpKBVh_>h>(d>Fn|M6fG;{v!q#)?osBi7g?6uq@0=@ zlc6Ugox_HHZjRgTY@RA98%ybyE&OG(4x6<0B{bJw3h)bUSjScPoDlqqFWS#DbUtK9oKOy%t9sEByTJ27)b8hXB+r^!FVbiCAkzi9Z>6qw$pw| zEB8S64Ql@?nXzd8rOV4#M`O205p z0rlPLp0=oy8dF5Pb!`=CEIlza3XqIDim7buT`2;!Q#`ro}F{W}2OCNKJ#q7X|Y@ekU+c-QYxj9eadwTq-_hJW)7fsBKR z^G>e<-C0;o`d_`_KfYb}fIwojZDSYwFD@zS`jfB%c>rg6-H_tnd_(Z**knPj_x~^C z|NS2R|FV#(mYq)@ffb=M(W(0!sl0Aai8wWn$EQ` z<=Gowm291PB5}fg)NGLt`AT5oa%k6N=GN*k>ftM9f4}kZ;$53eN)L&w`Je3N|GE!- z$FZhwel`p(EvJ6V4&+*ATw&(@7n}6Y2BUZL?SN&LRQXB ze_kTRyW`TljRLD>x=DxNlCXhhW_k=lLZsUzMgrf(uJygTor;E0PG=fb!dxt0Y#ndL za+BV&!#d9~DeU$*{kz-f0CE%U{Hxw%HPh?L z4Z3|Y75%)p105a9kbSQulg?-(+U#UQ!xBL3%}02qd;29zJ+v&FDtN^QesOE9MNVf$+QhzU){jyS2Jqu+tA20oZLdbh$ zwxW)(Orm%&-q-^Srahq?MSOub^xurm%0+eXkd5Y-#Zqae1z&Vxvn0Ty`*Ca{~MdSA$%HK88NW#~~S(PR8(O`Z*svIIDxMAN2T0Fkv074NB6w z%%h1ERO8Vyh-_CVF&QR3V<7zc6EKEX0Dd0YvN5%KPiUuR=>>M21R*PnFCbUG@w<(` z#R0%2GTQDdp>|!guFrqgNPvlmQaV*@uB}Le7zYQCfZYBH`f>4U;^bW3!Wfw z*_a2aZFEj)j=&Ja4R{h)wMQhKJ$e+&+gN$QnUOHv&7$g9Famd0)6hQzTr$H4AA9-7 zI>5M8y^>nv<3u$}q8+wD6-9i(EC0{`@{q=&k9ZDFTM8dl)L;I;A36Wce?-@Tuqo?F z5b5==_l6&MCEq_~iD+Ni{`KX_!v*QV$SM8$ ztACdspkeT0z0-Sgi!k_KU!M15AhF2XeLfWa-`rsT?cHcd2wr)cp7dW|o-D3X<2)0e zBNdW(K%n!u*siwwiFW5Tm_uJsJfFY^I2}lUu@JS#p(+C7(<@8|oAFy6&L{_gV~iUh z%3xF``QP{vs*(t8cPlV6G@%f-Q#amyd_PW|urjSTu31$7rq*NaPS)w5zxCrYczy=*z=iN`4d9 z=+QEEY6VD!#53BPp$=;U&jFi-0nn1%f7LOs54G@OKuHERYdh-*^!(|h8geWf-WQR@ z**#W1kEfGF+hk@QTG%KkQBcJ@X+-gjhHAQnpkNF>Gr&T^Tl++Ix6ai`I(dze9gN`) zW6meC4;sOs>*F>vVUJMDB&({-519En zb^`)ZpOpckPevC^rVIkZOPPPHs^uSk2)8L1GHrG{i&h%XgQW6XKo-M9c3y7*LWv1MJ)w7(N(wl*L@o;fWYZQoht=cYrAdwv(cx-JF)Lriua2VJFW2mu zKYzrO6CXPR&p#6j&Ux53pLdMTg!eiAys+@pO?3ejNBh2MmNbvZH82jYIiE0**CQ0B z)98iWYY{K`V+=ss-uLOPmkY0|x}Hp!7{rY7r0Lw|BQ+k+N%N|+SW`8=C|E@RAP%8i zs)*G&<)ZZcp$;mW`UUl5_nq1Y0dIJ;eP5J>GIxhQr74-7a|@>|UjmbPZTsCt#bm8m zkCMBh=RBHqD$SK&Z0?NYDz;P}$FfgZRcke$zh+e$VJe?C|I*o;1D7>nh5M{KE9-qm z-A*2-uamplJPzOGO_9K^Gpig(*57U6y-HMT`G!KDVHae3zF;R-)45ZJnXD8+meHIm zMtE>nPv@!Mf`w)+gxt|1d9GJrisOg3Chsx}D2Fbm?Pj{p?VkSNhNdhD43~^1TBk>d z#{=fy>PrBvAy!A@SJnhk!;$-UwbvsEIevx*j`@19Us2U0+sc0Y)>A9+*%xC}w6Sk1 z8%-LHQzp#jra4WTnd!J;49{iDQ^BB$F|D_aB@8i6xF-h2K}aXA_X`R42euH>ele|{Xqt70#mW$zjQvSE#mrykX3RhOS3Y^e43+ev4<3IN zy?^sGm&e?iF2So%G8M9t@vGs9O{jw(J5H-izIp zW^$a`ssJKyrU0U9oDAkv9pQ`N595G_po>@yI`we)RfKR)d3f$6uOsR4E#S%XfWZiQ zB@Cst%0a_gqPV4)i>do}!kZ?tH1#S5?AeGd;0tul)3y}dBt?GdB%IxP;P|gUNk=VH z;GbnWd8Y}MX{b*C8#V;DFVPOUEm)xTjW0Uh$H~aOjYQ=&Ab_=O1Lz=MTp>Dp%&`#pXxB&qYoC3T|P1PQN0jS?l9}P%47r)(E+&1P23e3`RwIu2P|~WI7x|z zU%rxj-^v}=dizWE<#(GoIKa6k*T0uV#*Q#$w3~4R_HHZnqc+b_jcRMsU!XinK`j_mhpCh0($3s!|In73> za4MTpPDk>6Tlb|4?tMF}Cu-KM>n2A7I}APFF1WtJC;!?FKiy z38T19&gy9fu=R38Za{R5EAJETPEim@KlOx!7k83ACh*Nf&z@6GxGQd!ZI?_T-H`J% zCRoTgZPb9F)L3rlIWLru{IR~js@DG3QBI3rcR1BBOiI!*FX0zEN!uInE*)6A1*ZuyGOwk2@;(QJ$yqKs!Lj(zq@a2o$##7GfM9nyZFd=C;z?S4G<%HQ# z5M$lfnD1gQiI$0CMnn+6G!9&z|e?9MZ!WZ zlWF9-^pUWEry{eP!b7S=K^rbGhncj%NV}}=r%V;jA=|s3FOkbU=4u9&P`2(kZfE0( zMmLU)c47_d+N>+I0PQ(|yV0WH(e+t7o~0?D~yE7 zs9-MEjwjUlzk|7tiVw7k^Zg;hNjEvv*-NHZ}#(){K6|UCMulB?U zjJPH&P`NdsAIzBvuuS@u57~A>87|syjvSV+*A@W<&|RV3`on40St8HbJnx*_fq?$5 zboc4}KV@!-W$Ybhk^}(Gn6YMm&txFn?|8x#KE{ln18%9PyI$ayT0WGG(sK{+BauF) zu5hC~ERJ-VHfQRqo<8DlC8Ho_Vtm)}XLskIc6X#uvcb237kzn_yb(5M>W@b#K&fgW zI6jmvh3gL4oo(0_rtkAa0J6%K@O||ij6e_zr;2s+Pp!^V0aT{?ZgKMybY%>=#sXd< zug4pr%D#s^UzBRsWJoizmlRbv;J!%^&?ELTNwh=&{f zBJPN}n`JXVBF13S+?Aosasrxp0BIZaSH>>y5-4dnQI9#6|GfRu2_Ow2GTngLvd(dG zrB2{U48Unc&})EmM0o?4TbveIe{*XM1s2eu?t^RF5T|Bs?&^DxU9+s z9kwmvuCTMMV9od7jcZL~PU_AJOF&Mee629Jih0hh6n7LIp4D*EA}0#|Rvyw>H9O5h zH3lO;`2qxII}i;;&5TL8G*!X$l$C&EJ-4CbV`P{;=b+A}-)XB=^-QdT*7mHJ3n!o% zslNy>@c8_q#0uG}wmx+qChHc}&keo{wV)gA`r}k0wo_(_e=7yRNLl?m%7_#OcQ!1*9q^tl?lcdR?7cC+JtkYcSS?Ec=N>qgZOYh$yFg=ZH+W)7IKu`F zoEHKq^s;w9LcbWR%l9w-efce(*77nKfVh7_g35Q8m0H|PNq~>brci@ z-R>vbbqir6ypL7j0cCr3@d?3AeatRpWh6K;P+_Lz?pms>W%XS6DcH+F7SlDheYsZ! zt0Q2@|8~-<+!O0*Ra8RC#)-}%3$r%hwPcaMj18B#zk+QfgQVtOekhJs+>NX)Q#QhV z=k(#^n&mX>!S^C8b3qg2-RN*+R~Zd@0xJ0$KxEcS0yN4tsieh7?n26Tn1xCxF@=GUT%3Xv@SfGg!vSIOcXfty2x=-et#-d_IrGsIviO>$y8eoz-VQSZrQa#A zf~@&5Y_f~&av%@7xtY!a>ufMB7Jnz%0Dsa&pCBqdO^;Ekw0LwI14@Ji^MJyWiLaNQ2<>rnkAaa$i61x+cS(Bu2tV(%vh@E(KOVzv061G#0yWHlymc}0F43>lvic1$HSYhSa?j{<1&8y)RgALd zgP1rCf66ZhP=09~Nmd%(d$%SO^C%<0zl3hrEihDIvO{D&%w1eg6`;XIbY201m7xl` zqp>O)-*KQ_ z-4an%lxwW$f3k)g&y*paGtX0J9f_$>9u(QFAFjSOPO>sqkv{D;MEAXqp52vW14l|% zv|;a-&NxlEvM7aZKHj1^dLHHsavtJ-F!Uou@pNp%ZkFaP{}6ya&!vM$xG%CDn+?a6 z84RR5abGH?zKe~%)O&Fo=l*=1COv(UO6EUCucUUIc(q=j7pRm}iL0KTyiAX21G!&# zq5X4YJUx?bYX&KtF4`}A`gM|W9Vo3E8hse(o&VWd z&}T-8CV#h<91@hY>F7(ptWTsW|RJ1VrIDr6&=sfIDcI3`T>-CVSTT`4A%>P zN$b~B^fpq}QeEFQGKSgb8ozpSJo$&fbiG}*q&?BKZ;^*2d;F-buV{D%}mVxdWRz6xq_jy(?*jdo_jRO zum~cIXMeNexu}>zY57t2uS&J?BpB>%5U@$gX88BzRreN3A#&X(n~RoqM#3I8lwy~A zlDnCJZm4s_90X<_VVMS3H$88x>NiGFm2D~=NTlfiS9h)sT=!hd%2CLlEraLF820qV zQTY4SIRLVFct&E7?_lOkKKlbe$7W?7gKeM)#hGm)eUj{$qH}f5IW8;>{TSJc%WlcS z`Tr7n_DKgCb8g?IUXxu06dtZgJ2|dfD+XNZg~1Zm=!(zAaOKImYOy=SiHVLnTR!P`QdZ1RIYqrzhVUlKW&R(n>4R$zduxHjE-qWChZB`tuC2^aADp)=2OS>6jK_{+!XgcB4q@yIb(eUDsO-kbCym}?x;At;7)u!ZQtJsHp6@h)5TVKxJ@E9@a3YAdu)T7 zk}5^x$z~l{&8DS`6;3yqoM_Lkj)t8q7}Q<20j8>o4hOK7W1WnKXq=}N?>^i6Dc7kh zgpP^5CRExdytz#M`T9AJ`3UQr6NvjT8&JtBymZ_|RQAVl!_S%%m{2qqYP*gH>6~aR zQ~=*{&BhG@Tc?B#6pgSqhCQsVfA>hFxLk*XFW4iv7>0*m?}I(kF{6+zDz*5(AlYxF`Jxf#>zmakap%476JG3kB8@7_89x2!p~QdG3ng0r$W!NJ@6 zVq4Hz^A+%#FDEJRFIeGstGyEE>3ibuYSn$8tec4E0bwCe{%Q{pdbqh8jv{;FA4__j zud@u!CZM#rBpZ0`n*F8s)uzt-BJ~3&zeP9v$9k{g^Dq}bTJ>cAC+*u<3S!cqyA^O; z{eicir+)%~=Ekg;j^cE@%DVB34u71ggUmSbDr{V)MPSO35md32qIoe=WL%o2;TE;v zw@(d6-_#S6HUdNcpQvL$;m*8?TGgI5f5#=1)N-~RrJA!QeuE4z3Em2$5$OV@C6#)j zd|(N-R;^tY9*yvaV)(haPTeuCSy4xekAx;hq$q0&Kx?DGMh5o)BX>*V6&*Pf)se6H zUhp;0dvuO|i;we1Ti^VmvKhZ|&}`I3?t0&_*NNRix4k#8!wmt(U1pg^1>Ay9_QiSt z2B#7x^qX9e(@i_YVMvA-U@!**WoMs@>+g9bOhS;WM`=kkTE#QTgfcSF64K zh(HBswKb{!kXeu!tTcFcYMJ3q1A6*F(;d2+1XY`c<7J>x*yY&M{P5^e4ml>`VUO6f z54BV(M|J*q4ZmE~x4?QSOp+hSCqCKlgSLaBtlUcPLy3$!_NYdlmUwCZ6Ms*znsnVn zSQ_-IdKJKNER!){68={7IJ!ZO>N85N%NjtuZXimi`|Pr$`%KtZUz^IFQ}96Xu_H(k zlKN+JL#*V|l=MEA?7AG!mG_HT>{45OkqY0Q{~nqKZueX1BqOt47gnyIteI9mKU!y0 zSMf*lwZuf#Y&2d~$dTjp+|y`THwP`H75(CM-K#THA3^Mb;p7oYgI}3--#yMMcix@- z;mLmwa+dKbJQh&wE{5`mDX+l+yG=@bEK9NbT~H|e*}<0zt0{y?k}bwpws$4kqG}dJ zm0NH66kZ%<$wLvhQ&#$Rr)%v~6facR->^_s;9Bge+|>1%<)r*BAWyXvNy+hk>2W-& z!VBcweJ8C5bI;$5gaJNPAF{NV+yd?UgxaqS5F*@Z`yR~*h|7_M_v-VaUj<93>X{nk zu|q(7R|7En5GujGqLA&(B83#;6S~DdhrKbw@ur%1E-SQQR3dLiN0%w~3?%XkYWLnF zK_VZ0BgPz<_2@gNuBKfl6TFoG>3~Vsd9*&fJ#k)E z>@|H2`U%}F1mYaxmwSFTj#dji30u7bdoQiB~ zc?L|Z80;a&#AEKe4JOqB`JrOeZfid#EF+y#c#0pGG3x4NqAO}qwXg}=wLNx4E%7E; z;$!(VCFbU_aGc2sGsa1#hH4^GlkjzzP7z1`%;4q~0?;GZIU<4P*X@ILYuXHGV$wg4 zU~Iz+CQ+}nSPI?VPW+1Nbz|;svs^SuaRCmLC#e@qv3Wx!kfUF-6atf`j6y)uN$&jQ z-GZY-r@G#V@}4($WiVi zzoHd$+&bV*y~X%+EiR?+yH1!{fPj_M2Uymr_AQ5$M@reRPj_ZkhTn?g7ai0P3pQdc zvz6*Lp$lD~Ev|k0;gpzAoV|#fjw$}B zW|FdU0rPIBagGMN1NI$HQH->;{TG%$bHuKFs4n`iFdBAHgy?*9idhF`H3UUb9}L%v z!fzrNk1C2XPt{hjqma{_vI|OcB*K!UPs7Qe++A!7UI7|?IlBYd0orTTu?QXxEJkYI z+~QEZ7^>Rk$s<%r&E=k9hv4^ANWjve;;aJz%&pUUP|p<0#q%^-&u4mG01l-r8EnTN z&VqQpsqG?ucgT&VPV^RfEl#|LN|xDU8ZQAPe!;AgfUQw^*t*^0q$i~c&oJ}b_|!7l ztkKWGvwLj<4Ue%mrGGk{tv?f-{wS%llJ2kx#q-U@tiXJ+`<;{MHoKo&9JdZ!^RCDi zPlBj>m2=fv+-c|M4_X3dm`X*4Ia(7IQi@g+9~wUoLZtlkJb?gy0CH(2+*DVrm(1Tu zQoy!;ogiS9NR=a21D;p9Xhr%Vv?7*X|Io8gF?%b9%)H7nu2O6~TMUMFbHNNXXAH+F z7)=5q@A0UilJ}7x68iYKAGHMVGgXtH-7S0z_IZn&0;xG-zvZy1hw@3nPAcQ`YI9;n^qkmbho?Qnl%G)3yui2b}z4+ zPL{Nee(@X2MgkXbbYy{&6n^aRLgT36Xh^o8r{76>WGupRxXlfT+qykTZd^+=idvfM zb{!#Wv-4zL8n{4xdQqVHEseL&_WiGD)#vy{b`gK7Dh|h!6t6D>0IFsZse@s6j|W-& zhFD+tQZ9khxFN^TEI40Nj#Th)ksB4`b~98n-@&321Z`52NQ+q2O>u8o>mt5@%-t5V zUHoE;Pv~skufU;}L^IqBcNwVQXo&Wl^+vm6_E=Yb`Y8Y#|Erkz0&+dK2|`D){A3p3I&24YSnUupgWU7C322jTJ)`tYLXt_|y!mN}3UluR(S!Io zemYL@@HEARg4$|g>kaRFX7tI)mnGT_9<_sCnE-c5Ng6;;}S zkHUmf&We*h*TL5rOqcFXHe4#JXc<)}X3-^pM5g-W(A}z6h22CD^jod3 zRK!mP2nPN7cPpFj6lo^*6CU$FOnU;AKS8&x7x=7Z{qj$2yLzm$W05G9BNv#~dp+2d zsgA7>h%Yeylgk@A5#~tvnpyWiE9RGp@TH;aK_9zu1B;Dt@_ufY6pNUbEO^+3t}QF( z&U*`03D`*k@^_MFnQ0stKxR!YIle$=+e)OYE;;I8{%q2!9rVGBfIa2l=PiPco2$8- zTD2NbmP<8u)h~W!pLPwQe`ksEVguDG6Cj-7nbQC&e=wr-7Ca1O$!Jm1P<4e_cI}&J zj#Y)TQ5aWfL{}7%ACs$uufAUd!sf{>HWM1Fpd-+pjizxLZ%Sz6M*oJr*7Dv;W<-Sg z<4-gvGKyJOWC$5$73AL4SmQMl3I!-tLZWs$X~8|=Ff1>1GfyY(Re!be8(?P5N^-nC zKt?lW-Mb2U9Vb?sawsj!Z|;ET?}qEQeW8Fu465aLIt`Uru%al-AeA@Tw%*GdF5BBh ztdou2-X_rRI$iCb&c?}oKVjl{78A+&RH3gv^BXkTQ+(|J76ziQ0z#0!m zg(#id5DC|Q_c zQmVl#_xo;+qb8Na#&m7@mBLoML71(;0vTGGPp7jo<+t@c&jf5af34j|gdTlSilCdp z5`w?TsC)H|aQ!1V16T^Dr*3Fnm%qJU;#%z}1X-p^`i|o z6(kD+%^g@@pHS+?m^tZzY zoDw@4kepQ}Cy@EmVNOAHm?K_1T)0N(#1|V&GKyV!++MCKm%@;J%1< z;T^Ocq(LRay+({!v#%&U+8WlIPrj1Mf(WDrE#>jmKq%u0jMV@gKu)zTsLeq_@yk)| z!7Ms=<0aHxbfi)IsCI6=%s`3xJTpq+pvZfxtUX&(h3uVJ_DWuE0HXcZPMLwHgqor& zH^jKbrTClGB-jq8t@wh|5g)tmtWty?Hw)-ha%eM=bBZTB9*vn#|L79VGgd8nbYA6o zzMfN1XnsBkVR~U~c)wJ=*y%KB6sG*^I774PPvatoPrbp-u{pnDDRI-)spCOl=2T`E zl(po6GDAPlQmEkU$3v%`+EruKq_!}LiC6Ish z_m%KM*LybB!DPIRmIqW=JD{SR3upkjYZ}-cz~hg4%3^!HhgEi48WD!g`92#-7Rv9B z<&)r5NCxnbUOdj4J4V5$TyDon%8yl&|1}iHThxBzT2-V49)avB;+V>wKPzQ<5_{rR z{c+)iyK=z~hT7Fs_bu&xTF)cJK_g&}l%ohyF4HLt$8F2Ui?yR61y5MzIV*?d{#~E8 z{L}k#ufXc?SZ9Ot-@fSkP!~ZSzS=xkO|k+h5uIF!EIoYv1v7{zkE{@5oarR%^;)hL z)suKwt@08!mc@|k!QiVl3=ntxk2M+7Ff#j*F`B;_5&~|7CPB7DIgYpvX}E{vqW(Io#m| z?AGJSc&plPZx}XE&BYJy=?A}){={#ya5KT9*`*otg;DEwa^eTI^Ph01dn*sB;feDG<`d+stT29V0c1E$VeKuqw+e3QTx2OO zNKSc_4e_^EFvln`pq{iDIqE%_sdGH+WjAbgIGf0n*Kjz5kY^2P2$&ZtaiA4tu3KTM zH1Z4wrn?O<;eoy}ndjLO&3k2X;M@sy=11AZY=mcCmSVoLX=`wpD#P=AfeE65X?6PM zm78sfBjOuM4|>yGUHgl~csz<({y?hnVh487r#$zf-@j^^hWZId5%~@ts!2FsU60w! z@s?G#>q7N`^91%st-ijXO2UsAtRmFdUPP>^KBJ;FC^bR(-bc`gR!)es{snAYQa3_= zRSFv)$g?~gKAcI>3;XhbTsUb%%W0xbMbF`2xl{R*IVwnLSCL3pGQ9Yht5IDn!v57F ziHd>|t5s7M7}k!m_QJ)lOEvt4S99||Kh>0cpB0J(3GmfqynO99j?=cM+O>`HRV-{a zsq6M#4rd7`@v&r}Ba;nN2TiLL(1NMXsbt6--pK76c$0N=ecHHfG|86*fmnM>h zP8~{zkUtMl)cADs9@`&T%%(=2-Qv6$BH56mA04c4{eRZw;ppTGAE(Vt8n@G(?Gz5v zX|O8HTxae3pz$Y1$Mwl3Ae^Tt4y!vfYhI6FK?>);lM| zj8FHfWOHV9vUB{_@i^tE=zVNUy14k|jGN5YgYfI z-)>7iT7WD5{^xF&eav*=y5kc3^tDYGEDionP*<`5C>-yK4{O6f;n=DnbPklJdsZ?Z z?%0GAPj&<;sf}RUgWDzs<@YGk%_kWrC+v{=9^tG!(Q3eO`E>64|v|2g%wR3QmZqs2smKJLID>c4~uq!{X3 zKoO9b_*Ou^$@nf2{c`4g;zj6(xNM2>e*TJ2lRpA!H(CZ(AEP{hy|Mz2K_hf-%aGg1 zk23^_iLevCXko{VvnqTA0IvTfSOEIfX6!$a5My-Td-qXBI3u7_-oJ9ZZ8ZUtjpOPw z?3&Z%Hk>Sdng3wR;&r}Dybc^ayBohm^Km#(w;S9)?c3cS_?kX@+M_PmI>nw^J>+n4 zvK@8Lbho(gJ016$NC_x>gv!D9XYtjXRbIKjC(a&7Z-b)1GX#$cU;5w}Mr( zUgGYy)4KP>WdrR&q@o$z|8W1DgfkE%mO(*qR8$5!QRmqS65l^yfF>FcNM=s=a*cZv z?G}SrvU@&p&DnKxu7KCtkJX~;4^ZAZNxgSA)_qC0xPe@v84~??P~83$9SOP-0e5H| zBGy7-&g8-lxzzSOk7w0{!jo{^gE^}Rcl{=Sp3P8Du`a1&W2894dPX>IIIhMN8Plas zKL(Z|vyk8pZ(Py(y6(MN%IwiIpqH{! zkvi8Wfn0mN7wn^kM%>T;ssF)~JLlA*NjMpxfwgDk5=i5;C%@^hO;_&+c{so&3na}C zyOwh`*G1>RhMdqZs zbtP1IEAgM{E>bQA#|Ne3zO=P^qGkQ`OPd(jDvl>Fjc)$dh}OGio8R4ZvRiIbS<(Xo zZh?)X4L8YTqA9e=EObQe_cqCJLOVCup0_&zzMjWsVdM2Vbh`?SKS{uwA|oOuVTrIo)$F!` zI23oWK5E_d;R%u;1=6${eHjk=ZA^rwy}TK)BKO^V5|tPE7dTC4(+_ zQqMjVn3vxG2c9SP*lNfug!5?ytM!HMLUX`LTpLUB$4^_p;H9kIJu0rr zUAr6^VkQcR6@7Lujc9f@C-Et$`@VzkprOR z!r%3Y-*{j{G*5~mR{T3lQRWpLm4oxFNg0-uhk5FhD56jnkn)xLxH;pF(0oztZb8~# z?4XN`+G@OEBz6hvptWF56-yuS#s|t6gLg>8N{80felQWY{1A@LoMwTM7Ud0!S*Mbh z=N{q{Jgx_ZupCCKi2(|Pssa84XyGTz%-H~gmgjJq*LfaF|5m1S8HlMk_~aC{K?6>0 z7B}%GABkGrEzq=rbC??qW`v6g8)Xdw^B{N_cyo0sTdmyq1i6`HCLdq_83R-Zp!FR` z#T-5sM9k6^K&H(KY@**UPaAJYJRxb8^Vb3iIEP)&CFY1UJ@(tIA;9d)BD7sGJZ)x@ z2~3ZDxgKEWLV>93JV~(+mqZ|n{C2DV1PNBcU$A`L_}TT+7D2_`M^gu{4~?LjM|F?2 z;po?scUF?#+Qs_$KuERf9P(fRVk6jM^|oZc9|Nfkl-J5gsA^Om|CUKlyq+^b@|JWX zIC~F3d+>`FX1k9BHRcgHqH78an2QE|0I_xXny8(=6p`+N5_saVe2o`Vc8o&Gb9RVD z8~?U(u|~D^P`#@?P=joiL2(hKTm9Z@vI6^x$w~0<^TfKq3=@J|Fq+JmaP3!-7gbi6rMoyHwT!6yxpyoKV`5YvK8)Kda{W;coO&u0t< z=?oZhv8ek5JnKJIqP(pl)0vE%V#oiH5XmzZ2`DLNpdN@C`rr$%6Hcb8@KHgj`z z-NH%MSf@8vBgAOX|2R-C-;hr{)Ms=fZPP|MtPyi)})$ zoPqz!m;^4kgAz5h!2^@YOZt9Wn1yeecf63m_XeHgVYVPu7uyE9zx8!PyEc4jsXg4; z2w00>>N}3cc2p-rugTv6V~xmjWh6g^Q%uKKo+Xq@b9!}$&f}f`{b-wLW|YC$g<23I z*biiy|Ha;02F2AdTccqz1Pcx!xVt5|OA_3I4ek~^xDOCQaJS%Y!QCae1b6q~?*8q0 z-g8dXtvdHsJ%4X~f0JU*K=1Be-MxCXU`B{U76TdX7F36&PUnl@3zre>7FvC}G@xsD zdVmt-1^~R8kxXD%{|4*zLVSkLl;t;%$H}(AaKWqD>IOi%W>8rX#T%IUUe8u&k`)y+ zCl6TRjBpCwZpM5XO5$AdN*FwUTNJJJp$u;KRncj{_sa^a;k9GnHAnKA!I+NRuVF5B zWuzxM7KBX>35$yH?jG25OgoUZMyLhz?RgPWonX|B9I4QCf@qF)3{y?q86A|OaVtb} ztWRvdo~2=V1E?{*MN^bNJp_2S!=~tD*swfEuIuQ{Ls@nv@*|qk$6L7HCk29Dj%=q4 zP!D|Q)T=NQ>)2CdGfG9;?<8^2+u=3hWQrZV=7ogJB6*c z#4c8wYvKp9!g{@bhmCJ&_WtYj2=O*nkw*^x?ZsM3q2v`Etr+r09JS*K&6Ig>`oTOsCz%;A3zw2 zMAVUd+3-eL-(QAok_fs7)*7cCW-wbd15*xw3=ojB*kwFRPiZ15hKQpCOgU7#$Uo31 z11VV^uG0#(h~;by+i;_15xG@g=a1Y}YxcQ{25Mp3;5yDiS-LwbMJ=)^vsiajN$xM# z-9WdsqS|tXWyzw|1Y)9ipBSVXHN~Bd1Rl^38Oa5Jw{AG!UO{nH;dUSAjJ8oatA=e? zFd`0}e&ZtDJ2xrrnQQak@iwRjmer<{8@nLuVK#dU(BWP$y z1vaf<17YWp1PCXqOY0Rww$3;^S!9_A`SGvHpsL6dxk-pJI^ zAs>-K#GTucB|&rcC*b8T|Kh?nji-4Akk^B!8o!9njk0hF!GO6dPj7nW;ZHh0mE7jvIkHEMgt6@6(KVK#z1oKX=%x7VC@($ zd4&|7F(LF+lR~K6k5mTvzX=fl-3dsvp5d7G(M>?eT^yPP!2wMERUVNHM{sfc3$Gwl z`TI9d>;Pd**53`WOMz_4Aq>X5II`DGx89L?Rw3=uEKyh>jo)V<04Uj=N7Ee1+7>2? zB^>TR4^yT$oLCw92iUVp9z0p=+yXc*QC6jbb6tnU*soc64vTwjO%Q*2LNz33!w9+K z0T3$C|*t+(r zuFAa#*DJuJ<_m$gvBJIwOA)TyE)b^2xBh%0qx+|s84>i4nezcfC-t4OI&CHnrdn2F zU(V|HGjuUUdel21FNlj-cDKG`%LlT#uYY*q8wuacyt<1g0d!cp51|AvD1pEmd<$o zVJh>p`YfB8FFeYdK7f>dP+0&_wXZyky=LO9ZQ~ny)cWq+pMRN8{N9URJ|ZrUJelPp zf}1%0B^y8*{!KXc!vLkXq-?iN#_T0+j5qX+3+H$Mu=lA_AR@$!uq~52ly(g4mEGD5 zI|D1pA4K%hteQ`IuG(;lHU$@qXWeLnQH|*T*?_$~?Q(04-;6Eu!URVm<$x$xT2qUX05fdEm{p0c=keK>|OFdXE61uv!%Mt*Z+29xg4Nc z4*34tJzm7v24F>c&=Vm>@ZW9^1AP~ut?~c5JNv)?he#r9|M!o}ySb15cK;Xn_1^#A zU+@3<7x@3Z_1cpZw~t4ryz3R09gg3*S^gnB;-Cb?B-}w&ro;BwkMsLw10yKg4HKcj z>irtXs5i{%Wy9Tk5g=@a3gZdUnfX>6M>tfu9A2soilcJd12Q8od@n#7ZFloJ2D2wk zn_-G&|NM0O3dp-ydspeY!y8x8=rI#ay$n@0_1-6MH89OLa?8TuS?2 zJDU~w$BMA5{Jx!d&Tgj6on%AMzYZ&bx@2c8yDb_5=MQNx8ON5Yw-PUWv9NC*VpvW3 z!dzwxm!DU?HtUZh;?&V3SwN?6>hG`ajSreg=YyI7xnc8(TBl}g**y$IDg7Jb=eX-- z63YOt)p_W6>Boi04>WJV1A40s`QihB5at)Ni(*|tlYE`qsi}R-wVv3nbB)!)}QYEKo1kH2K3o-vzYq?0L2R0azMy%*9o91Z^@_f z_vNbTq%8d?_J08~AwQYSrrgZk)~h%~yKdcR9vY}u=_q}!49u(VtdVH`wFX59( zAAEwcsFO5M;sI}FFla>U_%y9%c^lV~&cc68ZlN~1l;2jZxEY%jg!rAT{Go2kh7M5F zUQclb)NBC038sR;@vBLC>Mb*|b zKknJp{4l{!wB@d2Wk+4^`%`}f>|l@L_XYYQ3yc7;eG%Gl5Wncg6 zcUlq1bDAnpihr#i@dwB}lg;nn|L=+E|3ex=45r+zcpF6YZ&UI+fI6jk-t&+A%)k0n z991B>ifz%_|J$70-V9{(Awukz|5|-S1(-Cx`M-wz-~Zt6Sj+Qn=>mVpwgYnNfKJga zo7*@}BA}tK(Y&K(-e_kumvc!oUa5*=ZvF^(ALZmuzQTQfxGkWlEDa34KcX%m9oz@S z7+(U44(#-V+?EVZ+yxu*2oibfZh8h9#ssjYC%`7w9Iuz6)G%9eVk;T82O9#*AiQ<% zmllb{U0IkfhsR@z&|t;Q=Gs+X&Ci}lPNo6nO0T2jwNBHLxJIolNS zGp(hkAfo_kU7QoK#YAb%WNC}d@>-R_E#h}O0kd&+K-Vyl-z<$ur&5;7qB@Ghe4cKC zrgbZCaN01HwIRMl2#00>A~G0YckInHkv|+c$G+Wnw$&!%hq|w|EoU_6b*y8Y{ezI8 z-y?IoFzOQ~9SFGd>omk2iTmvn85Z1dqU6sYFpPTjxbV%PWmQMy1!4{^Y;T5grj59* zDs9c1e~tg%EnCRUYNasMvHXeCwxVLk;l|NVh+n(WQKM1SUbFW1E2muZDjNK{_m80* zMj-V}DloDel-KsiQ{!Z?qxq}>;2>*^OlX5 zsLn^G4866Jl6dPz9@+ul!9ybZAeg&-jF%3jhG|WDMNIcFM9e!`G4dZ&msH%5`CDRr zphw2BPmi;Po(qn2v_^zzHT*0|FHI;jV6iK=5ddUw2qGRMl^I}Ly68*yKNq3rOIMwV zDunwoPco(TpV4^F+Et2-IiCTB#b6QTN~Br3r>M~Vri@4OwXnBxl6stSC&{wAb!dv4 zJ*!?h0ngW0THtJ`u=$nY*Jc8|ivurz-udaSZVs=5_D;?Tcy3w zN?V5K?Za_vdlZWxLn{6H&_v|#P6GMsV}!2E{L8?V1kd}S>$ClJ>Z4y~X)eDm8Xf@s zz9mT<6U&O>1kZtKM+g(6m#^^Cjq*{;Ih@-=X1>W&b9#)a!=w$D5EL{?}+UeuXh4A4?D37VyB@T zCYwxjNPx7Z9$JgZPl4OMz~*y-z0QQg9(^TvjBdB1u;guC-LL_|k;_)rJxh;9!D!3* z34hhT%KDt_NP%)|SC9b_y1}zxF6qRg*MgUQZ^?CFI6?r6mJN-cZJYdj_0W;ukJT$# z$_$Xfl9@@iMxWc8D=%$eU)1U##R>VnW5;TpBTOKX?|wkO5V~Zy>#sJicVsGkH*)SB zg)-8hLgH~k-_&8kf-7*jAB!%0(^X1!cgL9hsZE_jkg2Mbp*3rzCNCQkxK5Un`>b2R z(1F&r(Kha_PdaCEO&r!M2S&`nLcC9zK&Vf+ZqTIB)VnhyczSS%d)@RXgn!y~Zr|x$ z8ZUUY%sD}mg|3Nk(ITGaelZuizn9R@h&aDH>nws}No0>Bfq#UHrRT4V z@FB$;dtCSR<2T6ovI!;K0(w~tS}3Px3t!-?b3>|%BOF?9tUTufFPq)XF7D6PY|{Iz zrvvzxt`^u2@A}{Ta7&{>Ne3EAiWxs=9(dd%X~1kqUx$sc40%DPm~@-AD8rXG2NFt3 zX@GRMWx;&NNGrvoyB@i>CMj7SVfE#hNAdRIiuwrErENdz-jrOOJKf;8ARZgw;IXeU zBo)E30&TLFiZ-?Q50pb5^>o7yw5b9kc2o@4b&>b2<&)z&3lrU#0o9gw>(SxBYZ)S+ zF}m@4az)u|xJg#m^Jpp-!=2QruPr=YZ5axw?v(}>YH{D$rN_}9eHdYnxwcCjPa8V( z1oi|voTuq0N09RT`n2@}ut za1#8S20dsf9;kKfU^n}NHDO};ek8##u4MO*21Eev@x{*1kMhIn8k5N#!jLEO?NsNe zB`4s)d8hA2CS>g@h0q|1r?|v9QwTR@e$%{l4(E z^l=qyKk%p`fvY9-L*!Q|z!{ZPYvX3!N7U#agx&1pIhgG1)4<41?@3_f&wUAC6SMqN zf!?TsgOv|Q_Z`tPPSzthP@?ft^H3Q+Hn$1^0^bi22-w~VVQ%L-dM0ChPBV2LSqI)z z-Q!N%huUww@-t8oa|x>hG)pE2e0=)vi)9=A)6DXYHrR6c7q)KWL*{;z?c-Yfl+i9G zM2FLS8`cZ$`-yzWcafsZTC6no>oSiyS1r9Ncf<5B7tpUZ0fhI@c6;^uto@FoyF;K( zosX`IW@jgl5i-|ly-nxMlw~a!c>-UY9G(+vtl$5L=z(f}RwG{Kuw85}By!*MUDmj| z8K)ln)HM3@p;IN1&n?NefvXDogzvQ(_ny+Y<#Ohn*$4!O4aytlZ6R;#v(Ql2y1YOG zAbj*W3!e<1vkmJ|v4in)%I(={Kze@(_u^Vp>6faLmXSSLh1Q$T12)-0B-e$DyvFt1 zt{x93*=X_E3Ia-+b>{SUxRs^(%poi~m6JLyrMP9aRK?6;Y_ULp7x31|9;e@lkI~K% znfv<{%cUQ+%oke2h>VCMc3(}H6!qx*Ij8sFewAnfj&tnWE?ojSgn33A>IYL@&(fJPBj^xNVDIj>2d`6o{$mME zA(BJ^%^?F0&DiaZh@+Pg*~ZK>6Vmo9DSC?xgzJy#+50Qezt@0jZ>|N$X>VM#nv>*^ z;xaAsIz;H+;gy$~-5}~mlhKlwsTxiiX|! zD2-URxflt&QupY<%$Lp|Cf;n+DvcdZc0n%faD<1Qv55<>7(JKmM7tMaLEAFqt6Hsm z_u9{Zjo3T5F#TWnV{Pu22<`}x2hpN!_ zwbise>}(^4{>c6Ah2`w%vvm5Pd&Bu#(e_xMLWiT)CemNynn%ASh{$HpT0czdxDa#m zJMB$JITJ4ov|X(XTFBjS=uul7=H0y9Cz85cz4Bd)fS6C6)_S}e^bI)L|53IiS7Q#$ zCp&ItKPp7+Dx1|0R(tpXvi|0(dkd4Py^`iAKl#%981CZMZ}YFC09M9eo#gY+L+*?^ zBG*|c?4pI&;JDmR7A?*r03I47lS2`dZp+$Q>QXh6^l>_Ka#B7mgcF`dHrg_vfds{^GK#j&htcWS`sDYg)j4Rc%6U z=KA+^`!E7_$I5KhF5xhBEX7+7xL>itb@y6H*bp5}QGgP)r{A#|(5z^i6ql;Tgg-tx zVZWBnED;s6OS4RJOWRm-h~wAo^U}kaxpMT#i>%H^;hr%85M$}CsTVs8CgeS(C0#5< z*yMSn*kx&HsF+z~W8MuP-sbDY3O{RKJ|CnaJ)m`y_O>4y!l79?b)-=dFa65@cdv{=w4vNOt)<1I59DG zga3GGZZ7q4*d^4FvHKd1OP_{ccT%5aQ=`#1arAme;&{YELLTJ(17q|t1s63N?@_kb zLA$cI`^%q$-e&zTtk`RlWIoKh${PY>-&oresY=L%POb@O1B^`+c}3SK{0ojJTDIsq z*H_&uRBM!b9j(Ar_U(R&P#JvNN zt~}OXb7&bu1`rhJ zHNL$Eb-G&O*=3hA7@$#J-UoVWpRWQN6u~e&D4_m{;O*oV&4ffmi zB}@aKovs~9U!L>I!i6K%dmYKdW=(i_RG-CAH|gYM^cf_etZibPthnHahzE zsfI)B#-s#I30U7InXGZwAc`1pzz?fIAa5#oZReIewNTb0ZpRk86S&BiO;wyL#pHOG zFRDOW%q3c*TI{r=+aW?Z2=30RB(MHB>OQhyA}2M^t2yTm#>V>1Kj1RP(bhK8HEp#$ zvLLJ>z#W2{4;+ArU-p)a8XcSPIZz|*nD4PLan6dGx1Nuqww?-pc&imN zr5DXx%V!Y5(NBJ|#&r3TSeQ1=4$>0Xu{j)*%;(&q%sad=Tpo8NIlK=gC?x6f6Tv{D z+D{|>^d=GsoPr^eZubGnT!icvyx6LhQAYbr^x@kD^=fZ(PaVpevEP0HIDyQXKQzx{ zu~kZR{rA!`ZN&JFV@9^9knY`NruCqAQv>OYYVwUOk0p@Vi<|cn7SsMKlav{jPxigF zsmP(@9iRQQwk3qk@ZYCzT*>B2K^PQ2&pc4~}saLdG@=U#bbW__6lv^mM*fQ!7epij$e3Or{;~zL<#q3L)gf)Mu`%aZ6b4%4!;W{qs6At ziJwmb*ztn*XvMQFa%uN+f8clE6G83a+Dr+OZJ6u+Uu$v`l~T?3RKp~SQ`0lGwhIOd z-@YPakJk({-q=WH@jW}UjGwx%BQ7@`DZ~$_5>`5?o1JU>m z+|r1~+86h!k$NlQ5xX^GXRK{!ZAYbm#2%R+rinLl%%IP2<$TZMw5)V0^iKsoCdT0M z-5z)3ZSzMzoMg7eI$o7{4*AWjIEGc^a%jbz4Tcdu67VEm?CYJKeABrbcwZH=$WRX`!&l!>GK`=Xcd}G$fe5~< zK;(kOOrCA*=p0fw;lG@a$5wGTwd>G{zMv?idwFpWf2?l;W!X4;@5(60~0@qiPTeLFHO#-4( zoh*1A{p71tml^wBzrIF&P^q3p+Rv1M-ZFzH$qh`POY_)0{>B@k6U#}W&ssMvaRGH% ztSlD-t858AN880;ja&B2v}?n+>%X$~2A>;hhpGm6F3S9{Z#czv@n%%ubhRZ zJ+Ir{Uz28B?sG9{{8A|@{vLy?&%Q+&&|0AK+%?fs_<8Mvbjne?!U`+o*D3IULt`|c}hSPbRWFl=N!*B#aSI+&0U!M7jFoAa1D-hRs5UCP;)Q;cm z?Dq>s6bL>d?WE0=v%Yowh4?w=%NoYbLahz@+;0jh#n?bQBN{ph!l3Rj&i%>H>wo^s zH1I&8Sz=Q)N5>Iwuv&T~I_Le1|7TIJEbShP{Ow4Kp!RKltWG>FoqZqM?{oo>t}Z&? zNk&(=RRR%~4cZ(ZU22+O4$=eppuQ0303v| zK(=ZkY~C^8hyyz}b{gar^xJfJOy%h0^x==+n zbHwtqx3*%ewy9qITX;%U`(=}#wKge)lLQll9>2CmXY_Z6!5~PaG0Zkk7YXGVf*#Yo)dw?MKR&S8)(s(#JINP`tDuH$Px z9VvW#Q^i`aC~k%F;kSGStr@7&k7;dn(w@O)kWa-ty*DpjJ?JvMnRUf?n7PF`rX z-Cq-zKb*SPS}ltMa+8_S%+XGY@|(Rxg%qzrWMqg3v(APtRIilQV+}7VZeDRd$FP1m z&ttAOnScyLu_H`j1rOIKlt?*s5#7V%ab{V~HLoDDr9s1QQ~tDiws;*rOQeE9 z3$e$hd$AWU5ykPM~ky>ePz_Hi&G=ejp z)4`Afp+t5C4fCGfo9?Vr|Ly+%RRrHa@YV5x0--f>?=8w^{ywX%l;`0bv2Avj_d)3w zXL>>Ybtwk2@g}FB@DeBhKqE?QZYkPyL2E*L6%^jL!CQt9?JQ$EYE`;W*i?$QPm8~$3&Mc*(Z@z*nL!@FW`>hoKsyEqyp9u=1`Tdu#={gijW^E6TfLlxyX=pH-N3a)oZQ zn~glEA>{)=Y^?mfU*Pse%h%H@hd=_ zln#fK1-tiY4jFz10+*E5Uty?|XPd%C3XOzM@0kdO4J~J5p3P_fL}NSI0{d`_tAO#bc+^_;aTW!`tHM zIRmPU{bMJ7BJQ?4g!6uBaR|7R%t{Zm6zdxN21?{$3`(K85d zYc^6eSZ}3lH}YpV*yF(v4gn;O1WYglsLaz5?&(R9KVUuJ{bp^n@^w7e&n^%zv51PM zKh=#WfnMMV_dwvWK=cwBK$?kVbYbCh-z=cMzka@tA_!rAFlST1OottrO?0I4%$5O zHoe_nO~ivIy5A{z1)_7o(S^RcfA|!4gkX?c^;lBDY&b2rq2g|G_Lkvuxya<%1J8}? z2(8reKpxa@qHrj+M4-g|yodP$C||+{nUqU8P~TqA>3AkyYvJ*~0U?Q!A%WHH8a)!0 zyG(oK)N0&K8qbdwhV*AwV&wUPeuTRBxbptIA2WjxK!3u3CS>-Ao~AaN;BnyHMt2AF zwE;%pC;k0+Yw`f&BCO`d3hA1Gu4zVBEqAxjwNNXuie9gX@lb_j`-}I#_ba9%p4NM3 z3GA^jqU^!Lyk)*wUBY$`#(Se%XofP!QAF4I+XxX5G_#Q?9VCbLV<-W|3knf3GVnwF zHbPDz#!IjOxsT8Wk~j_sl!i45L#Ri)^dAh!Fo1&WoZu|Q;7_nMQvI>V+U#AJz)a_P z$dGMgPe@h?8&)c14YCgc7!22B!jS)2U3Asg!+osTR_LphdUp{Ja*6WI=mjTsU5^~3 z2M${V6DtR;Uk4La%AU|h_>}I8Zr`gM2{aU%tsG5eZCx6A+#P@Eu8ga`9%<>}uG4g~ zc-x5GV86wYQF-KVResR_+bih2)7K{*?0Cs8oW9X&jKQ?GG@1sTX?YP?4XqqA*4@51|pk z&K!=%n>0Umr@GW-mU%zK{`C|}U4GKXdTib!BzW9A!GNy^dN5(8^7mwE7TLhssnyh+ z-Z#Elari(#Df0G(!9JVg^!pF7qFWoym-^fMjsBh|`dd~_D2Bo{gMmN-6%*MOTq)zq z0N&adAc3Y=E8-!3qRlvm0aHE?B6zqqq%QulwK`8zFRT86R(7R*eogmZDXY%mld-sM ziE=(=l*@K9&Q?(g+bk>hJ93)U7bu$bt|TpNCa;&q8~s!MoRDHpdv+cs;YGg-N2G-T zA-flHRXKd=_lBAF9=1RwVZZQ=B7h0|p`SA2-hK2@NVP;9cipTQZbV3&va4>zMzHZM zzjV?-761xWL!ESH)-Cv+v&8#fh9|K{1D+Otu+3AqVJLv-$MWN}&!_q~#S8y2{Z#ak z+*_|(hQ@-mxC;O|3$XI1f!yMbe0pEeYaH!PN6p5%RW3z$%j_Q2UXQKP2GDBZOrnEY zFgtHvssnwWGKZ-)F)f6 zP*w9rI-5-*(=?utT;1De$)^xQVdQMS8{eO7y5Hzx?W(+Y{_Qe(*63V)GHAQ?rIS?N zfFtmcA5v*nYE5)OILEfx*WG4OV$Y&U z3%dgQ>&93nCuE(;3lITaXhFFF{QV*|tFzw22G$4}&+himk<-^M(zrSI4Xkfob6;N> z58Bi+)7k$#Tjg#5oSr25QK(yMj8yRMnEU%Tf8_D(m+mxpd=?`-AijHz%JhxJ(2!GY zL_Yi;N`&!OAunok(5G($We9Z1`&bhWV5RYVvw6rn_*UQ6=aqIfJ zZ0@^pcoKhxH#SmJZStc&>Q#>e*f0#LRT(;rGh@!NeL4H9s_lliR@~@^QK4uCP!3J* zpCS}qzl>l6wXtsrCJAm({_^N4jHOi|nXk1*D7iUwp$;SBnalK7Onp>|M}T4=V!@?e zKaka6SNiy<0oHG(2(lMx7lccNS^sDRT^hE=?FtPGTmQTN;j~0q%i6EaRGi5aXRzZHdt3U(X3U1OcI;sA)=d99$Py2UHc=WF!5TP3I2c}+ z#Xao&4TG;p4* zrR0@9o(_2>St%G zY6`^!!KNxViK+($;S| zbT0v*%afU^Ywjp~-JP}dcR9^!Zm1Ss3{9EjfhNI+bmk-5O8zX}Ts!n5b6>Me?k%%x zv-<4$VuqsQMW5pS&EGJsKo@5d>Is+bl`6`#(1I!hlWz!MLyVK`^ll9D&GO`Jo`!ea z8(G@&Ox_syJ<<5RkK`CDT8+YNnQs!nv(lz!nK@J5quJ6@_g6>RNUUv{UAq7l{u+6A z<*8Koqb|&@@vtU(Y(wNSNp*L3`;I=hvhuJ?EsxF)(MO~a z*B#dHi>V%1Pv#a`%?r!N>6#?&1g%=g`c4nbEIZ0tZn1EyN}jfHAO3M(u7%zp`>jC? zTXc*;Tsn0hi@>TrJ|oL4&!`bn&_jjoNnd`!`6IC*3)U*U_y$cF zMXft|6M!W8L_0}i0pRnNLZn>+)Od6ϑ>r)1c(A{AJB4iZ*iO`s18!r%V}kpDbePd9SMvX$9@Xh zMP=yx`Hj+-6>Mm!eM9kzVN7(jCw^u^)?m)zH{ObRH|x!rR^xHzFhsr14CpnO@5diX zbNVCWQPpnZ_dm9J#@T<_3Z9a_Q|fTvx#ZT#&XM96-CXZcQKl2>(<*zaHFnGDCMQJ% z$G5|Zghr<$DXaNKM|c$69N^5ol18S__M}|;FU{Ny=1^N)h+B3;hZyph(WA%bYnGN0 z;}y@AKrZW9*)4|Hn)OvU->+clc$UIXwkx&DND?UJoa5=OdREjLb*6q>3X|9)v%iYk zz+P5zvCL<;AARD)Jp|^%G;3`{%feGXt+1$nBJ6(ri8oY!d0n|RIF38S!pT&6esiL- z@;g``=vU-^=f$W7OiTut7^p=769+0(CXW6_9QREG@joUe@56)xNzr1AG>hW{ z=}N%%skc;^sDC6FzIh|@KwgM9ohWqg`DlT3XPXqNIPO>vC<)1PtWFV`XI)Z~BKCn9 z)Jmuj%sU&ydLCLPQrlzyD7BlzY85c^%jQNP5R*GP*ecW>n8L6VJvuZ-rafYxJn>k0 z1<-<@D=ape;ScTRw{nWDtL4%G;P395&{Bb2jDz|1RVv4q_IH}Kj{bH(?;j_*mz}CD zGc-t`q9Qb_62dY2A-%^}yu(BFZzEDfP2Vc;M?%nnvSn|L;HMH{nzEkaTb;J)mdenH z9_zX4sad7YH0Yo6jRQr+&Oz_{)SD5 z-i2dji2JFg zzTZ|{0YKfMTl)gnw583hmEV8KL#*X=k)VRFWukk65F_F*y;Zoc!h+$#oeZIl%tGDa zyLDJ%rVAyYI$_lXK`*OZx7a3{ykJhsvF~z5==ei^-iN~-NyI>fRZl_M3_jr&V8eMo zo8g~ODKcn5?bjKQOGVB1GUWH*<2`SZWF1 zL~F}^G0kZ$9Bb>9z!V<(;c4R#p{hwsa2z8KL&wu-S0Y#G$|fBvmI;@yv?>F`3)gW+ z0A;E)e|k?Xq^)06|74=3!7vz6p(^1m6<4A97tPxOqnuH_YI37T_H%9M8g9&#B?Q$x zvJ^_7gbCD%?O?~n^OuX-fv-TNNEsvb;(;KMc1}3qWuU>MlTL&40+1xUd0(bDpZBiO zx~MTmooa2iF?NYJ_2cGJ&y7~#!O?+Z6OnA+@g+@9+L6cP1-0m_SCA6z z?CQp!)~fmT7;qSp1|oE#8)Ii1V@RmE$7evqGt?EI%<5QHUU4*5}nLt#f7doWf8~Rc2KC99gJ%%ll>6J z<8q`8Arc_NzyWaTMpg~xh9L(Z&tF;~cHK$w|8BIHD#VOFucIC$rVlMuuT|>z=+U^0 zEgu%iWx)Niw(e%9b}D&IyAyJ&r*#XPChQ6CC^Dw7o5>FzPj?m>0RUc>J`d;&(i?z9 z$Z(N}p7!54Svjzl-07$W{yhwUq1x}sU0Rs7kxYIpk6YXlCd_(`9~9qg$*vt)ha-Q? z3Z;ky#-e<%u6dUAr}b>l&R)I&<3}n)a4Znhpd!i06(a4}z~mxL{}Vac zYVdzzTB$rHo1cgZ2s&ih_u&L#3Vr1{*2aA4LpbE&L5O!F^*XMeofsOkDt64i@+i{3 z@OKzeiu2lWD2m|&MdTRRRetk=eIH9JpKS>*ojI`Zo%_M;`y|10O7@UbEe*|sVa$I_ zhVx)fe6rENiX~D96oCUQb@cz;?uP21_q(R=Eu2fOn+Z9?(z(R_^=AC@uCBiE-2fAjGTR%ds{S4~eF%$TJ-gXjbI~ZZZ9(N$R z+0bvuHJugWa3aSPZ8Z3qeI>PoNGMTF%<-+{^s$lj<>!zVcYyw^Aa3w=M0^7NW9@%T7H7CGio&`!#8Cw%t$@oL{*?8$@lKo+0}D6Mb$ghLq; z(1j7fsW3Z^F1)@KX-5U%J@-W<9=Mr&wHF5OxwnyXeMERb&_BTa7uB#~>&3`cos(4R zj@X$Mt~c1?cPyuK`FadZ#CAP{r4|-j5q}d;&IkuOx);}*Uf7x^6-@9-Bo)Z^C8U9e zu~%Y^{m0aF-BWFvxA+*K(x-u_X?HzMT@ERgRg8gmVek^S$MR!J?V;h3DUAtR}c_Z-(1|rh= z3VJ_Ctvnw~WFJ>I*IEZwezC3y)7-v44rXWpl30xr5!m+yCJ;D9XkTUl6ZDJ3SROeH zEU-4Vh@MLr2D}g8h>hEv^=DV9^9*cEw{G-K4d-x1mWo?Gh@O7T{h6y^E9q_yB zgGqgg29M2CkI7|i*+fmE2lfWsESXe+n4`QB1lblu!|yba*4FwCMbLF~ebLoWFAudXr85d6~ zpL2s>PgtTp@PtWjfP1hs`>Wye9>3p!8j}L7uH-D59>;k<)&vYS8JRtharzWmx&=|u z!0Y6r*|si*Vq9~Ax3g6j-OXjY4=5wqY+KUU!f&hK7jXw61N{Ytxo700JUqTf6}XRL zyr_uaz!$>UryW@(0Bo|bEBo7@Az5mAxCivwS^1JC$Ho*W`~6)p5DDNG&;6Xm$rMHb zYy*CLEh>a@^5X^I*+h!Sh@U7jJaql`O-x3l;}|gr3*If6Ike>@m)gk5mDLCCrP>hy zax5fwyZ7B3kB1ttVjH;Vx{#|S!FRjqDXm@qVuYN8=j&(Q$<~QOX`VEnBi?aWW6}8w z+OI%{+mW|xY2|W`o>GUzH*7ANI^OHljhSCsC3rqSFL1G% zme1-ykP}af9qg_~Z0BBxf?RL#RI~qM`Pj=K=FX~n&t6*Gh6`L~wZf_PQ8XipOId{aTh$&yxKDYH$agnQOtw zV^2I-=DU64t~$BXeDX2Ax2bl{3VC)K7tRd@v4jBPcgprR;`}Qsz5v$K<$c&PLkWDQ z;9aIDS;>%~Uxb)LLrZp8qlRCv$4L1c&jGVy16fP}h5Zx{b!z=7E+HI7Ple?p;)A%F zMw$@=r>OJ|z)aF9G>AwrAY0Z11F}Hf53oBbO$NnZzPipWJ#3nQW=iTeu0VB2D*650 z{ybQ_9%)QM4%~4DE7nk!lN6ot^ya_+f|OB6bZ=c7g>$g@M)DjYU0{VQc|RPPX#y?S zq7npj66D0eYBcFIV5FH|+IVG8|M&Y$1kByBo7c|6VealSUPxU5hujqR336~{I3rX; z^Ie1t3K&-*u1q}~HcSu~ey*(0A|+%**;Ws_j2>jo0g46#qQ9+`vI2N#4l67M2FbYM zatXi~uTkOP2>~dQXfyVCn5?LOCu-fAte@e+W-Bt%b_KnyOD4@cb<SW54ms4meR^MjbdjUsu#zX*d9MuMRZ0fS(`j5h?a4r)pVw%Yc=1!nxGbB6M7&)g z#Vl&=SNb;SO@#?+M$>|iDi+Rev2Io@IsyfULEXU1s{WXqbYg%QuvK^{@M1)fUgC)A%m} zg#^Q@7>wk{jrqhw55^2r_T_yau4f6pKN*tB9IkyO!c4#i2ftzb&>Z&Kmb&kHuK{Hx zAHC}Xf$ry+f{8H&>^OJ;>DYoq>w&4hDKOQy0jB!(nE}mYkwl;?7bFj!6qTL*wE{9U za5E(=JW`h^l~k(7^80HpL9tbf6sgw@mo*L39?dKKcazz5*^8BV&9|5q(B3u&R*b=K z^Yrg&B}qEuv?BQgd(uYsJ#5ia%ZwJ-ecUZc#13RGHv!J_7B4=e25(Qg9i8bKxP{d{ zg>5~+f?tr+8@A*6N8S;VaB-@+Yv-oot8ylOEr`|9(Ho!_R-%TuiOvU zzN3iu#8T-CKVN;#5r;qY`ej_Gyt7eeI*!V%M+h-~VtKgZ-`E*$TKFy)E{P$z$va5T z>CsjhKpB^P&Ktge;jY~^i@PgJAw&YA4PvIRTJK-wvQBSp`Le3)7iO5|v4(F!VnAl6 zrI5ENY`;lYqSxd*U8*bLeKaStHIR_WFvX}`I8IH;5Mzo=SUo`sauB`?<*R90DJb^) zJnEja()pwkstFSF35d4VstfTLbDys^sQQU9tPtm~#LrkOp-h9SLJ0!2!*ob9@h61d z<#@OZ8}n@xyTipzdRJT>E=kBFFy*IqAj31ZYzgv`P8I8?(?4WVekK5 z<9s-0oV5mHEQV_d&+m8N_ngf_CZa%);@{lr%&3I^=)4ss|X+b84j*&57-ND zglWG}ToM)5!@Ig=g+3oX3nfO`w?uMqJ1&RFIK7D=lNz2JVbSpj0 zA(|_`fgSq7dxzsFWz=4$eMX)NAk=(GMCIb({WN7?%F~cz$^NF+z1g1deKHjkZ(%NW zKO|X!alp=XyRXYHIw!182~VV4_Xu&7_xHBZ8|xGMhQThPJf=6r`5o_WXuiT2D4jh6 zmg5!OgZUNwp~LkoBT5zRhqc`LrRDGDs^i1fc>6zSt}$5_CdWuJMQV}gotH`k=zXe9 zyXGyjl-EZxK*@SJxp;hdSgGFN?>#{)?Sn^CzgcY`%a_e(g&x_Me$m%o#X)$Zb0pL0 z?v@4V$D>?P*R4BKx4nA4e0}`U%6N{P(w98E+=@0aen=ogV{m`Mcd3*8nI|fAOO-fV z#V1jr^UB}bonyMb8;?ST>EWW5u`|pLzE%x1XKw^@@(~yo*b-HD`K&iiT)? zO|7CWdLZfg6BW%6uTgj!`yCTqG#o&)fl(Pfu0)hm&%0^f<&(I70oI!cv1Ae+Q{ysf z%UdoV@1v%A8LuJEm%a{8La8u!Kql#tE9O_^1rQPgP24lh#FXtkMWwy6<}n{hB!@TF z^2_kOsM~F9?A8{BPe!6d_ImksVZsVN@89@}G)C5f#*7p_7^rwCf9nf`3m%d7Ae_R9 z15q%c?ED*FW~4wWSd3uFI*Fs8i~X%H&Io`!TwBRmt$*h={PpI@WKgbWk-0YY>;KI+ zf#Uz24v4Amf=gB$f9EwI-<;TMe7`2E*C{W!N&e=W5Tn^cPCos!Es*c;|1H}hk}gsC z;w;T~8N3@hZ=t%tQK_PTalX}1KycC7C6QKP)p!MM3U9{8yar?ovmz27Y$qL}U*Puz z=bVub*9W6D(9FE`oA5n;a&t>Ry!ejVYy8MFj44yj_4%yoRJ@MUE#V;Fg^PLl-%RL0 zr?FyD#MfWh-n+j^Y3=c$p9$}++zj=KAljk_tw|H=8+7;731?CbB?6cf+rLZW%X~(@ zm$Kqr%JVfmO;qVkNi)5_5gv-a89`*j0)u3DbW6)7mhvrX`sKk&lAAA|731<`T>rag z1n?h~zqTQ{?8eseOy1Lxs44A<&o}<=UPnPcXPh~U+lM$UA^X)VH|p`{N2r&2VYeZW z2+_TmX%rtT^~h5*gyL)^M&q5A*2C*)sCYDEHRj|S)4q*jY8fUTUD1rZ#eQNkcq0`q zf*a$N(^fVo*B6*T#hR?&j++1-K~roqYWY~N`O#rS*D7}w|fq;6Q1$)e*xj(+%j!Zt>$+)C}fK~1?V zLl!BlOFF?wy|>`T*w=2nB8i~uCKTUZT0ZL2e6%3oxzALfUp;`w?t5`=!d7hZNIFO# z;vty?{EC!v(1E?w^^%AmC|W!#eDHzMQY7!WG+?Cu+$Pq*4x{t^u# zX3@d(z0-k;MyBK^T1X+}9;py}Uz$1Y^6VQ!phNl!JOB=+?L^i4<2jL9nI)dgYNsWE zjv#vY^A(A7)ie=q5Q)s17oU?sDNnNv&Qm{#wDpaDes`h*4+VvQTb~Kg z#Tr%7z=)%~UOa+=f8g}t#-|&fnZIJ?>wbOrMZ3t3J5xUTE;oz}{ItXScn$gm9%x{p zMK26KXflG13mpqyUgz3s2X(=p@&5jXd58N$C0VE)+_|7#olKeH_y&BVyLN`t{53mdeGC!Tp_E5v>V zGhnTeu%#avlI0F_GN)iZkgJr)`MCb&NmYpHD2U%kf&hCKxDzAgD08KP2_A!m)EC1# zfnQ%VL#~sst$)_-YN5i#EojUW?p1&iqD)t|YN7}sxYBUs^n zDX89(lzg8k^0+ba)rLefU#Fk+(*4GKgMPIm1AT40sf`30jmA?f_g_b}JU>Eq9sbB< zh8#b>8=1dzKQ>YP@H@>+YZZP-49OmG$!enmUdLlJyXlYzA{MK$fv(s5{d#xQj*d4E zrwBtDHg`>bMa*3g0t?<=!2e}cx~xGSp%T_;)w2bml&7iquW6g>x5sG{4_@e0sqS2TRWQT2yG95l?nMriYAp+OQz zfyt5>IKPjxAJn~*wJio0hJ`Ym48`NueL_PFQ(mTQg&B6fbrhnd5r{6bYxF~XBH=t| z5Ayo;Ln-%SdlERBO>9f+LF<&z?gA8=OAXzB2j#;z7QrM6+3QnvUhSHNRo^VaDEK+^ zo_-5Z;Lk~3vglPy|i&GDdrhVJ< zuVB#-JrR0 z+Naiq@+C;w-y2pcDq6byv*r8bCi%nigoGp46K$639iEGcmZdn4&FlSsf7FjwbO#!xP8{4wR%g&1ttrBP7CP~@e=A0K>H#giLy$2jiAdMiz?C2IZi^ZUI}xkNnF zqP8%(STBBi{9K%tJoMwlq_l5v9P+5dmoT|X*ZfHJhs@GYaX z+{M`q4WCd{xX9COGMSRh@1$$HAmVa3YH5iz$C&$CAQK@`J@JfjntBGoIMcrOs?Y;t zvUH7tWHMELax2VkL2>E4b11DPZr|jmB4ef7mf`9^b_k>LShW+Q%lb&aPB4RERj8(J zt=pDL)jE9A@RmS~Nw8g`zZ;zy=QQ7ef1B?`y76;ou%S{LZ1dU%pTIjinc^CsRI)3c zkKbk%5CuhW_YiSUyzLl2(q*fQ^OH6nxa)Us^+(1PER1`qYmwM{yM}k-ZBAhq4Fyg= zC4lR$*b6=cxP#6h3{Mcj*gx?t@LDclA?GpcPwx{xhQc_#mQnA0MxbXCGJmZ1z}E;20@$#cdEZAdP*C6EC$K}| zJXL565n0fhd0{G6>$&Ps@mR;tdtlue#Q-HsM%8Xt@T7hpI5_cC zGz7Z|85BSD=fEkW_4vs8B|xa6>wIQdRm^hQbj2kG7U$si#UX{{!#9U6! znj;RL3}5b-r4Gw;Kw4E~yrpxE<*eurXLdphR*Yf7kHX&Lx+8~o2h-ib-Z_huaD~2@ z;2yJikxmJ%dQ_x)uTwW`<&QViK2}U0Uen9b&@wsF(+HH(2m6tDcNHuja?Awciaa0| zNLV0d)s2D8N;Nw@T4qTu6Mjr;uiUdEWFhT%E*uh8X9ff^G$}suql%_7I`=Ri-w{mn zp`6Q1>Q)%`$|6sU9QFm6c92$GaT})5_o`y8o3OZ^k8#(41q*HA-P}hR0VT8{td%8S zD|N!Z{%MWty3>`Zk5i<}zxSOh*KwhXUWsy$#OS4Q{`F`gn~~er;H9+!CO^*m>EFyt zTprfWaaM(X%V5{mPOV13>E&Zm^D4_YbU@hO~ZIW+s@qhqch|}Va zf|*P+vsi~YH-{Cz?eAxYCeS7;+U_Pu&3i=U_2hR`O*gJK<7||q*b)V0<=fcdRQK6n z5bfEVNc_NKBLhYgyfNr4#9v;UtECokgKyi~GAK`DpFNKtaxO&dDsWlrHwYaeopTs) z;~53N_jPd3@M5{U*q1Z6ZR6;B>@D)-1)n5=yKyA=E>d?@986W3%PRdXt z9-MW89U)O%ko!#U7w}<`^E5W+wmr3aZ&zfw!%0aCe%cf78SZ{=(qgVizfRT-W$Zjl z$>%%tb#R&cV-vm91h_;Ii~5N8V>6`JZmrxGB`lo&QMOnXil`Ivf(UeYir33oUzf1w2^q7nNY1&p z-$C-&D}MCh1yL=Ac^D}Ntu47n>huLG^5;K!8L>ZrC5Ol_ck(r}8%%(5TQzj-Zsg9whU@4`5v+_|1bb6V zd8gxT;{~te2L!}XcjfRw2flpF35F8zDZm&vMmd;wM<52V9 z6gDb7U@Qy1MqkBE)2P)sqF+kh&YVA@PHr;*H!NORh9ui6mjejhy(8i;Ltm;Neo@?)qc^;yG<x^*UmmOKen{j8sJcKd2!bKFxhR&3k+&wO(CY^mSi)sPtJS`jGY}B%bl#DTeQ3- zg^7t~I7wdld|c+GK@yv4st|olT+dU92C%Af?KoNktINm-IAHUkGa-YRtoSpcp(;!i z^>FHKdIx^dgj^bkp1_)BNAO(5oMUM5&9MPztCg=lqb5Z+tEUp@Cs+Y2PRUZ>CkV) zIAXnv-<3h#E7iRT@q7(1e?Ci-1cl4S*stql*Y>!sOJV$hC*VK&=zrnAr+9$>>?f)?KE5~UK8fV>l7bh9oW$p4hDm((LQ9|D z6%EeA;ApS=b%>)RHhjS{!oZjt1|DgJ&~&ku*S%P(!lB-OR!9HHodAp86ro_^q{f5! ziXq)}B9eK4$NtVR%xL&EVxbu#0lLsPLI8hoqv`mw*UL($oml{X8aQigMupfH@L8bf zXEnp%9xcJ>2VNIBAeXO0Cs_NN=tKwMQ)|vYRgn8QS6zW4Ax-Ut*~`W?udLyB9Qnu3 zc`?7my)@;oKkSaVmqo8=#>iRb2g*@hfzvYt+cm1KgX)Z?DD3o+lYpt`(--1h?!vW5 zTkEn zQ+-55Wyjopg=e4JlSlFA#b3wi%fF5jDY~;^AMg>^tv^P4RSMrMy5+Qrs7m!X2Rox^ zV@ek&72CEnwG9^JUV-<=y_MrThOXk&dN5`?tcZl+oqxgbEgt`YXjvMvIn7<&kPqTC zLa3B;ul1t~J+P~Bj)4eN0X>?(;8!ZZFE+HAcG=0f+LRCyfM1D@kOCbIwyy>LRVn?3 zQKp#Jr3;T3U*B05TrW}Y9V*a^EhWHWc~bgn@q1#HnfLa$Yq1ZmJrE@fpwFU1)bn;t zCz0jrVP`?z!h*bSYB;-|1 z9-hm5Vkgv4GYw}5!#YqB&d@i05(mYmoO+M5XQRNRM7a~mBJ5xaSs@O*S~UKg9>i8j zp@B;p-OiPu6J+6BS|k?yn5|OP6)3Wpz|`Z0lST6EWRuo+_@eaR{KW+Dw|5Qk!$UTA zn|}pYeZPZeH``wCIIR+>eLAMS@P@fPHKN(jun!3&4QozXlP;4(eIev!0j0*Ax${u7 z6#I1Ke)n}z%n_`a)31c$d(d!^~-2J#N)@`uW|vR6N1_$z^Mj~9MhyHd5u z572*=`?S-a=cW2_FXLbP;{@Hgr{AG_sL&vfJ2I?>&tdmrgT<-4s~qRvi0{T#bDbw_ z6NRz~X(iF!6UE_PSq2dux&lSiNS7BZ0eS6jUHomcTmRbu;rNdOlBBW~ zwo^`SokjTQxvZ>eNERw>W7h-79*-)#8bDBbB~H&yUQVHQmfk+`8*%TH&)M_lfP2hS z#nrxAuE5q^)`JC^_=Iy8o?VuY#C~jeX)m#m} zlnhcUrQpx~JL2e?>uZL$87PTDBkDuMD*aWVWNQA3X&-ulv%k%GkQF9*AZ=?HchbL@v|ovsQqM zvPd(+0;7i1hqfbP2s(3MqL%juRe9?LjeJfPVp#Ru1+`z+jZu;tKBW^YNshms1$T|K zk{pCUp5)*jBkMcM8#B9`V6WM&uncyFVLlzES0hxZeU39S=uC#j)5Wg{fG1xl4Rgrm zRNeTtlPHy-Yd&4~80gI?v!nS4sV;xe-j;OTv-SE{*h^wVt}2q zKzbN;YNd;z-sZ9hB?@nEX3sd(j09gHdaCn{aqAb=NN2L0cAp^x(uy*8&}(JH`2;D; zLhF&O!?N(0+2KYX*d+qJr^fxX5ru()xw({p04^mFLJ}tWUA4?6s-N<t{5lsZJi;B~Bxc9>8b+p8nQnVwjTKK81*w zb*Sdy@3BE*35`#T;S$Viw?=R8Ti@s0zc9d>u{(#a>4lxGyCk}~#)ovT4*GPhOm zy})SiF{n`#FLb@nR*a7ed|mp50AH4d$+6`QXVj^Zn0A`uK&X2=%FxbC1DGujGo^-k z?9?Vrmi&%GI_m0JZA0eohkR2&50wH9mCcM8jUn2Oh$_0Z;T^-##IWHx69i&#rH*5& zg3j9+IEt`d#z~XssqkXvez+`as zB=c)<7_fZ`9_}04ppy-MT#5ut?s#gnbm$O9WxY@LvG`BHXKn!S`V9|{?9g?0jBjAU z_ZbedBlkH?iFAB`Fcm;Rg`MOSa!WDlJQE^}ZHH$^x1FZ`b?*_g*K;8AO6=RJk+pp<-?t@gzD@u48we8a2rKQob*1Qxs-v?L4h8caWNijDJFHuh-X=iJ%mYP)iYXdx(o~ zPdEjvJ@XA*Og@Iljyr203Yb67QH(#5DHA?E2#2O^o#~36JTMWsJKgy! z?l#A41~m!as(epS+cw>avRB*}Fq!9yiPWaz3=5M6^9}1h0I?%G?R}K87T;%3>#Bxi zoPhh~$omBUVeksB(9( z7X$K)WO>9MoNq{&62hsAjog1@aFOnMb=~{)quhEhG=4NPumTg-62ij|050Zv!rWkT zEcrp$>tRV#p59XH$g0-W`D^}CkQtR@D0UeF{ix?OImdzo#Az_m1Gw$+(W#)9>Wu$m z$MDsT+VH4EHH}yd+gw(-SHw&2<-b7VPe_^+DU+m%Lg?&`MV4; zldk4K?xNeydnRPoN!UJC7ScV?cOiY3iO>zv{#>l7W_ZF>3o+au8YV2Vts9mMzmn`~ zf&l~AEG)Q&QH!R2e}li~_a>EqL4oP|7b@bgOfed3xMj&a-}cx0@xgnFhL9cvUdBA% z)mgy|*azot7$hx9ROM@l+BUsuqG|3g9oM3GDr#cMPsIITC0bn~-=PCO$?dqp83@u7 z0i5=~HN76;z&$;`USt&}E;JW;j}r4^^YQ1sspcgBjVmuGGj6`ea_5&NsU|*>9a&BC z9v0Ek-&J@I71m~-+7E>v9`^HV=0}^83ekHD-Ig!y4a=?YH%3YVA3~!Js{ITcN&%)7 zzD6xCYUp)Ex=&dvxgzEg49Fgf_y8cR0Z04LntAuLqvt0Hgubviw9Es?vdG2xb)CHp z<>6H8kmP9NP?CMU^*-ySXpT=CvYFYcu0gD|UcX-2{hh`~yb&L@vX8JcS^EWDZJ|Ry z{{AIFcC(kM(1x_`60(R+)*R3%9r6Uy?-u`YMzu3ee)r z04v@HU)EJ{?%LS8(2@`Mfox%wk12lB4h_K%9z%?h0+4b&r^k!&z!n`3twGL_)F8ph zsmDV#?YsDL2hdfmlXW{ubS^M|rdOAoHQ2d*Fl_V*(&-b;;>^!U23Q3F7kf{A+EnPj zHrH&0*@nKYjNckAqTV?dB6mLCsGJVn>OtfYlgl${`Q%3v{jC?0;Isj0uq(|(S=s{w5i>v`nXpVkBV+kPFc$LqT zi4gsQA#!yttJe#caKJ#Mu=(fRnI(!874`E;o8=!F3PkXG?oLHbVZS*cV`sXWcVqHM zW{N*5R_cWP1IDB1uCK#}HZQT`Vw1~CORd0J(;2#tNu4(szCznh14@_zzC*-r7c(~) zQ9k>Tj*1*DC}!$Xkba*GC0;%Y2|7RRSKJ+F)lYuP{3>@u+UTO1@W5XOYuq_s)i|63 zAuY`8tO^B${I~jf}Sg&bob!W?a>}wzTsX&gL5q0Os zLLT{pr($t<{l`h%)=@<+pB&ApUvB%ihMfeHD-H!oh-f?}V{fQD<>Z<3 zKbQyq(!n20y4G^I|77-?Zl|Ebtkhod+%(jk^7jJA84ZWZTEe>3$YvSDI(>%?-1xgH zfqcxrPIRBiBd7_ww81M(q6B{wpF&+rESV9B!9so1)YcZNO4h3rjJr#HX$*!EJxGna;7eP$`yWtKYB!MAH4(pfljIPVqQD%Zgb(iQ!plqY~# z_UiXrW)Y<GZHv-&S>CzYO|OW} zU9L@&z4UJ;l|pM&G_I9<=~BYdErPP`eG-clBA3@C8Nx9Lm@bl(wr zpYMUeZl*eqE=Wm|5|^@mB=gD9!lcG*O%=rbI6N^XH>MlXkc~fSB^nu7LR?6tti z<0r|4FSv)G7zW)FG!!O?AjEb4`PD|70oz`hhN`$@<34-#j|Zp~KakjuS+>Ot1`wIaP=XC_ zJ4uXliS^u$dG!jaDAuwM%s(tIjY=6L%F$>7m2*TjxnMc_HBSC{xLddi(*TqEya8J5 z6=Sr+4e{21TOy4Qghnsz4?niPN zek#d~ALi^w5R-M@ygi;!Fn2F5>p<$IXCbhr!+Rx?ZFA#6#!jy{%oscjkU06!7EWoi zzI@b1ZjVPT`0}_ok$Ya7=`i|%m5c-`%RDJ9=Nmr~$iGl#N*qe{l@B`fTc6~fPA!*v z%f<8gkLdab$A=|T;V)0t@;;xaQoGl~?NKD9fLapp=sJGzulm=)FS*q7J?W?70w~Ww z=(2^w<8f1dkjNLak{FH^6l>U@JCiM9z$l> zfBafqT#O5K^62{Hf=yfCtVWDqv3`Nv6rj=?m<);K^Gb(lENzZHaTnXEulxIAZr zxb9gCYB$HyG*9-3eZJxz_=%Nb{%p%9XQ;q%D&a35iE8iP`>o>-oBLf$_+tf5V{rv6MuNt3CEA|>X^il;Z}U83 z!qWzd9y$#Mok6iYX&#GY&FL**GQ9%L$E>e5!vy5$$ZiqG<#eVrHq3QJ?-W#f9c?O9 z!xYAA5Tt?XgdmpsWP(r*4js?c1T%MX&l7JI9d{mL`t7E-*QneZm1Gb<0?y{M;|&$` zD($j?MjhxAy|kqu5xne&*`iz+)&`(RuZm4u_DNv#E?0%>g7;A+- z^l2R$R^ZcZw8MC;brBk;YJ1H-e>F-w_3g%tHOw;K(NfHcoonT&kYQvFZOhtUmHxwY z1%{*3&kP}Pvy-NW?$Z~Wv|i6PM$T=O1gHhyAe{NN@7iu&(q#3cXbGFGN({7afNre2 zXWT4)Td(>0TRFV95-7v=xA^#C2+*BFweR)6fNP#2_;PKCAXQ~a9-2sQG#@0Ftu(eG z-AX1wL-fB6UPpG2#f2uKYdN{l1g zcs7(TZzAYPX9rpS5h;(H*^nrE8smAY_O`k!uch#m1Bb3b?(r09fiT}9B}|QawyP`3 zpFhYOf9wVbhiLTjbd7zi#Pk}n$D#1D%5QQvw=Q4K1KDvmO)}WyOX5fq@fH9}a}SFc z`-a2#Fz}u}(c-<227r{cORsv;k)M+U6sLUc*mt_0jJ` z_|3vhMsdE?p46|Kyc0Abr2T#D)1}zF{w&m%q7m(|G#k5jZlLueL!!*Jb_lrPJNGrm zaQMX=anTN|uTH`#E5~wyQ-*bntQI*&Ey)f(3Hm>#pmv$|a*j{i9DXcNp?X9?sYe={ zl1oF^9<(>UbmqSKV+Fw&-(GutcOq+jJ?_?msmgf*m?PZd89er|*DrpGT7qHV`qIG2 zsfQ;#ee?{+H><+8AxtZBvJwB2d5}Pb`ZED76J+a!?B_1|E&g(DoFHBF^K}ia>?qlG zQW_%Fz^gZDpUB&PPvCgXpr%5%<)|l5YQ+&N-E+>E_L6hbsZTf#*wE>3=ESH)uqese z+q$kId!4sQ7V_ALrpe7uehGi= z+Zruve`GZZy#klFm4xn)c7rS5sJ@0ib)tHKfu0fn+kahy0--XU9Vg&{v{F>GSN5*U zQ5l(a5AUv;%=k*$Fta`Y-Nu! zBr=-hI3TvhOPJZFP)0aTz~n2DDHn{wW{%s{D&ONuMOn0*%?K)gk4Yiy8N*gRPcUmV zxU(4#dyy*;-*+QU>`C=tm}yWAbV=_^m)i`hV4dM(zN7FdCvrKKZCv9Ji%TYwUZ`(2 z4MAk335#1kg9(1&OIQtyM9v*lF%I}!Vb&8X`VVhj z&tgMe?d=_#I@HuuSzU*5H*c_I*HYKgB@^~6C+dx5j-M3;)*3`28!Bh7;!-K;UuBMx zaXg55nz%~vFjYbR&Co(0(9eOZ6?Qfl6-1u?vK zttTSK;xxBW^&9$^XLP>c|KKppzI~+w8~UT$6tAiGhy}*|O2uicVA=#g0%>^}j>kZl z*!}-VI;xd6OoU@uUg5p5$in+X`xA^xfIJp#$@>GMe)(%LY76ldr?&`D^!t4z)=Q0E%@~HI) zZ!YaF_T))*$N9uw0gp>mgFZr^@65|l!c21}_v?9cG?y@6n1T}eOmuVRLdW~Su9$m- zl`*c&i9!!#nLimd_;gjE*bL=o0Qe8CaoxCGvI$sBpFd92*GJ&sr$#IvSEGVp<&A#CWl+gu{!3uIWq8!C;fhJUVJL(7#BJF1g#JIQS$-#wM}GeV#8J~GH_ zBIC070Yg8zFyNwwS0-uZ=$h^9a`y1oKI>DW| zi|;9l_$pM7ku$%>(bpVoZ!$B1TJQ~6KjUM?jG)vn7Q-d%ejoaB>X9P+ZE-NFcVxc>#%$ zeygdSY(tRfNl?&EC@F`8+N-%O-~5^Ax6Mf8pw1qVb|iDrb_#TWUZl=wD63Z-%aeh3 zWnO@?WzP+CU4O^aEzG??x79qZKikCSAZ~Hm1I9Qcrf2Usz2mn>^P_YS?Dgw4ueeH&`uulUsaWtSE~u?Y-Lcz(^_Gd!l++Wo77* zdpKp)6=qb2Ub@0cmHp|uzeUf7{*$G&qPN|+KaqU%Ea)b08$TP&x#@kf5Z?tuUS6T| zZ_rO@n1i5l&0eB&i!mL$&t{z>0VQ`S-&%PXwrjG z+GLfVsU@3J*zkb6?QRi6dC_bXE7-%|NH&!z?H0c*B+j|R_q zNNdT(QtBzsZ|D*_B&5l|y1gq!_W~Ly%x@~~r&U<>D(Ek6F6AgCN&wxcjuXpnG?gH$ zO`C=8x;6EjDR;Nh=3?mi^pmLAFwdh+%53yZ=qe0AKD8p%_C&T}8n{~45Rr6K0`Eo2 zKr{Ag$NuptGN=`NhV|Bf#&SXNN-m_Zm2BuW1-WjFnFlFH0LDxd=W%$rG@c%; zg%yM~3oj$Jm8V|rDbEsp`s?m93fecC#&M1v#ns3=k}Ur2A)Xb)o-kF0*HEsNs~&W5 zzNZS~{?#1DHFlE%p!?#^5twYm|CyuFr8kLJwS=>4T-p14r~NgCeT7&f@i!0KQy7xJ z(LJ@OMV+G~B<}(PdWqs|!&@c4z90qO{ZI;xaA`8%HvYBLFCN^%1No|Sn<-j_SUz+# znwbDtY?AFrA~0t4+u^{`k^Ar-9VW&&9$ldHjT08ZowwO;EvY`iEuk9df`)l0X?;F{ z4&X8zI^NJy9ju^FFdN+>c+BsR&r)MQEy8?{{b9uZ^InBp-vUn(F`wdXyldoNN;cJx zZhnJX*>wWMbEt^pR^5?0@C~_86u0TaRQ8@VfI2OK!DcOaB(r)|(&xmBoml~9I{KLy zlbb#y=GslxaKlObeBB1>a$XjUt%>Tv4hvn>aP$2R8-At49c;AsnC|>nN7iOaf-2LO zE?S7itJy{qLK0Jnij|T5_~s@a-y^t^7=hp8$*{`?5+EKqi4+ai3SviluX>S_>3jEv zdD!l9y8f=?P!`NcvcSp~Hk$N9y;AxL(UwLk^s+!O`aa>%FiY=uJ=<$rpHCu$cHc(z z32FEHI00FB0JFZ*EJn8E6mE(At9l6c5_N5WP5SZdG16r%AN|$Y(0Va3yGw|DGkz|x z5UNWv&{TY^Nci9_&J-iv6TSRRYKur^4H`Nc{ObHE3CG2SZ-B0NxbR`N_24Cx*ZERg z2eUyB6>Uwouqvermq4f3Jb2?%zSHsL;U?AN@Gmz}w($qw>Yflv?^HfwDZhn(3qwuq z7RxRAFk)IAowvlq0qEs;%&by)*sW<2Ew`>~(aPModW%Pbo*pku`~hiX`csDnk0rXf zy0xjvx>~_nx4yF*uh!Rl`#jQ|4pS#b-Fo@|q^8baE-JR@wusoS)Y?mL$=?qDlXJMt)7pLQ`=*@!i)ijg zBEb!caXuIpE4)_~huU32zB%bov%diPu5v3wk*(V7yshH1!qdFzko^cv%}b?k0Hey+ zypsAaECk|GE}4r%FC0?;ocYp?9e>xlJhYwgI@g_Ur#1>Wt<;&85j~y>47xTk%u_Sy z&wV8^pNF@=&3EZ;^&!dlgkv|WP0!8wPqZj)aRAt#3l8*(L$`i2kF ze+ZwnpIRXGYMLiU)`AAYJ!`ya($OkssqY+nYjH`5&tSU!>(F;5!H1Ta)nu7@A?}wC zuY?q|Zt?fXy|dX#npinI@X=UZWL)r>fFOR zH=>zy!FEx78jG$IP9| zPNdEdz(g)UV}7#u`F9nwJH?aevwPD>{+a6K_P5{=;;q2RTDQur;9$2gv*s#d>MV-3 z26>Cv(C1fW)sX}*FaPPhs{L1rXzI~8G7RGWYb{I}J0cYETsuIH$A+-ge9n6f88a5{ zZIn^=RLY;MrbhJNFjerx_J4w*^`}OH`p;~+cl=gZSw&mTVVcldW?@xc#NC`}gN7T0 zMZV9s9d48~_+DNhLsy)!(c?MEy)(R>sG>1LqUahn-Jm=oIk%Z zOn+1nni5f3JeylMP`dBX?ywmvoSn+u18cT1Sxjh>GFnalrk#`OMM15e;u z)?v#0*Kcz-3soI&jNAJ9*BA9mms-(>i5!lYF4U|Qjype%Be$!{KwO-zEdXu9tY>Wl zGsv4)=jCK-VSRapD!&sATgK&1_f=mTUz$1nTxR%kr*#4L89TIZJ*ro7k{+fIi1<{7 zt^L`lb>=Gloilz@P>vHZ7O zLhb68ytcdF3q*e)x=Jn*sD|%7IHrOJB06wgz4D>IaD00D1HQ%^Vt$`28<{CKvC+?OWVdk5TD&%EY7pB{~wVD!HAHz4uos$#x#`zcFL%)a?> zp+SgHozru*nfcD>jcaS5D^N*YLzl=9#oQ-+G{H*Mx%l;ZK6M@YCLc@|$3MHNsgE8< zi)U1o*$|qY7*)@$QO%M~F~0F}XJ-La3l=`5`P>J;H*5An8BUAP`<{@&pI6S>F{yI7 z$OQ|sWvw%+Br_pdW~8ybB3H_i@21&Qv1w^6soKL$g)rogoB*1n6aAIqQ&0Z{j`YO1 zdSNjx25%2QN1sF!ge?hl7A~IUG-^LxS`Gxp>%8J)&%UHNl+VnMo=H>OovCSk5POj( z2(zMHfS|_uis5dj;w7W=>o4fY;JeiF4T&rl01w$zM!Y|sl{HiJg;RZDOX(rt54gPo z0SwqiFT9)u|7%nUn_^BAd0RHlJwNLgV8v{rpP7igg_(c~x=0z?Fwo&m=wE?Ofi3QU zlDv_j$3pBVj@8x_#)p;hZWyakq<3z6DlO=`&cqR7PX9xxtYya}r?BU$Wx`@YzuN_D z7IGOC3b$mDE{@IQCq}+-tFLA{J?Y*`yg-@$kN_$15i~6g>tP7`u+AGsgTH#v?+8wQ zEE~lUv@q&x-Evf`(D`t0WjrzSR3LutkiB?ym}v1AyUL(jK@sIN`LGSk#ZKuSqZ^Be znWa91KWW!-#kiebHZ3zV1WHVLytlQk>~;GChtpqz$lVfJTZ@k^?}+)K_}5@)RX2R8 zwUE+|_FE(7`IVysTq$+g&vL%_d+EOVIrUCO?~oC2Hs`SFWEuq}Z-6$YOx0iRL%5pz z+_X@fh{Zxky;Ghd^}@(AGF#QBiD(X7uJchHj880B=Y@{G7-=VHH}EM>f30slo~kmW zn>N)LSjuz4CxjS`jNzEbVEkWy(}_+WdN=RIZ-W2!|NPINb^?E8@t@!F&l~%1w$^`c zn7_5hTv@p0AsT-?KODF3W(ja2qtj8W88Z-IF!f&@sQWZ9`|<^%`8U zaqEIBXZ;0UV(AwC$oW8Q$jejAM&1XSm7A!jXjvHb@R#fYVKl3XLevMLcLXQ&5<3~F zoh&}r^@J^s6tJqeOLz?9Ut>1`E2(a!b#~ZF)6_L~Ml%WqLLdBeyx-bgjOI-vT$QX5 zJ6a#r2nT5k_>E8V&_K~}-Sq@M?7=l!fBA~ z@~i(``TzbK|9P~WkE&BBuT=m2kNcCI#<=`KfcVb87vhi~B2r(h$!hm1&C7o)DE{@a z(|u5}vJQqzFtPsoDEc4I2@gXc6U~eA`+pFC53uD(3mU?u<^KKWrzeSg@Bx8}nf70Y z_W%9me>Q^W|Kq|U@$WxBO|{>v{`s3?$N&4!`LDO6*dKWQwIWme|H1C*#DD~3kJ7~W z-({-*^JA>S^XvLjhD84R&o9SDW6bSw1yAJP-!hLH;Q3G2bp8jsS>ngP*AWVky-%%Z8r8lg?BMz)J{wP@4t&E*gnet9G`!V&%eKA{y%hl{y8%Lb-+dW zC*%B+asICv=bt0<&yo2rbHP6#?f(wa{y8%L|2i^_mnezGtLC^}(wr#pkG!;sRH4Mv Gfd2z#vuSMr literal 0 HcmV?d00001 diff --git a/images/strong_scaling.png b/images/strong_scaling.png new file mode 100644 index 0000000000000000000000000000000000000000..d8337c347ec2783ac1837bd22dccbecf778a66c1 GIT binary patch literal 406248 zcmeEuWl)su+czbyAPQ27ASFn5Hz**Dba!`mD=Mub-AJc&hbZ0MpfoHEOT)hB-p~7= zXPy~-XP%k+{pJT9V42|khYDr#bHWU3@7#>&gfgM>u$(b&L1=_LbAm*E2g zgRXvhS_~(5rT6c{l??p9wGOoQlKp7yPX83Iqx172_RsG~Xe>ndJ`EkMkW6N1P+!~U zPRv>*rLgG0V^XpKc_tbx3p#vZu6B=hd;EfJf%CNi>(_)lq{3y=<*-ezQH2UCamkwK)D z=%qy}o&7{jUzvGHgalEtWGUeb9DRbh;Dz4}mwot9?F!> ztv_2n2L`U*7wgN43p5P8hl&bUc@v*Vj`})42=#k2Z+Eu1siN`2gz!fXAqU8~XVS0c z7sOG(mE$qhkTR2#L!tv;Ly(Z)St6lRZ@i+TAq6_|%fr&D1T!=DnkY z!9JB1uf6e_g2F;84`IH(%bcgdNXRH?5NtA$fPd+Wbk$n~*5eB<693i{{^xlVv9VEB zMgF}v`d`nxLPmy0ciTnx!N2^(Z$x;}|IN#ac(=R|NMH}LZrs0mzrYidD@OjyMGzoz z4@E59No11hUw&c{$Y)-DBx4ZrY1AoE5U+nr9yZ*(lf2r1As`Zy@{Z$73B@OF8Lcm{T0Oc>$`YUAp6|(;7 zu77pczc#^No8Yfa@YmJ)>uUXVwf>s0|Dj0#R&L-g)%r`d{$>Uc5tqN+^>26m+g<-A zvi{vz-v75mRyr=q|K^DR}AKGFL+SO#j}vhr3&!MC2=vjZcnkg z?kxzyHijvS8+Pk0qvg1y8upsKwe0Fv4V)S{bm3%>PRI2ZhT#HsIB_X>(fPrCNEe|C30?xo7UE7u0l>d zjNbQnSkiJNODb4g(ECJ|zNFUZI__FF04Xq$V9Mb;)@b-6cIn76_q~OD{T4qeC}$Ci zY0qa4zpFF63?KT^MyJrDEU`Z}hrCLWTxsq+!T;6$$hn1n|4H%y?|Sm_`Fa|+ZrMk5 zez)zZE~PNY=V$vrU*9J{d9~noxpPZx&f-hNNPcc=qT8%(&8^r=M09jV%3xzCW3#-M zN#^^z`}zhI>e{wd$6@+c6eNy6aPptOyNB7hM)=1!B;tJQjR>dufY+byQY#8dY14^T zBA4w9N^kj9`BEr-PSu1Kz05elBwXI#^EA2t|^1pV@=+tX2sT2;KeGb0=uK1 z?~Jf)a4QPf|b{!qMu7< z@p9o*X?-ANb|i<0J}?yG18&%6-Fhm$ECva;osJj$`czrRSvF0mY2Q!NsheuFps*xy zLdUtk@_a2R*S2Q9df!=`Z?Nnmt#VG5d8#LM1TimL#1lb2-~Dg5H!B8(=xyBZ(huPG zGE~TJ4S$i;vTuT6tcUntPx{L_?o3X|c+aof`nEh04%EVb$nR$J>q@ixz;$bkwtZb; zvP5_Eu$MWl-eps%d#y^9JWz(I?y0W(ylcN5<4J{BIu(lV<%!&+p1186xo{%R0&Ir| zqKdi6PSRgo89;rvW->UO&b#MU4DiwSr|`RH^e6K;8u6XZIgfrd{5}$+u9rZdZBrJ! zrSa#pK%<`iz0qE}jy_-Ek9BB4^L5s)A-iVLui@9yrxyF#UvD=v-aVnyEPa4)*L=R_ za&x}UU6_xIik?N{K3Ap*)d~TnRK>Ipo7B(!_WE}2?I#}T*jpo0Xf;Z+*xwZSx4%B; z+em+{RI1k`8$f?+4aUWY4If*w_rKJxUyaosIT{pB>$%P)OtP!Dh>+yCzY@X?YO+v zRdG==F^T0T^fJ6Vm7_{&LcY>!zFYaZBd;V$RLk{L-div1RL?Rg>&Adk@P~IIR}qFC z;g6^qcIO-1w{E(pN1X=+CySP^#?|%Z;0L7e zk8U%TxyL_n6~->82|Ea9>o0d3av}7d7n`|BRrA9`i0azRP1B24yqq$QstU2SFTHG( z+a`YfRD>+LHA^ZQV|^ENw{mQe+J3ItEE(n|4)L$yBNr%9!Z#jasOd&ywzRTkuo#$z zG+9tec`UfkwAQLHaB052guECsibcws(pc{jH;S8u{Grz<;sPJX%82_R+8+|h7Kpf6MEJvxZquf5h_cO95WcR={O-4&;{H!7yeD+_lLNHN z(CT7V6i*BtaD{^R!(V-oP&Ijo(~R@Yl2FgXumW|8j_m_AQ3AWUeEmjd*ofgfl2~y2 zM_=?G>JbV1`S6`g8Dko1u$%SK*srP;%|AKyq(9sWMG8ztTbkKRv@VenzWQ~4blN;E zg<2ssdGzIse$7FGJ80S}Cf`A~*7)egF(PzJ54?KcV@<8XZg8JlRoVYbfLS zczq-M=CW|2`t6-Jt4LX<1-&h9UVW>*-%*ka7)q+yX^5tBU-*gz2AyOcS4#ihm%u(M z^(^J{6EJxTY}`4#I!WBK+!uYhVS9_huF#2Mt80$jUnjMLNy{cAC}=Y!B088iuu{;B zTdo=wuLodS^xHjgYBihFI1bqm9bAS>JvJ_qfDwAMqz5|3>^#{w)M?V6&^zxu`@ zR++m7YlIbCM>-nCT~GUq<4yc+UFAQGHxY0A{ZgbEXll*;?|(%xwl!RHbTseuUCzhl zROvn6Ly1K8nvkJz{~K?3NLob7qU5_V^5XirddtzOYMG>xgVKEH1DQep{`>hX$cJFs z$!goxnU3?k5mAj$`wqu@FEu(Sa3m`g70<$F()HmJOodDlD-aS8su&%JdRy(wxx{QT zm+e1z`?VReg(U1mv!cV*Q9dYovX8RELog|3Jv1xuA@O#s9C@uYlF?a$DtenwX8cHur|k4 zF2UD)G{E2cL@JI+cZ530MhWsn%ciV-L|xxEX2E-}De=6I!TYt9Lok?@xs*`?Y@=Nz zEjJgHV8oafX!pi=oopuvU2a#%J`bIiinA!F=yzXMnDt8&>Ntqn-mi>^WI}PLu%NfzTxjzC$%`u7DX5+Fhw z*e{7&lI70EnxfI_y+U=76GH<6D8?c^q~>b0&E`aOX26 zqMNx)Xqv}VNQXgF-4{GJm!pL39Qhe7$;9YxE?eLYRB{~F^EcO*xc~|kYeLnmvQO#< z=~r_R2fVqJv1aa5N!?mvbqhIX?Jb;I+;2siJ)dNl5_TJp23&?jZ{G>%LCKMyM!^{X zcLW-%V_#e~A>B#rEQ{z$WC*J9-XNF6Z7O4hnz3q3X;Z!BLiN8M`a_YGBNlfuv+n)= zl2y_No<3_yDe>Gmwiip^mBv9=EoNixmtl_d<+yg-&))2)35eiBJIYR%n4=?-dTaXxa{>@N=j5U2M{dso|m6ilze|VM0EXbU51?JmEQ-Y8D+#gxqHoqA@IR7nPi%o5ZBT<&y#)p|B4BFb-jsw|>q zsB0wc<~Y~#H5Qk}3Vs+wMS?7-gW}j0fGUQWpKD3>sdi1UiLqksnoQS505=20Reu6p zC5?a>!~NRE5u6z7U7V9T6&57Se_r+J!P!ueH+8-9``>8IWZIw8emzRb4Cm}m;m#fB zf1dJMQp zPR8?hIf(mWIZ?Eo5mL6h=XcuR(oGdtnLBTwJ)15hoJuFoM@ z(j8oCyym#s%kAlk(FF6<1Oy%AbJ2TDA7z|i1N7f|Xi38e0DtNb zvr=!r>aBu80u;ezJ;i@65&nCmMo19fuQ#|OU!sH(5xu+Zs5C}RD<3<6*uQgQEZ_Eq zF}G<~c~f?jt#h8vqaO%wXZife5^GV^L(#JRQSynDzW=qW7NBVK)x|gr*`P$y1u4nL168$P$(<;H!!Cw#dM$9FjzTG{!jS?C zFs`oFg%!Z%bYBx%O6Ic~%X7KhsUd7G_bU;26A$Fn$uj`*jk{-WluC zl&S(RG%bd&fOX`YDIrLNfnlH=*Ywl&xc`TMP~B=YzAJM1ht{7yafK7eL3&pPaJwGDK6PZvr}C%Yjr-_Et^DvHf^<@rFwmC_@oVa~tJ#&g(hC*P99v z&1U3*S~#@;@3CW11*IWccP6r;Ys8DZnbkP`RIlTWS2cOvdkA#tyw+z~F3+h)Eac;{ zX68O+_zwIg?C~@x*--$x)%;&Y3%BgIVahe@6%ztrQW~4L^>9~5GWgtNAT)G(xg#exwpC*z1q2K&>`Y=#jR|GPa+5FD-50(5G z@8RK6!12tPxy*!hF~**PAOHNpv8m-r@^taIsT|B1^cGbc0R=&rN+O%7bo(-Rr;0xs2q2{N{eAO>U{y06ow3rQU%esVIQ{ z$uDozu*KALJd&@FCX}!x@qQMdarW_IlWbPw4imdwbKy(Pwfas{ zf1kDb)o^m>FI05Z>%MwH9lZc2P@n*)?pF_x!blH&X|iY70%(25pqrbOlv4cu%>Iwo zEOLa#$7#=Ljr04kZlL({pLI3px!`c;*F|`)e^DR(NoX&7=_how9x2@|40mt z+xKMJ+_G*p*4oiK=I&>!0|RM}eZ*OXc%K2PfDqA?0GL!d-JMq+Ocfk`Aj9y=XlJYW z3ZXB6GAd=M1~-(Tg8=uPx~}VZ<&;qs%s=u)df$%_T;*`jnMzAqG21q$c6@aPS-0J( zGGo^=t6G@DIT(dX+utGNnuzJNNH4g;6A6u6ow~hvF;SybV>ge1Ogn5}9P6y6GfK3 zxKe4&Aj7Exr}F~v>9yG(o~##`aXjj7K%Prpn|U!m1>bAFvhgC{hqM#PA@Caqey0XEM5_Y}+{XXVo@PRYvzL>Lkrx(3U4K`H`gJXVkvn_xd2D5fl zDbEY~%~x3$3r2TUzUt3Y$v3y_-{6Z=wK}+IWK1qyx&rDPDSm25`(^g7Aj#nVl7MR1 zsaJoUy7kWM@?D)rgN4gHkWLS^$Lrl+cV?=rt?yg<`Lm{aED*x)i1{$JPaXQ5yyDJ* z1JFnxPJ80$(Oz-G9X=slA!RDrKhZj$I8bMWc}=p===&Zl~AO}sR0PF2k5Jq z-_+X^L%ya0gv+)AG+9ePc5T=yGc_KsGWi3ca|C`h2*>MCXDA{Udv2tIm?*yrZ=R;c zZydc7zREDE`A23rnYGt1*DJ%YZ#CZ_@C0@;db1pGqiSez|clE}uH+tP${Fp*|r20s%s3C|&d!P9p{1guW5! zPvk7zZ@vO!DOMtqH~}Gp<3%qK(Ys!9EZ8I9^bBZ!+Qb+zP|CUMY5vV+y9655-|z8f zofXit?n1WeuTN$%*T2@Ux-L#?+c(LknYzLrUp~3#IN-IB@oogb?PQ<>MK4uUmpxcN zK;T15AY;fOK(H1NJ{}iN6!aRQYy#%}DSr?QG50_(AVtbe>iheKE>nmRK;wAsE%7iC zh;adBzWJdIG*{zidP z@2Wmj8?*L{^$Z+J2|TgbM5_YHs3(?SzB0~9cGl%3uJ^J;vFz}~b@H;MTI18>pb+fM zdaRWplx{Fo3pB&+=xb3E4Or;e)XgEXv_j7=zlq)^QSrF9ppg>GWKHkS zZL#U5mG<8d<9|%s6>{(?7WZ9!fBz{KBD?=k@>8MHP7ZgyWpYT}{rxo5hXPj}l_xz( zS=kK?>h#a84tOYDQndvq(jj=zmq13j;?e!8c$Y_TsIM|ro6`h<(+J0oOSL$&wA~~g zavai{Tx5E6c0k*_3n)tE@rVo|idrOK9&Lk~vt3%u)Z7S#C^+Or_C!cGJ*y*6^To-I zHU#(LAj)4ygOQqi-epuSJMkr=?@yB}vYQVuyvdmZCn{LVj*9=>JXnL7y1ru)i+k>a z(a12f-$7VgR=j*lIeeNyjH28|yUfuSD6-;{QLJ19D6T+}<>EcBDk%l3$Kd_IP=f@} zoJ#kF+a{Y9_Pr<4quxT>_Z;uS!HYA<^4V;|I%3;@e1%+xlmb@Wf?b67O_jB-mX`{p zs(d>vK3rW`gNZs_cHggP?yYu)2r z`4B4C7BLEm0YNxcdL#ujY!36_=IDL&Y#<(v2IX#G>A!sp!ziB01&B%>A?rAT;Y;z5tgUDX8cJV6BTIldotU`6 z6-D|He5~huy_W(Pk%|p=^Yz&PE)xVnO6#`@+6E&c=p$d}l;AE^P{>Rhg)x?^Xy?fn z#ZjTrp==p>FHKn%e}b`>@kYBEi&@7$AlEI{ojDS+x3#{;A^XtM3d|Gz+NfT(Pe46k zl&PYIj`g@oy-_i_(-JxqUXPtuyBA^|4$-GkBU(^5ZQ^2>&||RQx-hRASEHv#qwDbd zzp@RN=e4rG0M1w9NxB2qVR6E22v>;pq%>OR*k11Ly2l!harpWZNB!jNA63>Go$7BC zJ7KCi4HNfuZKvdQP`ONJzxo-sNKV<6IW2`et4_eqgSZS>;gGXsi=`99Tpnrh>f`fw zP^8m{ZBo4SSo#stUcgu>FrW-uWKOcJ*5upFipKaQf~h)CO9sw%_;>n-nKyTEF$R5? z>Rcy|6kQI#-6q9rUSg+Cbrl~n5`c#OT$GL(6uM-X_gIdgO$|2lJ;5OtbsZEno`>8H zbsmfIJN^VHBeHgr!JQb_-Ya2$kTJN!!_F02y4%RD#1@=4M zHTG2mdQJb0NP>mqw~u36_@gL9)eHM_!M-Fe?k7~_=C-sPuP2h4^P+~sZJSj>(zD{j zCq8|mx-vls76-z4H4-6Gal-KP!O_6mPc+n!RKAzBo0LKprajkFMn8~0Cu*MCUNn9n z^=63}C_o7)=|pbFz`n`RhhIzv+lD*jIZ%#}m5%ahU57544upHJrDNiWlU;)} zW%2B+#C*$G+fT$>hk+*@N3rn?`n0QKCXDsPhk4+{YYZP*8T*bmN$LYmDTbu0jI3DZ zH^Gwn*0(2OQ<}+^N$PjZ$8RZnGScS|^ z#ark_6AW!@L4Vk|cs_0N@SJX8@IZiKH;~e$E_-kufEP`OL{#fGgtfpef2>K1vqRIp zBwPf_zgznSaqBv84PWDY@}G0;R9(@*`?k-&Q>jc&Q%W@`MDdVDtp60)pd4m>@f~kd zEvIW8mq&0UiC;nNLqD84A&LWg@Wb*EZK|_m*Kno|kd8iB9xYYW@9yE_ym;g<19+M? z+bt2!?VsSp$sUlRzkqnkNNb-rh{{wC*Z;2M5mJpBSUel=NoBeBhoa{U%pc9o%9Wg- zw7RaU{4)7WAdT9GuRgnftnWh5>-Z(Y3hq9*OT>|9l468!^l3|3K~Qyj^}Ki4UnmWv z39c3kVc&5T1U>?xED8_Tm`;@P?E$|rQ+EuLd)vRu;tp4*@XWrVH@)4lV!lpWj|%;@-9vWMTH99AX)u)s$vk zgxu-7@SLhJA7s#FR6(?GZ41q^TL`9D7FSU2lj1tjJ!aQv-h&m?bK$2CvI=Yw8A>rT zmqMkVzqmoLE%;Fo%ySEHa^#!#%}7@wKm ze2^+h>g)1LN3$C=dz^3U2StT-$Xf>t$BXdmMP-36kQV6}Af!}bzlQ_#9yh-NNQ3TF z4v1{zY`Z3Dk)J$AgGs(rMY}gH)_v6Y`0^6ynT!ILR+bQ7v@}Tzbsu|qOH6>u2HZ5? z{i$^qfDe|^nrH72|GnO+itBOGe|vPBx1wOekCVvn5awMtY{O*R@uL8uPp7)uP%6``dB;h#AKvxot^baSqwzT$8zdK70dMANVPxhmX3+j z^P&Z^hH8ve3HITGXi(UBT2$1P$v%hO39p|Qa7M;fKgO*a08O^9|1-NLZ)6o@sla4I z{yNYPm@L6kQuhUq6|dXz&v~h%5fT`_AQPoYzWrh|{v&-(B2R$1*tr|Pp*AZ?s#-d2 zK5eS(AhI^OC`dv}&fWp?1rG_36gGh#)2Gy=>54E_+MW(s5kqm59!3)_d&epbcvspT zT2-O!P|GK4#Cs60CulJqocn*r;x>7r$GXTvYRZU{IgzY1;N>`1V-|91ANxR56O3jX zgaF}qw+R5dge`jzBuUs(El~1aT#s=Y1Kx_QBv}wvz1vQrXR9NrzI^^T^ixsbJsbhz zZ;v%!(-7$?o9yHgk2-BhY*?Uis?wWqm)u@XY@|Z&KbU@O%fU^Y{aLcoA4rDTK(g@+ zEYN%{M5vLE?q>ce|&*{SOJZ()49e9T~u{TuXrH)!J_dZ z{H#-09_Ni}@q=odSW>vC2(Kw{fHx7EnpOAFLC!_k`mbln4q|lc=b{2o!R2svIs%5N zsWTuDr?ypeYVoYE^1k(8!XJz*M|;J!Hszp+CME8@8e6^Z{@STV%J=upkW3MZ2siv$ zH&;Jc~bs$3a8yWj=~%y8ib5_y8_v(UDkHBQ49eiO8XVQyl%;8%WE_Z;Ci_ zP3~aMfP@Q%oH{!i-Hjv+!Q_l^xk5F6pd|p}+BnrK$J7EqKRpqW>ZImGceyJp%r~l= z8mL}Uz>#8B%QosYc2bai?+Iul-GrGnw+rB3x$!iyRW7@8yvpMir)`+beJV9nF=hkF znvmrJ6B^BV58ywR*OStpT!T?8vnCQ9>Y7-VQD)p}R+NAd=#fBgUUUZ4hu5=ovr_ZL zUd$Lr39?lY8NLFlz5@{m0OnjU=HroTbw2U7X@ABo@p1v7Zr(38OcyD`Uv6>+dXf6I z&N;AK@TfDZP1^j+Z1a8`T9-Ce7ny~g%YzW2GrtUDmC72(>bZe*z)h*#%^Wy%hzOFF zQ3zEcvGRNvkcy4xqo`U=0m3&)smDOj_4%mc!=|RaBQ?>9JIE-URM@9NK(*@^$SF$0 z95w)5(FDuoy*UO>re(}Bs2zp!0C(c-qv7T#B#exQcMEU;H!_$fZ%G_?%Za zm?~2>zYhbtH(}7+VMSA>Elr_lMvmg={XSKtZdQ2w<|0psr< zMn<8H@bb4vPgO$loXz*>sa`{9pltzUV{sqTl5)7`2oBmKkZrORFGJ zQ8{#&L-1pz+3@>YPE{(`jUn>!V3GExxmKSYmpih7CM4MYCUJshQpZ{D;^wp^jiQ#^ zI^;t%kfw#KVLDslXNvjDS)WkDLNwv$Y4C(y5Fc%G9MPC{yc)BjJ@v5W(X1=m@dB#A zh&Z9{*p!T_CIPY$&K3S190UpCzRxAB(AxDO?Yt8!gcPf!(U~#353ZR^E6*2*<(i4r9}FmbPA5*{0F%4K-UuH zRyDn`h95G+KXJ`Dd_Qlg>A1h+3!FShnahXpZ=a73%%@SUwPy6Ki(5vV=0}JGV=aq! z*-npow4+Cgri-0PqdgVz^Uijv)MRB`uhnj8SzXHg@O9=*O9Ck?m$e8zyb_t7vODob zlQs!BNt&#?3i3kdD{7d6p+9#)pJhha!$ml0jvbFIk2i)*i$~{r()@2c`9R8T1W3jU zbJSKwO9_D<7-OV(Xj=AhM^>#-S6SoN%(jDpa)Ng+uy z&XgaB#^oSVek;B%Eq>P{!Z!y|YLB$%*=r@Acs^yUTlo}m@@UWykxG@&FPvy&nL;=? zz=*JlOrHSQKER?A$OI3_B|$I;7(X&VYJU83S=`XVMfado}t zuirsL%LtmCCGZ@j3rp&)e$}3XbD;d@p)VN23A2xQolLwyzZwNKG_%1MbCWjpcqc1{ zAU3gxxYL2S*Rin@%*ItjxCvOcI_$UwVYYDK1!}~fB~vLDYCIm-(36L#00X3u^-X9s zuR&{oT1vVm_Zg^We%RJ{A@+Oanj+2?ci`8XxH~Si02M-JqR=E5;D`L7bwqHn+f2Ku z*r=J^%;f%E+ua{SM!wu{6(g);Ixg~C7zVC!-aq{HK3iHAFKTcWlCm*TJc}~EoJiA_ zbpvg=p?98osdHK7RyL@u-FS6#lh(u^*kZH*=eZefw%4l6`#eIy2$2%cr=b#mIZOqu z+M9GLrIP6pKcUizqi7%?=hu|Rp;MWOQ4C>V^xr#&E5Jw3`+3*-&4t%bFT9S=t_S)M z`rtYUc(DsFo_hKm+9dJV+U?(R_6hmoT~*erz{|7l<$06WU2XRrzEHEOv;Jvg=o-$~ zb7GkSqHa<)nMav>O{b~Oy1_h>m~Y?PWYy}$t@Jbnh^3nM#pBdzk6dVdOZRI2vd+LH zKIoYvpR%RIY3eF;wf~&iq}CsPLu<`GiiJPo{N82eR-N!5>}VVuYePhhlQ9WmLWeS5 zkC*7q;_X&gjngrAQN8@R35s}KFba|XPIt|%clvOdoerI-f`g2J{d#+WTCo*~?^e0( z%uEmP@7eg@oaX|+**(X(7i1%sG#fZhfZdsxs{O7xzSaKG&M8QXj@qlu-Zdhey?th@NMk4d1jC={?9DvI}n4L1;Vr)#yuBCwkP~JILb-0>Ax^l%hmcWuwi@5baMeMU68%Ctx}{hj7VDM#c)Ezk%a*6 z&>!Rb$gT(U+Ab+^J?ntXQ1lm;ic$r=?1KW!Sax?t?0RDvo9vhD#_~SKFltZW1!S>g zZjR>8*1JNJ6Sc~Wf)TL?H_%%;ic3Xx5&r+2*GGJg_qibGy^jcdGOw+UY zgbvFpr{SQOIKXLUx}{Hx1_g1qYBtlJH-m>_&p23AGt^67$96=Kl5(PLfpb|?qT8_h z^8j%tCnKkVm+P#j>NWhv3v?&^XMq&2K9o0DHe+ISIG}bEq=ZLGJG!r11l%c ziBF>WVvF8By%btj1lv6G0a5UsaQ1=s^%pO6IY21(D-iW(QNNFIV|9%pHIw?Ck#F%) znp@O?DKrHJ3@$HyVd-nMG)rt_CPLd@&~~5Xh4EG6FJukgZz>f-HCXh5GA4oCaC$W; z1irmS^=#=Fk>2b~r^a-ylu-^c_LswCeIRP=-Dwj+#JLTezKYe}xLHs@#u)BSjP7%e z@7xDm?>up_WgmrQp4a=SnH024Z$!2IVrsc^YFJCOPUboSTG7{MsUB^7u3IZ9aM+Wv z)(xN(FRGR5?To#pyWX+)KWZaRA$<9&-R5EEmRAYvnVNpWJ0z6zXX(}(fQyaQE!V5N z((<|EPeMO+T^8WV^L#@HrxZ3IfF9-iHGfy?3%(%xdxme~UG{ zHybopNT-qw)SHDP6ye!hm;_TTHyFh(w1`aZ*ro71>bi{7u^->e3Dt~V!QPVcPUG*|MPAbO-!csdVo8q`Oj_&RT&b{^BD;zfhHjCfj* zz$pOGHK%t6_dEl{K7v@d_G4C~H@*Px%IhizI2U2(fd)^fh}|IO;eEVeXggaCLn)|i zA!O3o5S+z$aIXX!G%C;EPf_E%Ig-7=uAdumSvb-fh}y*LGQ|2w^Z4ovII|AbAPNwA znt(H%+vOJa-P-q;D6mfD{JDoX-v!Md7pa1;gfNtjA0c#{cAXb#n^wXW-7AY&K_?Lx zzdNV@^|O^QByz>`^4yJQzdg+0mqbQTWF`k{7`1Dj&dYN>)@Zyq3c3@h6z~CkgocGy zZJ0Z&;jIWQhuO{5d0FBSN^GTG3|Qr9FG}i?^-a10X5><19tw{guoLZnW-p3ih$;`%UYs6!2zEbH+jZ23( zT{qhDJ`il(!6KS$-SJ2x%m~Ie;j)|K0SE0!ESl|veW6Y#gI^|=!Esjs1UckhOJ4`I zRQh5B@`3Pk3OWuQgPfr+IAfVd7tX~-9E&e*f6cYNEFVvcmgnnU>rYzWF^{ty zGh_}z#T>y%G`vmhAktTWAv|1FB{STVdf@@^8QvYQi9>Zg&urQ$DY#B3ik8@^tzUK7 zZUp7* z^s2N^=?baO8FYiFRC-r+HB%pIk@GN_4ho(g53rUE+{X}s5BlFM3og6y>TjiL8!lli zgfUudYzF7CP~q z%?cJhhzj40U3!b=#<^a>jh&*&_xL}|K*X^LgkYnnlJS4)|J@w@=V$Mkzb?HCKA>Oq zh`<-F_bv2bvX?kZ|A8=>>>!_m0yqq)oD{Pcxtf6l>=*4GKT%<>psW!Q8Wg6ad(5~B zHaeP=D>$tuDs$8PW-bZ#EHG(UUZR$&a8HON4jmFHo&zCq3as7`C916CqFXR&d`<5< zw!odKv9~88%MCOh4qdZE%11vAf5_U#q4+Ez(|4JV<^03*dm%(f_zSmpyvKb(tl;~z z%bOBp+BDyj1ZSiKA}N#BFMB=8JY|63E|jHz6T|rNDlZ1z-_jsaEcF=ca5xlwqkdPi zDHjj@%QvZE=>#^@Mu$h^!v=bBq9zw+mmpg-H^8$TZqbjP=m!AqtQ_~e2?lzYT2Cw^ zG+5J;tWIo0rw3?rt4fTch2t{ zsfdckloNxc4Xqlj%T2&7-Rc))tR*do0bZhUK%NJR@w{tbH5*z%m-Z@s|W>uFO7pYS>oQT)GF7QxvVhA4{Ky9N2_a5*BI^kNJ*JAb)z-dgm?U9kRe4z7bPf1 zE=lbvnslHjfX7!lD8BeFiv#FVT69Cj=0mL^pT&6aUzViTScqSGp6&lyI5+BxXB{rs zVG&$l?{m4H7s=R0=qA!Pc3;1vAE5jJZHgcTqTs&2@#iL-dRxx>=OK2rf7H_`snI|V zuOSyevjXk^G|19u=OLEa7c~) z-rdqdvH}h9#qC`!JRuk`sCaLux$Ml0TMj8S!b&wOI;w4^ACx9?!&_1ByA%$ zq`_jCV!gf~FHomW3GAKj-g-zYBK5k7V@MUXg=0$k^(X+N_W^!M0vSxZW?ePKFur3i!r{FYvb!{_zaZ zHY%^L^F+5Va!+%FD2-E2f&HhOi`=;N0sTH=|5o02RAdpvCiut4e~weh#PW>yn^5^O zb)5jEy9&(Lqy;$Nsii;IPFuu7%P|m{yiV(GUZ1IV85k=o@GnD8qn_DQ%9i#_yK`UY zEd8iTA`@!F3rItHEe1`|OVL^|QOiJjwv84-VUm6M9+b;lJ&Egcz-@T+aaTrNLrQvG z`t!UkZcWowegRs~0_O{mX%t8T@pEs(u2+t1EYFb@K@zNCG!c*Z0W%Wa*oR$ueCZt} zmttc27iWO#I$%5+uXjnwxzvcuL`nt#w?U;UO6Y>ds!69lEo!D4Z?J2P%lzm@0N^=# z^?G5rt1op9)GK(aj^LJP_Lr5mrbLuu-Oz#4l59ZKOH}*q;aj{L!0g%?lK>kP+hFs` zvA`Q_1SFtj=!T-CQKYqBz{X1%eQ8Bv=v1%F@DnD$>{#DlbRl^s&KF~h>vEm3{;1^i zC)l|Rt$>eY%NX0)3SG^Ut)a8v{<*>h2x(&0uX8W|T;4pGSG5QH>-BY7W!Tk*qd9W3 zo}5UVU6fJB=$tlb8%JSnDIf;>i+vsbtUpzde>XrZSkNNCEbq-J6$fKN8qvceAjk7? zI(Cu2JM)8j7X>Tu?{Fls8cP(>EePCt{ezBZe@zQabK*k1rh04F(o}RCJwzLe4Jpzo z*5XwD`axzB@L3toQh89&DNrjUwGZV0vEIsml4tj!x+(%_ku^Or^tZJnJywAQFy(i3 z(77-5%3pmyE^?hryz>zBp5 zezUBFiDGT`@>JO(ldZA*fxi;Fq~mWAcQup+(aTt^C$ zamkiQ2kD}(ZGaX+yJ^)Pir+LykM%I;A68p?M*udaoWqjhA^uH47=Bd&SU^uhtQ!_2 z)JFaEH4_4nGL1L#ojd7fBE1FT-2~>S>q0Gqs?TP2T=GN_%N<=fI_PPJOq*H&P0Ahv)6f_c$h(nJV`|_}`Ri#~&n1^gJpSud5a-6)%G?*pp_H`!A1t zc=W3*hTF~)G(K2wk`4Y)a_uL*x#^Da=A1OA#t=~BsgRil3PDqVd0hQW%g5CS?gfW? zmMS!cAJ(fME7mA$!{om^&JSjh+~_j?Ez0`Xa?{?#Mt~^^-9Nez zzjIVi%)t;HsJrkENsVYAxD^4_aJ|DkNoYq?$*M)>18TT+JlN~c1-kMj;1a%?9HlLp z_Gjq;f;)9Ad4-H=IPQEnxd@*#(XO$rdcYsk)qPXvB8`grJxm5N2Kd|>7SAgUk;+40 zbxI~x`PR4bx83f};zOhBA`1b1!genX%1qRkvHt;#G0HWH3yDk6C8q)Y*UCyox3l{B zMP``0mehN@j7e2lPZBCGJ|%6^8FV*Y3U+`q9p}wSzD6MJ<`g6%WvV(C%*mSb2vt=~Eg84k%4!>6xB! z2`-x{1B^3gS}@Dvs_YjRvbXNJMhab6APlmldsdTT7&=4g-TQ+dNr;?%GK|Tmk!Xs0 zVC<5H@`rb{@b=dppk#l$0iE*BjE~k9Sn!^eh+ne%Lq+MEv;@U;%%=B@*&{ zeE>Kx?Fd57jK?t;vLOWM+6Nru$w9b^OAesEdPwOq+9lULxzGo?lw1}=t)@p6T9T7} z2-v1f3T!B2POCA8g2x3}-ZiNAcv^F06CNbu!OV-WX(o+<#XwE)G*2RB*fD!2l+dJz zg+q|}RVyJ?1Il!xu)82Zg=h0f-7}K)v;x&azP&RQVcgzqZ%$vSR=|_7Czn8Gh&zRw zUbO(Pro;ySWt>F_;RdOd`d~2ZoH%PkGcQ#Xu}JSoIC``KbQ8PMNwBQH(~^5gIBCo} z-f2izL!M%L>(0SFkg8(Q6-!sq_fI>Ry?uV@{(6Y(=>O3=#qhcoaVUaoxu>XqFINEo zQHgPNpdO$^{#lqCo;sO?^9ZF1*h3mXOER__1rC=P=%-;?=HPoS;4J4IJ!eP1`w;tL z{4!xH@F`FT7w+vydS`rl9`dHfV&C+OL_{WM!}nv0#T+63n=6v7cCU;;q2XqHfeEKN zfl`z1PXqI#jxqM8;JP~sNf_BCwb|)N1B_V=g3e5;VPnH0e3O~qeoWI~P&(5R$;wJK zwE_*ZALjee1=}RA2E%0wcySx1eQAicgMi9RvfDNj02eV zsTumH*ES1{b;{QkY25OdxwVMZVzV+7O6A6Sqv7PBXK;|;oCdw$29Qpvb1npw84!m1 zAP8xrV&3P{0Q7&3>w^b*0Qu?qU&G{k36@-?K=4&AF#cQ-WIhamcILWel{wj0G8sIa z%Ds^=z`f|RJl6RlHHJp93#J(&RzY)4`yur5NK8Qg)^yUe4|S}}sZ|O2GM^qE@y>%( zJQLxwaqw{iC5tN*qj-MB=8mGhb`3jgmG4u%O)_hrgp~}wVH;l8(U)B9iAiaAxWj8! zIq2B(7V}^a-EX#=<4-ph8I6rmk#3aU#P`pA$wbCrw%T@D%T;m3o|*^0?4CnzoZez zM@zCY9epw|PD`r!u2%%wut+T6|B2{{688v3j>VWle3X5%-VLypbcUd(Q4!&(<;s&- z&s63!>eRB7V~Xqf1G!T?$J8hYnqH~H&rCl>^Rm0@=H11Urw)j1GAu@dvt0DVC{p1C zOJ0(&gCNXh&d;Hz`Ox)G&u?nP*wSY1;y!K#U1$@fFo4Xn`mniwQ!uJpTx+gDb1ff?zdzPI%{J7UJUZ`$v zE4en(l-OpS3ap@Kn88bTp9oB8RazKgu}^2T5NASc zt51QH_+#F>F6&8=CJIam*`XtiThg1;MM1%oVL8f5^w=xsHkIoEC4;ZWk9@8hM6Jdx zvHlF3#xT(?Tl|$<{Ip*DNvswjGe$9$j$@|}IJjjheO*Q3sdE(+cL9T%BRt%X-=uye z<)`OXSx;Wvd*KUK!oC+?+?1r}cV&ul<+010V<2g2$Sc;{~c8qkZ)K zfh^xVsv9@i1`f)REu}iO6&&%&E?nLxHp(o$OYLEDv$`9i3Y#aBtV=muGJb2bOERPJxO7F{TSe$7-s#2lNoDf<*%u2ZFtf1xlKJ@0 zryPv{#GhFnVgAXGfUFdX)Q#XDRzYVkb?Z$ud$aB37Z=J}{2WQ9ei1S$H@MmIgDBn( z*s8o^@APgFhP4#^OyQ&rxy^hdX$LIutnN!S|6})exwJIn<*)FFq848@PtVyO3<|^N zj+s+TL5pba_XiNRd6e>*Rk{qADU{F5ud5>2v3@9v`~afns!b{^a{blq%v9j-2d+V~nTMbeqop&h1C1PGQA@(Tcxoz|RoqMcvg zQM>yeC>%mOLpZj_9m`V-SQP;06G3{O;guykNmvf)W{>>^W$i9^sZ} zI{-*q3l1op_!qy_GlQR>kI#dY=ruLrrDe(BT&SQ5ae$Ao{-$q{`4=Fod?pMn5%H3C zKo(>_pKQ4?rIZMVH1F%nY;|BLJZzS zJf?V{6(A3_pbK|<>&0REO1qMVE_-q>!tFk683&yF){pu}N9%eEH0)_b^OQf+ zT7Gr$nWLfFL`p%78r=G^Q&p`W{|R%;)l$P9Kaswg=2uMy1NX{rxvJGMYMHESrL2{BT*@S=~NNmYDhgNdt&2Qgx-ucz7gZu1z zIj`zf`PZ)6v>m>+)(m5eIaabi1hn-tr+D_MhHp6%0 z@(fvi&pln3≦~xSm_FASVn0XAXxbD96lz&@HEja<|`b8OC$I z(_N&`9ePZ0p&N!^Euo_h(Bv~+Ji8g2|KKAnNq&$5Zt;!X0z2deDLj9%sZ)qS&Y1}Z znjp_IuEP#oE?p~nX(94EEjfq%mToqmVUZ>z`T3o*EOaW4d{j%p$G$PK(tN8EIi(&F zc2J8fM13v1hYy;|H(;9600Z+m_vAM;Ne`Soz@~RzwEECV%E;M49)GIRFsRJhirAKS$kPCaEBFXG zcvMdygbMwv#^jllZOBYs1bgkRD$j9~g}c|s$`Dwyx3}1~Cwk_A=8^np$-c|lV1s1l zDB3CWl-3lqQqSmUTLDO*6R?XeeL#nlVUfL~sQ2L#E9kXr1ln9}n<5o)TR{i#Yh>a9 zs!MU)L8v3xk+J|7 zaPh+i@jbb@=XXGh-vxTJsP1dKR&!vs^3&G=JXPi?!$!$v>8DN7Rk}9k%|-oiT?MIa z{Qesa&0FNRw=0^GtLa9=FIGzLoolP$5dM6chWYobFfo%d6TWk%K+`5?SMr9PU^!XN z!JZ^SZp$C{eB*~h5mL@0l#Gu5JH*N&=!CgxullSFJ>2}7Z-G&(3LQ~6a@p{LqN-&~ zCE)R?D~_fY9}*QFJ$d;PXtY^Vy;WJ=Fr2!@R|JM>$4_Lxk8$Z{?an&WhCX;bfI~~+ z0`zOOZViskOzE&Xvv*k1$eDY0oMQjiI$dGBFQ{=zpT+d9EBaa657Q-&U}IKU7WGkg z-zxnAv!~F4St|yDdovW!BQPwZh$jUHC>ftge!?WdNdr>RA+y0L9p9K<7V zFhH%=z_%#`B>xMkw>#7eb1++`2|^Q1N3Ov3F^{nnceD#rQfk+l+{oXHU^UQO1i52-Xa}&dtq55+e!x5EQ$j0p* zOl38gJPZ!MuxZw2*t__;hyT>7BtK?n!@kF0cg(sgTXDv7Zezt;h7_aX4<oeM?h_5$f0h23@P;#z2{VvK-ZbQ2G&!oHX%c`wOZffZp4SU@2W2N(9M@1%ZIF z1$)1$#mEQ-4AZT%IRnQTvr{F4c(BF--DUZLmC}LKE?_H_T1ZMp%loLS9{_l}IM0Ra zjY0{L(BmB5ex)k@k8@aPO;_r0YP`W*QITF5Pe>ICr-|-7qQ%cv=jsN#tU5Aktk{wl zp$#_5!8}L200y1e38n%3=M8_K%w4(k0{VHeA425X@mFw*S1YuL@CfoR+y>(jw*>c} z?#dZztZhXviKeg_>J1s1xpq?|L3OD!FPu((>S3IU#?hL+Y7+Zo-LqV#KL;(EpuS^` zgZF8M@~-dC*PVa_`v9g`HqaJ!#j|P5h>QkH*+7fF0{HTG5ZKRxW)p) z&T~?LQ|FK#t|O5%u5Jko8@@wUepQ#~R;d65QW-;_MC`=)vIJ^kZ|tSc7Y?U3$~<(> z4B2&p#ULfIBeSJlV{7e(i%{^WPD5zU;IeqVnS;?Ce{Lw}s$>n0iu^#?-1j(mBXEZveB4!bhU2yfk|retIG&g>Biu%GJ!MuTwdGg1~`Be6@3*@rDuBS*WR9 z<{uIXF@g-Dv&dSr2PB+XP!nk68A-FF%a&I75ljX0R`<>;g8&i&!7#u35tQhV$d!lI zteQE|)*aEA*pwS|ZPg0Ym51~ipheTm6vCE+=a_9%w4mZLRd5G5Z=7-dU)zANsWf)c zVeE=i5v%VPf(sbnUxYhUldm2kJ|t)d<}v0IJLhy;_(XEdzP>sfgOzM3jb=1!j*Xce zzrt-0y;{mcPIUssbb)}9s~bkFq9^6kDAt|Oi8ED_?CXP!qqMIKA4>{LI$C=y3~iH> z(+N7f2I`1q>GHgi|IYGkSKHl0!Q%&O`+n#;Whf0e0;~lPB2?aoQvX3v_prG1s>@mXS!rGX9xm>-uT@pG1LbnHpa~ zjze9P4Z2Rw>56zB-uRj#UYqaVKyfyqBbwmef^VqjcaG93;e9w5`CPZyuIGAfJPH(v zqun(;BZ1!Ae)aqsUHPW+SzV2r^6?tkI!#A`+Y50-R|i;kX8nws70!lVUDXyj$$k8s zlNE%Eou}t6fwffn3OHB_7sX42c|+-}uyREY>>6zQ$~tmMrBlYf+?m{nOp*vgbVkqT zWS0rZu5i5uA13;VuZJBNNbQ}VQW)Yk-l>8)g-_9OUm_PLnq94OU_?oAd5ewp-6v)O zD+u=9fvKiCclswR@oq>Zi3;x>vc@*~G#t++k%a6UiDUJ_MSyb=lb(J))i{#zv zvs&-nj-K-nLxj!!#3QGN{E{V&gFc8=8h^xCOgEcSGaSgt>Y_dVC;{p_1|KiZM87=YUEdA4(B$LlH zKYXko?lAQpC(wIVW3$`G{y|unYfv5$(X@$ zg`Usoe3tNTb^Go@Rx3Yb+@<$;8VKcDNHe;MZFSpSc#~kXKm$B2)0`=JUuJ-j0(v5a z_q+LM-SehpRqYl&HE9eBty3oe)@nKZrP&})tame11%n40FoFGi;EZ{H1eM?Yr=Ea< zT8>g0b$({7H62@E09oH-d%CHJw|0F(sb)|ZPpdWOzK|3`f5BDn+L6pQellDh zu4qQDNOQG!m8*NTiOZQ0Y%sR6@vs|l5}poL!ZQDiJv};dw56y}{&r&Q(l*~BU?Lsj31bd|>x>*xS$P&qdvHPP`YLe( ziiI!1<1NC*AW?<;6x(yuG_(pmXQbkN30jE`Qz+gpDB(_-ivrQx3O3I>T_rg{6cX^A zkL$)o?<;0WM;ivn^0}pt{i0NeL$&0hTvpUg%vCt`@blJUOILA$g{Sd6@N#BQ{6gWCP~O`Od%ZM9`gqZV!96VY7FY_>n`Au-~v=dHvq+RF!oghf)}E0U$Bjf!&|`W4^P_1V}e!F6B-DhMGyXArrQ9~e6q+7s#=k;EP1I@xzm24KDXLO2JO?TW*#3p(B47 z0!qFRL-XnnZo{hw9YJowm@&V})8Q8f6t^|fGom%ut`c7J1UNM1U8X`f@KD0J4q&{} zKX`tEP(|w}i2R+cvO!={4AB^f=?S?=hAhOPt{b{q&h`Q6xgJWZ~r*?Y$!|Gl6!VC|H|tPjR5# z;r1+*W+C1=y8`N}FkSk<)~E+XI$D7nn^z-sBSQFYkA~^m<(WvnqC7{Hd#d7)0>dLblRs{fZUCCVLp^{w#q=mi*FaJ6 z)ooCX-?vaD?`diOXK>`0pu|sJ{q~Yo6-F#?VFaf9Y4mS{&LDw+M)^lGEV?B~2t0cI z!y)V|iTu1zMlZx`*QZ5AIlBy#pEVizA1gpfaQ7j0h?|!r=k77VK_>^XR{E%k%eG8BAi7>oSs4d; zd{Tvv*?CD@_0+1H+z$%^Jc58C6jTncH%_9sX4jaoB*3$;`Wj>c?4X7%vFS1*K^-O^ z&VwD?W1U;$b2Rl{CrBuD(^cN@U^V(0IDmGTcR>t(@63!E`Adqp6bwZlo5)1O9&MAc zg0ighwa@22E{X8#w>T0Lel86I9Jv-+Dltbl>TLdSPF;=HQ%C8AuK1%6Eo%y@AIeTD z%d?#X{-~+a>lF@QVq%vIa27&0W~im-dkKacEDUKiJ`^j{I+PxA%twGYU$p%fkO3p1 z+^BKt6wh9}>i~W|$&!Ufv^tBfV3%2ImFMFaIjy)#CB;p52U|jTjzuO?2EN`0-Pn$R z59m~HBM_Mi?@!fJ67nujwbg<<@4*Q(URl-m)Le)z0r92&icqwO?wnr#kk=6T8hPPY z6(_e*$@X_#e^F=#Xtj6&s3yi1`g3X1rJySC~N{lItS5+`?6^N zeVBZCM`<&vQU|hp zT}DrBGBd&;kFklYe1)Kjy3Ua)Rj%{pRnTS4ZK zA3^xl9D{*ahI5~o9M-QI%tSQlKMOS}67{?;JH1jDcUBWGg*~0LtuRyE1-XT$BWTR<_ zacbJS!UY7&uGYXARao_J0c{rb!B0<~CHS-R#Yp^gh;QTFBFkq8D7!~CVmDy&%og0o z`GHb?q%mCG?oAI8cLcQm$-Zj<{_*ZwYX_vN3H=B*SPSPNJuF*)cJ0p6E~t1PSHfeL=?k>3pPk_{10IOd~OU;O5_+6CVKWxD5BJmO6W4v z@lhexu9+7|2KbH+|J36zkrU}$2~oVH*1FYo2kL-1@L*cS_hNLPJl!UaZ7p}TpV)i> z)^ri`H}nGWb@k)|sX4QNH>*-d<0gvUNT#x^vj@*iWHQwnafwniJUp8SF-oJ#bNwy4 zqu?vx6ykQ3btF{m8c(7}H=)aDl}^EdV@3bA?DLai6i)MK;N+T5?Oq38+z60AHu;!+ zSL?ceNV}9F5DJycSaVOEn0a0B;5u1s=KYl!qqkcw!KK|ovvEK+wb=f1{q;{;HC zFn{i)H@8$Cxl>C&K}a)u<-!I#VK&R;APldU_>|qN&bOw|Hy#vNas`a1i4VQCO@enF6=j&Qp<~$WdYX8&sR19Sq{mu zd!2>$w|Eg>6%Ul3-q_EsRr_x}b{aGeV6vB`CP#%8pcY6uo!GqRq=(R4#^y z(NQ_1IA;5vJSH^B6eZ&3&{*YfmXTxKviz`LhD$;@BXe~NMo#piNSetq3%yJcVr};cG=y4!T*%+DGh0=$0>GHUee>E_IoMtiK z6wut0g7@WjDRa_C-X(u3Yr0bzhjoYUWIVqtIydC0M}!`#et9v}H$V@WlQJgJd8%2M zOVynmfe8FXuj5mcCW(G=FdZ5kiK~;++!0! z1jmQXH{!;Nm0CTdt|dTXXJb+}I5Vi?_g&Jl^B}Pd5_#_w^cJJ8T2Di*}+pc~b z!EfaDJ`I=Ipn$wqs(o^=6pzX|yksTb{%XFJA6sTxl+j)y5z|yaE=)BbvrhXiP2!M5 zVAYLEFsqcHk=OPj45qkmj{M<-Hx>m$*CDWB}Mfh8`A4(wp!j|GU* zT`(AF(h_zoH#3--tlx=~RyhR~k2J|uFu-*t&@zTZt5v|C-Q-+Xq8C7}#;InM6?ecb zBhptck*Z72sYNv14_O1zz=xYdtJUkop*0~ejv%sgj2 zw<&d0@KcIXoW2P?(7xr1hMQ+dCt3*TQ%N{(vkLWtf|bd)?%QdvSQZY%JGu!6V-N_- zV%pKaa|&)iX)oKzy1Djp#kwLN%1&J%mPXWj#0;<(98@Ci512>vo0FJ*+q^7$zzD$Y zO6ANs_JwunaT1axkKBhD?8oIL%r>4lv##eiql^880E|YZ5IQwey(XyS0>AQdglUo1 zxT%*TA7FF$`+X8Bw!4^@w?c+@GCbChzGo=61)m<9tvf+@Jr?{d`X{zQd#Ujnjeb^k z|C+S~p>QcRT3~YhVeBOqS;2})S{Fcsk%7Q&u#%<_V;{$r9eSkU+uJ-7JBj3(51b!! zQ>hxy6nL`wJ5)zKa^;H4T@q1P#fSM66iL!kXZy^FyCg?QXuD4>MMlW`I3@(m%j*Sv z$XOQCe0Q@V(SuBsv88}JoP19E(TC@H)yE~8`InAt08IOA8yF91jm&jTy*twJKW6Gs zy@=S-QlR@jNUnRJ%p$ifo)QefAEgqScOCe{_PwO&(Ii`YE(Pp>3Hf^Ydv#4T)hdpt zM)VOE%?h{Io+Bi6UKo|q0crO=5xIM)4Fb6^7E?cA1H)an5qI4ksg~POOWA_%4eIQ( zHW!@AKd`;brmR%C6iGpHjU>O6q=cBX=G%TC{;uBxUabYVB8ZK=a|D}{Vre)6;Qsh_rfO;s z&C%D+S?Kc4DgY-?J+!v;Lv6yY^RuA6kE5P*wRL^QAX*`jPLiFjU68W+bODIkB8abd z6DYX+iqG^MJ7y6OJ+UezYw;PPSIMz^AZbm+M*C{Yy<`~@HvDYwH5ID?Gfv!BeSNBQ!eU{iHCdWID+#9&$Vlgw6=jx&pR?qRQ+-yCN`! z*k6qz{hXe)*i#KxKwK%>@mYcE4~(B}QDr4bJmHe|Bc&pW_YpcuHJs!afj_jwZE^E> zBvD;0TNf=V`y#>iBB@nCrvX`_54f{e_^4-U6hSEx>1}m8ndn3=NN}g3bMTV$ML~^~ zl+A`qopD@?c5J#MMl&S}vSoi8K4_vgMOyFK#2obVPB}uy-^hAT?zBQ4e!Qfx1PZ`;X+vN2d>Kw0nI=21Ia49r5!M zR67|y8(W=~-oPzXcnDyv&CycR>%Nl7!-NlJ0YucV%ay;Fypd|nf)1SO`Uu+FX|(QC zl}aJf2Q`c|>Kex^CA&_j!X`!w_+px)1iqVWojTU6KvD)75JJwDj& zBK&fv>Uk9>hIY}|d5MmkrV;m@bdL=vh~M(&RR%RZb1$aV2i1%TiKi>8(C(C&O0%hH zvZD*syOrK7C}q5x-nZ0$b{};(m9lW^M6SW(MkUu>_}XWD1lC||&Uz#KaW&n!;pWSG zl5E8P!L(&J!Izh8=2GQoOQkB|ZtCpVsN&_fyn2~(Ufc)dB$O!^B?Ty{$xVXmU_?LE z8S+YZ7gS9+GJKNmlVsn&-Mh9e|JoCna~H^z?ilX8FzjGFGC=5Nc=N6<$;H-cr!c7M z-3H8XWk>z1z_XR7UmR;X+~IigICcL6A|Ym#4}KDq_+y&Ggf45GApw>oMjoJQulN?s zG-nw_*%YcE2rh)#AVjznRTn1Ts5my+2yjS*uQFxI%dd$N_rRhxYObPSb0V6^Yrvs_ zGi6aS__nBRDv|iKL8A1*_t1@ezDm5-&6fYW%bSC8{SfaGt~Aqbc15H z-Mseq%q`p#wnbo$W)2kEZ$PTk>ejZiwWy?9aW`+Z*)8Es011!U#nD#1$s(#-wlXd&keRpOUn z^~+uiRcJ(E;eT3L7F$R{F|xHdT_5;!_VZsKdVPDuoQKrM4WPaHfaYM^`|0{^)Y zm4Dj`5PX!rvpR*MB=?=k{GZxrO5A`hTYGerUVp|5{B&wjLm5^mtk*{PGv9c#|yh`{xyEJZMws>B@fNu!# zP36N2M#}Zf=AuqT`*xCBCti=?=dai-3d_Y)PCsRFoV;kG)r?m0-h?@ys__;$f!XFt=0beIz@QK$y^8iIXTYl8=AqWie1yi3WTJ6E1It=bzsP z4HHyoeI~E2Z|0LUmV$C48x%g42AZQJh*{LnMJW@;)@>cX8z2RDVb?9TDbp;f^dghZ z)hU`rXc1&|Hmgf|ytEw~KaO~$J4JmI&KN9$JoHeBZJ``r64D6vOk2{8+gA)n#xXeyKP5MGy!*3K?~ZZ7j=t8a}%mJ$2=x-GP@`PWF*g!Rmc7Z$&m`N5z;` zH^u2Tc~yJDvX@5})-yov0gPz5Y2asF`)}q(+H*GVLfH%LKvRCWupJ7CN)(j1URYAO z-O&&w{6nZY5cIhGIreQ2WtS}gsworYk!&z(S%Krxib2G;(F)FIr`_R+QaS4jnNzMV z+HnEh7qb~t{0r4UF`}#MKbXb17b3%CP9PV`VY)3GYj<#Ew>*?poy&NeEl>5Z@Y}DK zTqH|;wEXi!FBkW#? zv1Pn6^ncN>NXR%cAkC|8D`=KMU5Jnzk@|Bh%Y=uYe3R^+pq16INCqYtnXt~(E-<^s zpN*(PU??;~QkEs1ikL8JX|{`}h)lRJ#n~FQ0seRNR-2k|lg6J=@eWVh#br>d6(H`Z z31jU7>5JYee(^`I3?9Haa>c|Yr-#dB)xcx!BbEO0saPM{ovMSkla@s$OIwSoe97#~ z=lF32oZV$@m(ZulxUh;jJtwA(zipIOG)wh-Qi6_%Nt4iFz3d#inbX12;1t}4=z>_B zE%9tc7zbTKe|$?#*c%Nl$~he&8L783hCV(NE=oYz`Uk28JVWKg=28aufcvoNlGmwH zEU=xlg6W3(4N!CA5Ype&CW=xh#T@OM%FrBO$goFpLggUP+Kk7eojutf6TNub5x-fS*N$zrYoGYsjGb1d ze@y3$*-W>4JnNAF6}5QKBy<7>o5hW<7S$fqt|Ih*aGwPq+9RH&Hi$?m5UA zax4^VF3ezX{Nh}Q6E&F=ow#>}dak}`OeSlmEjx!IZXGBVQRAZ|$30*G?S|i>5KeH3 zdKv@vMfLm?j&5Yc9rqd#KP z13+O@^KvXA`Ytdf7N1(KwHi!I3d)C~8zCx02u5v#Epd<+u%;VE2N>)e@TxA}54)B9+~7Sac+z;(a`WGfar{5SA) zNZwq>Zl2?eX==MhRqOzTPB&EAjBTYnG13z_kX&?uz_i1YspXUYHFtSeDvxz64GTAilQg#Uxrzp~C3sMee=<;+l&&E_^5|5mSkn zdd-2EGjZJP(k3#k1AT&XMo+Rl4S>wCs(s4L^ztAoui&%q7vW_c6X#>m!JYH5r4=|C4$_wDhVWB9dd(6SWWorm|wfWa_S6l1J_? zShA)3D8DP+JVkk=8eoZ<%xPYOI3@S_VI5zwC}-PncRT#jE!Tr3obl9qZ5XhPVFujD zdyd;ZaIc6Jd<%oN<}W#o<}TnlMrV7dc+xzyK~w4Np)onlH+2ZX`}v)DHC!}16ZD!uxsY%$(1E2Gmu4iBC!LHr2Zvf z`B>7UQy^~p@RenB3?M)S3q#ec&<8{c%oJym@KkAgUmqv7&9rs{o?R~!#@Z634sq+H zf|RT$m3dP>$8)p7_u72*t~c(jHHcO6qsBM~07UlyOm+^2i!Tu-C||x1kI@4`cK{8s zI-yHgzV2;8jp9TdJ(F*1GO92KvC@m}s1DC56g<18~-un2ay za>VktEZ}1U=x)d3HG%j_N^-`cCD9+QW9}kVl^rvWB07bbzd`!VWu`PqSFQ!RTzH-z zjA6B}SLQm}#?|&2vLSO9g6EExaot?A0MJ~el_y8O_aq&kl?nLcWdVDERBnS+hEHB? zY>7dAI|q}h7rm~X*QLB72s|ns+@ah%b28j{Y8@kKu?x3Ll&W#7$Y5PbM80B`zn`J| zL|~556E&ds+4`w(7!-?lUI(NCBZTEH^Sxa{F0+%p3TK)>MbkAu4{)cx4GE+R8rg%Q z+)uDxqC5yIrLa0y-z&vhr_^lggbN9nsNdkwIIs!h67($Z^n2n5Q$Qkc&NHWzry9}E zd*~-@wtS%xHmUHG{LFdrn;k*wlQu*89m%T5QsZlQxIZ3WAP0efxDdnhwPxbiJa zz#0LA|E+$eP8HLw?ph?(LzYLn(PwPFR8`|%BHq%?EA(wzW8mwTFfId=Ou29lYn>Ez zzwOyz{@U(((VT+7Ln0P3vmP0ISsjV8Ugtkie>1GP`(Bhw;$|N9#ZD=S)!T(H>^BAs zcb@rdedG*~Sts_LMWGlsFvH`rn##j*#wET@Jc#FhLUsr9&} z1{!pyc@=*Ws_1E}9~o87`gm6*KERNXGhBe`rUBNc)g@eH(T>|I8%X^LW8N~UVh=Q1 z%Roju?C)(ER~qJdSb~JCy%KhCmdENmhA60B^5DbGRh&J-z6%_0VgB^7BiQ{t!4gBe zp(I%bh+ujuFuji%9wHY?2rzAdKC~EYvzVQNNLxMp<&jGQ^xwteMeQmIA10%u0hRF;aEV_!TyQUq9wjDxEYR%J#E_cOZLY>ezk3nD;PuQ ztoYMb09zGRd2PB+puCDpk9t2f8vs)k?#VYW0b8ZctE_tPE!Cq4i<^TFK2R=|rrj2x zH+US)j=)TW0<3FF;HhD-p&IEkHB}2HincSPV>CdvY)^-HE^^;yj(l?}`>I$;uR|bG{$u>kQIv`Wl-+8q z3qWx}!KjvXAPkOsm(#Hs9Z-gh(>IV<9IfQ0{66b$zt$?xU7Ev&n?Wu=)~|c+y9f^H zS5#TFz9mi^Cdu+fFss|;XM?Vsv7o40UhW--*Duf4aJ?PG?}0ezH6tCBw%Cd=OK?Nk zqtz!&vd6XVMwRWhF538;Gdeb28sFHu+Q6!%bW6kCg~Gi*i`(foE8<~?#pBT&MX(i(-k%?kVn$i}Dc=b>UXc z(EBj(D5j0>WU3>01eAapqBecGx&RatW}td-g7N1iFt7a%y&koD=|?3phQqY);)xK& zF|F*d3UAs)dcXCh8{^0p6z~nnq zI^WK9AonZrt^t)=1Rr9v+Irx~^7Vtwh*nyaSncl3h(>TbI4;IIjU%~zQsnOK_VpGs zL?JTsr#BzeY6ZY`xz)xsQko^(Q)z0wB+9UQXJzu(Swyaw68QgQjJ&IT?GQ5tGYESQ($Mo@Bh@a%!1DOln-2%)t{FS zdiS~7JR%R)L8+m$P>`(nVjzfyyjTtCuXblFr_|2mmIzpBgTun$dV-$u@sZw+2LT zMe|WFMT>ihzDbnS$%|n`ZbbBk47TBu74!>_H7@X?<1`Z_o+VgDuBA6mf-&1&oVwr? zj|`5{t9bg4PLHd87-zB81GoIU#v~_0aQm?kKt>C3CVsXLY@7e0L?+ zqaBb8ckE_@H4R#c^}tQ6Kgsi!N@pRl>LlB^oU=}%;?|uUo8Z?{F|+Z_IWs4_UA!-l z1^f`3%W#^8rey&eKR`9&fz>V>|F#@7FYO$2`a<8#w2;iu=c%&!R_EAxTfKlBQS=9G zr~W%+Be&hoYox7CbO{;MbYsg_G~zRIHk}Ppx6@W{Ojb#)P9o!Aw114bb3IwD;A5Qc zFcSRyD%G|P;854mZ1ZOhot4!A{k!f&iRQs&KAk)#w%Y{RD5SSr3O>VKpgN>jm4YZ0-c*V&P{4`!#A48 zssC9CnY&&5Hg&GWz?3_i{`F-yfx1oUiQAQNew1A^>sj(GJ1u!H6za#IPc1-m>{<2Q zLlq;bb%nujdyhu;&!mjP9PR}%0>4!|OKP?|=u@&G87Vra-`_K3BTj|y2FMod<64aW zeaePm$wrcMt9(Zz=h*j7?Eh;Z@gN3~WS4@3s@N$<{?AFvy$S(wJW##(Y(8Pw6OA*` z03-QE_X7UCz8CV=Ocw~@+Sdz+@u2PZH>cBgKo{B|wlR4&l^+>T0d>1$hCK5x-k~n~ zfR8|3G@tdJ!vLsz$612*_!<8ssLDrQ$j<^rTL3be9++GrCr6lmdkEL8N zPq9}~gko~U2fG}{&R=NI!G81#(Z4OJ{_(aLBd#A%1-s)Z8%4krBE9$8&S;Q!xC0`V5W%2>|(S@1&EP%J9$E`nrWaZCN31V~;d8Ji&nN(+BqW`=3M9l$sdG+g)(a$1PGCt|!-( z^G&X+TBw4PUKS*hV$%j{(5l5CvbPoRBntVAjz+ z_Yr27f{7qN3UALBz#mvr@BwF!M6N?RIj{lj!*>J&__>@f`HKzyhzrhlV$a22$jh8a z00UD*cVF9cz2OfIYKA~5bwbPD14_4R9&bT-*9`?|7AR^?qPf}ktob2;Od>zOjLdQG z4J=!NdhC>YZ{-&c5%~^E@qkP090K@NCyq$Ry>@#YOq!5#5~_3qYMP7 zWa)@&dp}9p76Y;W3%Ve-gydhgdDM~yP)9TX210L2vnM$0|*RBbK9 zHyJUl0s|1s*148nJxU1haB4fFiuU{rN(1E7;;PIW_Wm{SBn>d21?z?i-x_!no^7zR z+iC~EVFsiXF!NGph0U2OV_DvFcR&`AA%y-mX#S5f|GmDj@F|M9c_Zyw42{%=vS zV5}N?K{&wf7x87U-TS}4*}r=o>J~B=skIrBy?5T<^XLC3f53>G0LKy~7s|dpbw+&pydJ9$N{srO|L{LO zdcPCGUq0C13E?jT~fO8)CQvJKx7Gn2wV}j(ww9mvH(zMP2$~ zyABuY0?Q5!b*s$lmESdPe9dvmE7wal;_olzHzj z{?tmYj33%5c0=l~UP$~OyxG5AGkO(uSWS_d`TW1Xwf1kl*Z+8NaM(Taw89lj+C6T% z`^%sEuW$H2-W9Mhe7Sgse3nax|L6bxzkVW^LV}m5e-YbI2*|^ZIe(F)Lfn zwejw8UF`q%I1g|q%Uw$l9 zII`ODYt0{Oe9n7M7{d^39;l67^*7}9pv$1qQ9h3u_7T?KvacIh|6XXR%FEARwb+V% z0{q;U?~$siNz0HYkDfP*EemyW@ur-60keaD9i}s zT&noXN3Ri8cbZ=ek=@7SfKY9iQNouf=4zvZ`%HOiQvzjOlU;Xpub=p@Rs1`Q-#Koe zJlCn{g7>3N`qx*eO9Yyl!%q7A-kWjHRsY2gWO4(_vp)Cxz5QJ@FbjQ&GQGoo`NZFO zbpQIbd;P0=QlP25Yl!xl^2}#}H4D^z{&;`aOjaLg>IHUwrvHI1A-AusZ(7#7tu4<}I1w7SZ!yn7T578+ zoFQ}8U-1HJ&O4$#~hl?aj+a$$~7P_)ybckQB9%o5{ zmgusPP{2NRg+<3BVk4iPy|9s_wb9<)Np;XKN#In{+N?nJ!YG;e1T2Le^yKYdd)!{^ zM!%!F_FaP}Zi;sUflH=S4hxQs`HUZnNX|5xPL# zAZ$8%q!zO>f8|>zy^PXQ&1|oy_F^H!d_b<)=56g}r$6xUvGBRhUUb#_sO*Q&9VcN{ z%8g}s)G0Zo`m>17W-{sK4s+s$>mhM|#zkiLjZvGPVc z)uC<=qq`=FR+#Q*t2N)()Us^#w2Oq%R}EPKAJX~KBOv^@KVVPnDf0LgpF3H~Ywc(@ zZ};_%e%N5y*K;DfO7A9&C~VEEmRmbi%OB?8gM%w?J2K9^Zzu;71-`x0u`szfou0Ch ztJrk~6SfB|<2Ug`Bb|v~yoIy#iN;1`|KNoD(-%cW=^j2?4GV}nN*NH=gg(`9?ak`z zD^7(M%*-be6B2ertAD;YfN_9l%vO@NxQ~#-k6rwaNfdY9R9?{ODIf6>Zu+z~>NIY9 z?Q=@vh>4HL#!uVGW;P~F1$^?CO`@?2`}d4h@kPDDhfyAyaA<-w70EvJ_Ijo)aGq1J z)-54V^GCTI!$<>urE@O6cK_m{RyTe@fn#!hs?n6HwXS9~{U1%DvUh^2H#T_GBaOYi z#XhHAGqy52Gd?k4RFmJ)^40;(+b*Cs<{f@KEg2SGXSzIU-@+G0k3>P?Ms31bQUBJ| z`72s$?rt`h!yWawmvrq7g669dEXu4$miryr#N%S39yeh$hrgCQtI?(De?)rL>yoI) z@~g$^8Cs)}ZsC!f9x>{#uJeyQ=DX=KHF)(G8MujGGXb>f&-n_oe_uF*c|w{I`F=K@z0s0i!phL&=$fT>vtQ>`WY^S{yTj`rCVZ~_+>Ba}h>Qv{Ha{9B#@Jmd zF1Gq3?o+q=jQu!E^YOZuD7Rf~SbEF-{fy;f1Uf5x(#?zO9B|S2$coC8YD3ld+v@e8 zw9L#*;`YI@{rORdf*1AYlbtzD!=BfuQ*dRd%4G-^>+Hvb4gTKeWm<-PEgIU zV_UYKD+#$ZZL3~+vlGI>s3I7YQUD;Nh8A;iQvGR^s%C8X1zjt}Z3z)lm?0Qsm_B&ky1n6$hqc z-oxDnN=9=T%RkrVxhodW8XO+I29UR%1~be4B~5J#IE|-@I52F~H$~riI8w^TiK*A+ z8_#6(G1Ep>oDvsXih+%uV->V3xPi+X>m2`HH5HrDKXVyBkP7M~Es17|y*Qx1y`XPQ z2xC2OdvF+Z9&h6*Q0KAGD-wzIwD3q@o0^hw{rtc=Gt*^N2E9~JjH~%R`a`BNd$u|* zDlRatVomAiSMy!5Fo7(V&Gn<_lvyE2guZa*+P?=?Ps*F4e`1s2J2kgcboykLYkx66 z>?bxV@wv^lVy&u*gv8ap#-W&l^mh_h$D-T{Ihys#9K?=R?Ra{+H-|Sk#7YsXNx}N@ zz3h|NHzov?kI(7vkRtaI)ra+zNmxoxSKD#2Z=;V~-5kpl&v5%pGq9#Da;Lc<&%Nf= zEAtcg-bun~U|IbL#^5UxyQ8YfkO+U4`xcKv$LC+0Q-Nz4! zWHyF}CsDn{cJjzhfPHvo zu)P{F15IsN?31|tizdn^FjbD>BPb77q|aKxZW_7Rc}uKP$|pj(mXwoYCxOcykD9lx zI?HdXN8u->DTmFm)jOdiT9E|X^;F!}r=Yi{a~W2j)4!no&U&>7emf!4za_0cU;bRa zpJBfAd5H%%<`vS5pR5erx0;iK3}bYsEi-i>=%8(1@bUCX z4G*pjRQ*TacXIH<$$xqQ{O#cRn*HJ>K6-9?k4&`?l zayLBD7JRMM&v&JjE_G8lad9qaQqWaooYZG*$k@}0_t+)VbI0Z4w@&X{MnQpþQ z)BBZq*r-9_`l58In&u4-^Y)CbIxH_Y8gfOKd&C#KP7;1j{XE<_ID@S61&?F3q3O>r z$Ld!mrI@E?xJE`1b4UUjaps4}F9bnQupy*;L#fa{s!=Y%BD`TxTU+F9sxUlSXW*W<%!z< z#u)s=Frm=O%!MnAuifXJecM*cY_H7`z3HGPOp<@IX5#Ab6z?JQsM@jlyXU8;r>Wfn zY7eW)z#2M7_et*88pfO9Unp;;k&7d#QL;tf$@h3;Ciune=l8XaEAENhE26hu1)qKG z)jm5S<^LfyXj05aL~*&Ow!QJ{_+>LzWiPbU?B@50e9G(8_>64v1k+M(TlYEn8Ds z)#EFxJKwGGBcWXg?dl-DwqK9R@D6@zvz~0m0V{1OFA;}ADkm|maD zT^)0;aJJU?d_g?#tj-F5comctkuz=qL*{1Y@R)cMVR;o-7=qqE@9%AX>5u2_gNu_H z`Qoho=*&S3Ap(qE3>Db7wzDq$-HuGuaF6(!bCo1bYk#Gi7i9r%pa<#(S502t_6fTO zt3(Vje<0f&!fV^MjIq1Rbfx`I_Y?Tfa2uW?Z?@~DqX%!|7lCt-I-P6DzJ0T_x9^|_ zPMpfoO)AH!)Zla)37wr!c(~X;4*L_VUZ3xwcGNo`8xt85)E}5iTBAgNCx7)e;6u$< zbL1XqX;h08_f@fr6^{)?M3j*~R1`C)pXnFf+_8ODixXW~b~V_1fQm(Icr11E_w8H1 zmB0YJ;ZL5nLg$&WU~f?5pe^!qa0R}x+Jomm6ObUQwx!15^D4Gd zXNb<)XN&V`NlgyCG;b|^t+e7+O?g5mUq9`x*XSm$D(92HvAJGGpSAA>SG~|)e^%}R zup2V2NM)A4B@_G>jf|O~?qns~4}n^K6Y) z^@#AzxFapo&3wkg6>mN2s{@$}45%JQU-gmGl42SmxQ{aJWp6qD|FHJmQBfybw~DAJ zAS#lRfQS+$OKg;+5)}~1NX|Jo!4?I{A~_>SPLgvH$vHN;$(e=*njBuWbHDF>Z@qih zdh>>VW;vtI;#YO*?0xpxr|QCZVFS^-qC(pjVo48!*!`-$JySG$tMogb$4*bireT^| zCn`H$z>}NoNGTGJIm=(zr%Y1CY3Yg6lap~-ausrAt_Udy7Yn5bHvj60hTyc9SEYrk z7w2q_exXKcH%E&%V1!jEMcH#%*Sp)=d<%VvUUTS~u*BOLg~?`>uL=>|&@ZZ=K4)i* z<=K_HFig95v)c7AuC=ElFsM84nc(Eawr_!}gh7s~@|_NnFbPXb-Ck1yr|)~+!y}&S zP4-&@d14gbXeC;_t3JO+NWO7lm_e(`f}U73xdSUST#4|>PCp` zMk*ne>YQ8;_-O7-ll#`WYUux{p|s7|MZ!$ zhbrhOqSAg#aV}&jB^$bhRmeHxrn2^ayPWK`=G)L)9$74!*p zZfm`I^$J^5bA!Q^cNLrMJQT7Tv?Thsg~mY-#Kl$W9s1orgRq z+CPnTKBlmwiow7B&H>%tPH_uH9*P(2>d-X-;*$F_e&SJ7R{oOZ2FSL~%H1JnGkdH2 zyPctk4g%S4f2_K5crdzXY@8{59l}F3N}6D6q%EjkYeXT| zszXTQL<}@o^5UokL!Z4>(UShw1JB8l@rEe>{^e()a?7Y|X*kHgU~{?quk$lIk*fIT zBUsyM2<_5YgR-{A-f`P02e2oMAl?k$x&H(U?1JQ_$%u;zq-bVqIN`nzz7Q*#q@H<2k-T4iU=h&=1q=}f)GO2uS&cYb~2qPaFe@7vPiHTM_+RA=J@l7 zlf9I{t_l2mpiM1$_eBY(`7tp`Kw!l{gt)S*EK5uYQ!|k)X=#mx_=o!5n>a9XHSM8` z64f{rl8UkgOV@?&nep*9)je!Xj0sMlDrSRQjZsWa3WvOy$5ZQSVB#*afBpk$Ug_%U zI=dWw1tihjHmLNva4G$p9?4^6f%kF(yN`)ZxSft>U(7#{JPwHuu9tm$AA2>;ZR3{g z#Xvy$A$GT>vAS5BAL9sB}T@nKg^{JKhIqEAM2I@$Qyd{Sig? zod{=CZ(4b;qv0S8RHorTe`MfWhMczuYg`FuGjT+~5trco%UE}wC!IXLC{>NL1nYWm zoUa2#EI975PLzIm$b|R&W=zJfaFh_4P_TNf(WiaA8YE~|b=~s<&A!kRD{gK65W-Z# zC9v5R0tY7D#S$T*p*m-v8WN&7gUSj*7d3a%DhEUAw+`%iH-arp(?3wuTj#g7 z6~Fau?3{q6)=Ye_J3^MdG~Pcgh5dn7V(D-Hu2?#>OR=^duC(sdF2l-n`e!bc=BQsb zDW|R7QXlFa048c)Zw1z%3pqcchPWE0YhRWU5a(;t>1j_L_WHl@P1tYgQwrHWGz^y` zMQL%S!jK8br?#%zgM4rav5ttR)^1L_56&sNROIC3I-{PlXLXo{HpugEKvzG$5Vix_ z$TY)cS_Sr(2Nxw!e&xYxk<TlvnLVYb>~-ah z2{I2p(qA4xE_T^uo}O4+Lg=7@%rUnC3Vgfeq4va1;vPu=RfvfBs}G`%APcnz?f;Oz zn02g|W&a69pzN0i#sUKi^m_UN_TN<%tfb+>R*}e{hyhtl2;%#!&DNtP$<7n5jp=~3 z3zN?Z$SNG0uMsko*L>3ZZJ6-BH6g&LN?)zglnbKSg)fs*QkZ!+p5DvsO?PM$O^uja zD$G0`Dcn(rtLi15YaupNM(~ZTS4gL)thDZd4?v2=rj*S63XEv2Je>1?C?T=J3PLyL zokA_0-_2(INiuF3@a!jM`2zNnF3PP1`%xclmy7C*r~wh52;T_CjT|c)PR*`jOpr}j z2y|la9OKf+MIn($W04X@a<$du+6uN?@#f`Y#jnvUA)#Nk+jamd4Yf_DM=NGY4joc@ zUxI}Y-p1mw-E(~SpD5JdtYlx@T({z0Xr8x(< z{_&>?amz6IJipb#7(i_g=iAL8-_CB_Zp8w}S9R-K@dZuEk>+BRDTN2cq%6#fPjJ?8 z59eeB3^6k$yCMg31jeKN8IwWeMS3NU39uU|pP!mIYV>{5pH|$+_ewNPc~X}FF~|Yl z7127(dc}4>zo_+s-JDPsWVa>c?;D!r>U~3BSetR5U8}_05reE}+%CVO?yRIhVjUfv zni~l5`+5r*R%6#@w^nLSABQTN*%G>Nn7wk~Jji@V+@(eY*p8v{c?^T7&Mr8?#;Qm_aOF!dqV~#Aoi3F6rX9muQO8xn+j_nZcRYk_j0#IU(gSU z&G}rXtGzvd0G1WQ#Q>^YP*hZ=W^vov+H(Fvy2{!+I)Fv2N~s0EH(1CM6gTBm!=MUD zh-?j3RU`q(XR15X8O~AiLujpM{t`A$fEEbBl>JEovWo$_`^?yH^K-t`ogGyMtn#q4 z-zj5Fh`!~nR+Poro*u5RawyqHZ#OGq4@T$Q|FDB#{a@xddmvqc~`$nmcgEwDYhtxP5>%|Fq>OW~Wv=2O) z_!Y%v@g9&s%-Weu>;@oJ?7$DW91LExIm=MJsK*4JWMD~W8~*^u-$2%rC4w!U2fZ9qx_nX1Kj|>IzW+f2K69GZaf1p7g*C@}=S1&y%Ylj~XoZmdg zuZMN$~4O)kj8AN)u2*5sNu} z&P(ncT-)!=-r}=>W^T0!;5~joG~pyxk?U#Y(i$D~wev~&ZXhjikO>fLc@1l1p2iA=8|B`u$0w`V}xZi#;oO}u)7z0MWx%YMV&!J&!W1Lg0$7;U6G*Bn^z zzR?>{E$O){r!kR{B>p7qL1SZM?@COSPsTlut@LJoeZInKCj7TP=Rf@6@=R>G_zKb< z^HCBEbLlBX`VW_Cf5v6aii#+pRmfN4WxTm5@%Z=XEt#H4yLV@tLy6l7UQLeE$N`NhMt3r;dE`h}dUv;a`$1JqP*We_ zlHO66O+F>%3}e&hLyWdD7u^NTA8q@MlCasg*!lKdKHO$$o1Y`?=?)i8P@)Bp0a-!h z=N9ZQV*)_Eb%^Iqp^lDeTjyU#v@6y_dj6qpQIZ)eVDLHzAPM8wxeTw5OifWk4jKq% z!7In|p0e)LnQQg1E2yD-4<2{0Nc{)o;3<^JSN-QeJ-%O(zGaRvp4HYp_UjJio4@aPqe2Lj_0BF)J`Joo_u$+-f}eP zs5G+O$2J{lS0xQhag>E$iLR2K{A4&7&!(y_EgFJ-!oeV=ZF$Mp5W@`HoO0i_X~62V~vhqK@4gK}NR% zO9HQn`AG+&h8;s5{IsuTv{U6j?nJG4+z zV7=%^suFfMqY-VI$qr6l^#hM}1~@ zqZK(n>}jDu95m+$`J`P0*ARV`l%CG$xXw=w6tdmxGPo> zKL(J!d10>z ztlL^N#k=@;MSt98iKbokZw>M#1wY*+0$7>ay1kZTz;RQhyN}Hztu?4q<^llYpjArb zt|umqjHGfUPZ4-*d6lDw)#t3v7OiH#`%t%kN#MQ^vv1zL7gvRI<*$k4PLAB`5w64N z#rbiZ`iXF>ssEBNN2y2OmV;dcL?0+`4&akcTbv|TM~!Q3=@?%zb_cF*#{gy>II2h@ zPRGdcm-cqM^%3BOWM~2J^yscwz7|2mVK?9kG~e)np;WK`$II6m8g8SSN=ji<`Ptdb zY+BW%nfJ3Ikzm&()YZq6_Uu*%Gk5vHdE0E!3l=5@Nr-Cj$nPTP5$p_ydmVCTjh8nK zsB}X#NO|99s&+@~sm>lw(t|8aG-0lzYV*3|6~+RE!c-?ph-!Ye;`pxNl--I=j?Z%` z$@V5np!OyXLl9^7uwhV*lni_;{hWwyl?Zsw7*)Gp!>*>Tetl5v*|TR8u4}mkx=nuX zs-0~*6NHZ!s)~!bqqkk=e1oKdsRO}#-c3_A&;e~lT4f4e^Dh8MC5(*DLtiNxO;tLW zxnbhk&(H62{J8E#;HPy87FE1vfq{`7=kAJ1gM6117o)!MWOG_98+KMzy5OzEF$#RJ z$Hze1`fqvFAe1RNtNLM<;o$G}Ok6rfMk8*T(NTa*K+AicRJf>&U)?9~zN(07r4QcC zR_xRQ)L}lM#!okBD+4 zf>`fn3;w9qO^l&f^c`Re#@-n@A&)u(}l(`NR!Gk9`)U_YA6Xx5i8;R}X97l5$Fk@82qq5Ow05H&H*G0enDD;3N|@-DW1+wY-#h zhB%$JAe+?FjuAw-3Xl_6$zOw5fO<^Rw5~VOKvmdCqwjYHi)IX-N0#h0$dIH>!$^)u z@FGABYc5RtXu;}q9C79-wB1VYF0e2@wO6`35fc+5($(|nCE-JvFOUN-$QJ_lWieFE zraQX`%2)n(1gpBM#m4T+fX(^oeoLBctS)$6yQZ7xB8U>UhvGEg3MaFNr=Uu9h>e*A z9i2Mq9Hxry}!-sWF zn9M5bhzq1(-TL(;U7yIYpeG@{FIi$$Q+z&|tSMFPiE_$}9gi6#w3*ieeg| zC=%_n$k*37T%OxZ*r`ikOd6n~A#Nj{|9ol6Z9v+SYS4Z+iuxYw>@sA2U z0f+P82wq2>51m|_J>B&5EG{k8LO(rUs5^;tUjFW*yFK5gLbZP(*BJtM4MyqV&P{wS z&1uPqpYobKUlPvuz`G2Z#Dw0};kgt9ix**bFp+y(W7>O`2S|JDkCk zADBhP^EyhKdqh3s&+U|COn^IyX?Kc_VHz}$^BqW?p|2paoW2&r9zZtE<2NeCt3ICG z3_9FdYKfiN1gRydJ}gH#T$Hpy2^yxpNyUoPm8+a!lsIUDSN2d^4fEP{^6s;G(*OTYhGpN#QB{rR zp;ee`zV|~c`;9OHNbt?Utkq*7lDnhPxGj1;GfSY6gAe>sP@}akdt8*Uk@w`~u89nt z9BENsb&+T+$QAx8Pyvh@9N?~6L7Cs?d>K52d!E8kp68qAfh{ziLR!usjE zAd_JsW!3GLn9k3u&A~LDs%;Y!6CEPQqB_MGAj!4bgD<^0eNweQ1s&Lb1X*(KY`Sl?gBfG?X&p3RrG#-;`>qp!`^GjWfTrM9{*! zcmJympnsgmaN{o*!BcLxio=vf`d@g95)sWeWk+EL+pp;r>;iU82f2g?Q}#C9T#g$h{(D#y4o zYiL&tW=BkbwJa7`%i?UzfY}kRO~en=6t3^0{i<3wYISSpwNpEwLrs|E)w0+Xt9O2O zii#+!W|%q}F55edTk1&+PET-UN^n_>Ha0g8P)wJT0L4XTG#BIBT($5#9w+342%IbA zA~pi=v*_XOw+ztxA>T_R_1k8L=OhU73_lP{eDhsVdQm<&E-o&zMe-*rBQDE zT{{;E^l@a}`PoQ;UW)_V$;qkZ{0wm}nBZ}=;tWXfY{PY$Y9#qZoc;UIt8(!*_d;L! zacy712xJI%qqz^z*e7|9kXi8*nXXR5oq+{SSFhypzq^WQ0U!$E`+BPx9Lvmo729rS z^m$mN4;)yvKIvsig}DSeyMx zrsA@fKm(owH)@*+9nOcMf}E=7yoY0C{ykp)Ez%Ut+&eTxrS4SrdkkN;z)?>R z<9MNQU#TG*-RjTkkK(kEiF@jf@;;6g)&o+f3Fu>HsJWuEx4IvLoxSV40CyuhscYb29?*gx=( zE>|E`{cF*x4XB|37+Qpn@xXQ?&3H?}CQEvYrjlz-O5-Tmy!M!=aFYXN z3n#MHLQ96}aqS|R4|IBDR1AELcSNQBr9oWa0pOWn#XI%@jeO85Fzw&3UXiBbqcznV z(f){!`OYMj%XQJN3mgaEL{(H&PR=&ZJ+|3<UY=3Tr&k)x+pJOT9qKr`l6W1g#ybDM}-@1X8)QJEPf8 z>Ygsj)-hxO{}yh$ig;U88-2fz?4NBF)xOF(XNB*+FU;9KC zB&YYHJRWwalwc8Z&s{0cdq8VD?P{sL0M!L{=bo|I=AVjfjh832F)=896;a3KX|q}I z)a^v0GfKb%Tiywj}+wro?ATF-A=uPe*%of60CDxuUga9&@>^ymQh+M}8zn-s3Q znMDqJ>uu4oLTD(Z%=yPPX*4$5Dm#B`e2VukpWhWC;1p75h}Az`f}9^-jxSD0p|1t8 zOpwtW?k}GxLM1PR5_g-vLQ`(rYIFgb8mS~K)QGM5NEQlg&&>x-JI5&{xpoSU+GPcg zJ&prAuB8s^kQ^>Uk<{zoPP5>kCgNSCqDOU*mPSG z%y)Jyxo%)!@CVy9^(HGFP@P;o8Wv+6J;V9hG?<1m=pRQW9$v)D@7J{dW!lAX#z;I* z0q6P7X)9ZaCSbew^N$?E%yIo%xq-@4#c zg{)-uIB2BNB3|AN&v#=G*@6)~Ak@yMJ{%hwBC0!RVxPsga&8P?aJz^;)8(5OdmtEz zAOVN4nJad``OtskN{+K$hoC^!tdaVD%i3l}Wx;1v5ka)oIzi$T2|iFz_Q52G8=U8S z(mjF(y|8CT$7m=MLjpv=73kjJ)sWVg)j~*$Kj-I(8Ytfahxr7*EM;Y9ua%4efo#ut zyN!{aeRI__Zf|{*@HzqYdWj@ZhTYcG5xGA7|9uxgCCeqIl4uaEtWr6RvUHY*$6Lk%Nj zl>6Nl%9ua8HQyFGXQq8T#&ad<4K6%i+p`PJj{oCVf?-z-y1%8P)?;0dW{n3stG4#6 z@KEI6D3*U0LPZf*F@EyT-t+)1)9eov8_x*cwT1Aw5+|$qD!)Bb0-Wdlh{Zwai;A*1 z(<9|&ApfpR`%Z=cZ*6{Dt8ZajIKY(!Kp0~9QxZkE$%M?+w|@GPq;xWdpA1_ zDp`QIMY9kH?MG zg^Tmx*M1`l#c}5U8o2lljER3a+WGlkisY1yH91xO82iW4#J&8Aa~~Eq zHlzazls{0a^BDrq5lmbmtMU^|$HqnGnqOY3VJp1xw`!e#_lE;aVZ)`}2fxpcLl0CZ zYk^6^eK4MfFN9E#BG^C%XAoGwf$EmKG7Y$JE)c-q4@PUdB%zu@bP z)^82|rhjptBpMJ>M=(b&*2k=@8|ZS#>5BZE7|l_y@)E}BR91JoksVaCYu)hiDu=~b zaioYAdJQ$#lJfogOqO1zk7Alk#l-{}9+t#sm$t7zQc{>MnNCD~EH5&&1Y=@%_8)8l z#aL=x8JKsQz7wKfBi=;b!l>Wex;1$4=0ECpi2z8`m}2!cZ>DZ}X(_>FEWY~~H^_WX zy7xQIb_s+=MRZ(^{*E92zX3Mqv!wdclrEAr3*m3$?DBodB#NGYMWnJ08QH^ z~Xjso!|A|isBMtkot0g9|x-x(4SIy;zaqv;e@p?C$!&dRF1 zn6+cU!c_T$i1BYjEv$)J-!N|2LOP6IxxHg6wJv1D9v>jgstyVc=q(6ME)|JXPUOs3 zxqfF@6hu!II6Y09xAl``P}UX}GypS2r@`b}f%|#S>|xE?T&hB4#Q+Jy8fP%ii(-~h zJ+T}X?_;=nzSA3uVF4N^G#-19^$V9>qXe-I$@7Keb0O!>z5lA(yos6UAYJ{MTA4B!S6HM`|~=(hcZC@?dj+&~2i~(Q$onW%TK4xm};vYXKX%hh=$X&u)W42Iosk&2M4wL|NDEjKNXG%dav1!m?U)?rT+5 zGU(pAujhe}=a=E(VJ(w4mldtt<|y*{ms&&UQE7ZNkEyU7x?Z-ylsqq>3pcTj#pBYj|x%D5G56rmi5u1e7AM3e-4e70Tc+lOwpXz|r%QW9S(-aNV>O zB1^_eU?GR1@ESOWJpoH zkFHu@JKt_QsDZmbk3u|h>jTrWGtyM+^b1Q%Ei`8{G|lwELY&5_Ad$|>TDcBmBH$C9{~7M*}#mCxxrmkQrq`;9)_E1q<4Il z2uz#_zFv&0$EplfkzEOu_wVxi1_=J1F!ATgO9uwBIrX)30TJs>BQQAgXzpl0;V-ud zGEMDJ3J5hQKF=M#zKIE;M@w>7U!^{eyJlO1>2bR3EaLz8RPyiW!{4hrNR_cbYp>Z) z|Mq}g4-ILAT_BA*W|jf?U{Y(-_Go<9fXBV9J{>blKM=A3CM|gHTHviEgf`0sXZ;yN zYT8W=NmO}~b37h!33i5ynRpnk+42Nkrrf?su9NTXr<-nDDz&B!r(W9jABfV!I|B~6 zBJh8<_p(4-Kzpxfhi1;9V~U@fB13=<@D#~U!pFQ+5Dzy2>{V7PmyQpyb#p<2vznW( z@DHm2WLytL4%_5!Wq3DTz{dxI2!DIa1ZQxaj8GTt2h8JHoTkL#Qd*X}BV>Y=V|s|b zc)+ui4hB$mU5!(X)>cuGu{1krtN;MU-YLAL2yN*hMG~)rLC^a)PMC@BE{@8K~D4~V`CZofG5Lpe>LLuheZ#`+HiFga0W{VkWMJU1Gd1aCjwM|P zWo%i`9Pr_}v^rnW2yXgtd-XY;n`)98OrD&I=#{d>30X=iMGlE8Zzrg@N`nTPzVzDB z10dtV;mVzbuePhqO3xBT7}P_Hwu;`PC1~+ zz84Q%Ese*OUNi-H^`vh0->{o4>6jt|P5tukzEMRn>Y1_Gt^tflEh}(KR*QWbF-p{i!xfv)A<|anF}3$0>!~rz`Db zTB$(+J-U;}nKcmXc{2Bh>7A{4p635?oK;p;<&SAwT3TYf5tv5CL&Kf~_aT0Fm#7yH zX64W3&RNlf9%;@(G?2xg~9)83Fh( zHB~at<8upViJrPcoxKVAf*ePy`9;UG#W@dm&-ucNy56-Qy#GWAi|b*x zzeLg7hhFHnP0{`>061x=vU0tDAd9LJDlI2fSj#sn?XKDV%JZrV2wfuBSF=T}gKKO! z6kX%F2EEFZYfHiQFR{f_uD%_l!RQl>Uxdd>)hg{xPUawIdfa?`EtsVq0ghZse%l9B zI6R;f&H%Fy$D4JK_V)H`_Pg-vMIO|0N^o;~IP-O-mJ4v}?mo0$1*T-jj;R-)1y6G< zwKYsPmMvU!fFLF;EDWwW7{FXfqQU9uDiitDF2@FN59 zn|(BWcz=ISr)HnY;SP0xrR^$MoLeH^qAkcWi%C}=vQ;JH4{)i(sQPoSl} zD5p(Xal?jyY}d8;xz6vaclj&Ytjig|!iTp#sR`$p;rli72E?F!DrkEP+aC@ zG~Q_=l~oLtRnWXR+pZqWbp)8kDywoTJdW$ll}w#1D7fy)>Fn@x6sBVi`i#KnbcZ=2 zqFIsRT#B1&R75{FzRI5I>`ePihzuKDpug3(ZMQbhDvdJqnhqF)GMvt^;GUJ z2R*g7mCuKE#|c5ebppL>aWmvBY@|+;w=+Ig|C%vv5ox>+rgKQYTJ$-=q^MmUyea$R z$NguvAu(FeTaUj5J%ZzBPNz3NwLa+lQ2O%f3jF)+*4OaPutDBNN>pvIB3$PU(p0{2J9ey6XM7W;cPE( z>CF>ZxN?5*P+2w4p`EV1L>;9--|fu$)TzA3a~=JcU23|>)(earJ@zXZHs2l8J1pVB zKa~ba!zWEO`hKi+bI?MuCE@ZP*KOdB|6@-2e{a&_g@U({Hc&g2?!Zm(U!8XRvF^+m z8rP-l>$y`p#4!+PyTL-j)X6@kQte2%*l_1f*_6w!_Xy8=r^y&P$8ZVTrb9e}+1 zHVc8W*$5}0H_&id3Qs*;AD2C#@#q3U!p8;PS2h{oYnPIh!|TEx42C--E$N(gsroUq z@Vx{u!lhd+Af2u|7%&Otr|HV(+g*chjcURP($tjQt+tn341aH#Tb@LU+yQs+WXbQy zwBLet9t=cMy{)`}cOqbTME|>qj=^0D9KsGMO zV+G7)R8Kh%D8)i=d6H`jh2UahM?+#7pe~7U+qr9%W$bw+0P0nX_N1Wr3TvU#&12Pk zo}Igg1<9)p0F3h*+*=%*j;Ih7NXO8Yt}g(Z;qgZQwMRVDa%*5(Oj`Ts4F-1$T+ypw zPU%Ws=07h&u_a$6<<@ZVnnOr4Z>NFFaRBh)()YhI-ITcnJ8JTW1QbQe1$~j_wN&HH zqyzi|RN^r_1~AXq#6b4fJ=i#S+4YCRTm>a27WvRc;N0FA+TRl}`gz^+Y?dIoDm@!s z*s(ZC2(b4L7?O;2H+vt^G;s;mQp8VE!6%VZV@^_ZQkI<#-wTaD-p*igRlQ`%_uCvVtmpk4M+iH>b!`$G{WW6q)G z9V0sCVxzjHqB!0^Ok%?xu)4hLe`6f8)&GD2CQ9)wjhj1t!Gf2M!}=Z818lLM0bR4o zznDw8)mY4m-kL>9rZ^!THb~X`sa|sRcP>DpmEs2~jomc$gjaw$czAbv`^1uEO zMGPzL)~cx~8^iN(&98oY+I>GG+GB%XBR!$^nS|u0?%hWuxe$Gtusc{yX6CRjy^j`B zS-HFtx8ob+U(Xcik=R|u{I90zVEe(R-JI3S-#VpB z>Z2)rOYVSGO(Y^M-Z$Gv!C9BIYo@PtkUJ5ti_|K6?ZIr+7k{w5Hl3)pF?$Ft$U6KQ z&={Yee7J@#AWC#Fl6H0Ry+o^^wSC{vHsI*U^%NNKxq~b0mSWy3I8C{vzk2n;J+^J> z*VsjTGFZ6Ix0jE9iX8{X2@#$YkEexprPm}R5Avt;d5cozxVNlrr0Lc3zwc|8X66KY z-S2;ojF^k6qwpG7^67mnC^JIoT%o7@SSGmVwd3|L8?yb)E4 z@~JjVSfTul)7C1}s^Z~VKz0%q@>R;Z*(vlf zFF#V_#_LR`)~z$4@akfNN{g-Q@j?2odtm_VN&!4(XiU7U1) zYjR?J2Qvt(eGVo;7Zbjhu!_W{^%c{(b~9h4R#}WW{m^nW`tmKmR?dkaW4-ZOu144j zk^1toNuS7gEP3%hv`J`yLtOt|4)_bLcphMb`{RCQth|fh0Z9R-%5449FkNoHG^omx zzSr}f#wet?lm{f>nNV8Sw3$*C4{?F|5zw^-F|JBxeSV+hn zsvKiZvBX#VBT6pGT|Q2BsO9DaLndrfoeb>~Rw?CCg**azPScg6(dZ8!E*UTo zXWg$E!Si?(+!JCGM1n~o^}9IbcT2%n#B-G!AJJI`4R5?rcbY4|^uFN^Q+I{!1i4k^ z$O_(=tHpUwar!p?q)QXxF@Lil7Gq6YE8Hl$*7LcWrr%-D4N75GB?NFUcj~o$W58fT ziOIl2B4Xl%>>2zE$yS)4vHo7~9$c5)FRVZM;IxmPykj_o z2dU2?{RM-o%|~rrkjgi4eShZclzxmDU_|c@pJ(@Io?4_fckPs-qKs~0eG-+u+yl4N zEHVuAI37osc0{ncuB1i|_VsZruu=*+D>S2e#RxLX%MUc}D*-=67sk5lwG_$E{q&x`M^S zX9DBeZNu?U%iBFTKSGbrElbWN?qG@RFB z><1X%5gQ#HRYb~Oxv)dx4#Q`tIGSO1JX)8W)uXd-@;mJAP1n$T%PoIrhL(ELzi~#v zy{Ze?DvenNd?WOzHG9-!TErhb>H)m*t5(iLvB6D(=sDIDj_a5J{-8LmZ}yq&mGS2S zL-Jn{-HGp)?yTu#0pRuve#L*z8(iHZ(+Xd}wX&_)|Jj(gy;_DIt4%cRoq{0U(~rk? zB@YcJt5^N!hHd(sFeo2yv@rZqOiavSr;9JPrM~_HCYn~GjS~8p^Q)7=Wh#&JOQ8Ff zWGz<+gpzp7hL^hqHwPIZhcCh6$yV^;{|_efe+J3l{Ed3PtA$*%;qiCUV~%bmj*7|| zRs)Csd*z@AV(<{lllqh*C$vLG@^_i=zy*P+pW0W%+`&pw#9Y6IZKAb}wIC>#c(Xr} zna47LFQ5tU4}j@matb%ZAf$hGDY&oUG(A?pv#zjifxbuv%$naC#&_=P@C`cH|96K`kmyNWkFTyC}=54c?vQwKXoUqyW)J z^rTs3Wlrs8bzqi;GYAYG%bUUTc&>a0FM$pxjJRo!}*;Rum`9N+{hVmXnU|xzmJ=kIU}v?lNE9VPJr023!j&dwP1R z7^A1JFD~e~{v;S^s@ZCF9yMe*XkU4^=Z_yhR$T4sx+t14O@T%6c43)>9jbz(G#IC# zsTG%tX0|Ktb`h8IcDXlz)0<(=q9P#{Z60`C-wDsfUf9YsfjTi|d>Cy02#XcDzv`c8 z)wt&jCoAQiJ>i*RX5&=F<&CT$&+4+(qDLNHv#a{4Vts~2Lud)S(3*)%d4yX((E-qSk+6H(Qs|}^z?Mt3(8wQ zz)ovO4T2^=uX2@%>-K8txTT`5?rfN=9oI!+lyko5>-UwqDZhqBCm|_fhChBseU}G0 zx>)yh+VhWILcH?lI{8`hRBsK$DnxP+*#q4_{YH<}qMh(BD-@w;imb42Z@IvbXL8}n z;c7@dhZ)JM*JQMD`l_U~L-h2WCo1R~b&I*78|6Gxwj3f%5+UykR>_troqjBK6Y6OZ z+;M<&PZIHRBAbIi2ek?a3>*UUzN$OMW@ZCBJrJvF&w-gv*0IX{)syDa{F=QOmy;pw)9}11 zqyVV9UecT&v=9*za-~j91Y8tvoud8u-k`hqljvukYis?fc3soy=hFioYww5HO`W|LAD#(mY*g%UK{%7w-@gr(s|mroe}AD@smIP8*Dk z#T0uN0n$2)*lf(*Z>_D9*7q_`Wwwn4pL|Rh<`+xj4VyPc_q*IA*y#)&8jo+lWgKTx z0(92Tn{%jaT#~}aZs|r&XVqf0e6t&TT1E$yN3@p_Tr{lV^dK^{)scs9PNvuoF`?aunM$4H5bf3mDbNpjt8KM*kL z-c3CZEKR}ZXr5)dWQj7z7-5bX%3_yqNYY3JFox8 z@B4ed0eg&N?}#>AIy1FEs2g0Px6mllxj8B1lznH6d-Qh3bdqT9yRelB?VyU5fH2Zv zJb@>J`C}0&S@zdN+~3uyY)r~hs~Yig*Q6~E+`L6MtHUc~w)f={))}Mq&pj%Ky;3zx zbJ-b5pjQw*m85iCTJDJ0SY%*TkbItJ5MOqW6sB(Fv1Jz7{L-(Rw8+xf*tpY7V@u@n znfJVV&+pBL?X9i7h?6cvl$3?V2B`HWtDNLr@P zj+tx`!N$CAfl}7KO!hc?!_jiVhA% z-6bYIi%f&~s+*uRe5>x10%8W&@@Ebz;^VX94_a>LEm>y5$o?00p*38~1F18QkG=rn zMl&<*m3|dWwx)xY8A2#4E30!W%_zY|@_2s}h$>)=Zmim+Xw>I+#$&twzZVYwDfiU- zUk&(*k+GC2t6rr-&9ekXMOKS9J*}UgUUB^FXP@NIqujE<~xvks0J8(zmyN2IvBCX7^( zryEo!eRz5}aF|ceAY`dAs7LVQX4u187CNM8Pq!6M0lP||UBHn@nU`j;wG6Y}A%fUV z*2`-Z?`?_J7j=2n(uV-JZ)T<*KjZ zMRoB+_7j}}&aka@b>JPX)UnHiIqv_q8~)y&^4dU;)~GIlS(QpVj+PufiSu1lszh-d_j24J!?`bKj8<_|Bb3_s4B#yR3<8{iFd5V5ra z`y}A)tK;owM5(N5PV`jqib=WmO76xoH)Y4qFr zf+8~dno~HH<`bZyJp=|kCe#;kDSJ4(h4iRFs~E-6t-OB-DW0T5lgf#XOOjJz7}M+T z%~nzxM~u4WaB*-XdwP1(@7XxMkg*9VU8%3n?7`nd&L8!xCq62nR01h319{uPfcnyE zZ*7sT^^;e#w}=>6Ywv>uKa?`Wxi|LOhomaAv2%wj~;G$OpQ{cU+}U+1zJnT_u98Pu`y=eV+!Vf$ck zv%u38b!++CQxnl6bO1RrOhaZwDypjKpd5(wlIMxB*nL`{_fMoSXU^6Wl2R6#?R^li zaP;LS`zF_EF=jbtWG0o5tHNdS;7+vWQB_J5f$U7)nM_dkajs_-6>60=pDPmd{mPy^ zw0HlNw2F?#yUl+8u$VHi;7Q?}7tdcX>o*d==e#qjUb!d&+HJ<_g zl6szuk%%ISYhi`CQ^)MW{V-}bRaXlufD|08HFdlc0^qumAwDAY;*{i%h0G`Ty@E8@ z(a?5!q4Ih(kHTY+2C5#rF(hT*v;su0)LRBCZ>S~QL)Gi7R~?VQmIeksn!(wR25W@3 z>FM7+hs63Wz_Y3Qym*|DbD!WvfTsQv&%;ZdRUTF}h7IHJPG}P~?|EgP=RMjHz>Vt8 zV6oM#k22<(LN{B^RQUZ`wq1PHZ%s~M^sK*&`&?|j;`A&%K%_zwRF|QpGUhrFbpDUvhYnx`HNK$QwC-Ec5jI;KPO=J~i zE4JS$zKj(b7HI{*KJPsdH1Y)E+tZ`?~;)MFIMX$&B;oR`G1A_SBL zMFJ*gB@;eAJ_JHVwg4xZq&Pvx?83s{nIn1#PW=ZSQG*g%Qxk3&e^Y$Gne@Ifo;c&> z?E%Gb7^A7C%7mK+89jS=>9}$aVkjv){e|ItKY(u+sg^g>H-va(K){{NU{Zysi+wga zrV%}}I1BUKO3PmO?DMMfPV8X;gnGv8ZU}k7UhB64#*bu&gcMPUp%qL9H@-(n$v^2C zCU*O+jJV>Z-J%mR#;#u(vw*F+NU_{I-p0#hB>kpUS!aZ!T|oDCWJHM=fQwB5J3vTy zU-m7lP;qP5=l-}{%01^ePXY}4Pef`?V_MiR|B=g^%@w`M_2RarGaLYNfVly@hEwNk zAX~QFnEW#LRshfE0#4)1yi#!@9D{80+iI|CixQ4{K_hf}gInJjS!*GZ!|no8=XXZK zN8aIv`P@>#bIlH-*$d-UMR;H-E#^|1MR7-r5ebFe2G;;c(Kw1}BEjxO4O2&*VuzIUxi1TASi#PsV zU|1cbUJR@&;HPz*$Mjr$Zbk44Jm{k|1fSpRf3wp7N10SMy(+lfPOgUOjdpi?hQh;G zot%)63D7-NIuXQL`C-m#nk*y)i`0IF#BI_H)(g%7_Narob7*KpmkG1qK4~QCc8m&K zyW|xeC>JOE14I7@ZrFy@^OBO1b`?eHAZzVPk!~^5xHV=bVD;5;KNYM$Avc}5Kt!!E z{ElnA=_4u?aH$3%)>Jed`z5%MPS?5kWVhZEGjx;fmt56LyPVvA{5)p)(vR=miT0tZ zAZ;>{l-H4}I*wqOkW2)&6e!#(mH*PO>tOyhI0=NL%V8p%A3qdXUIs_~;alWaLjVc^ zr%DwI3%G!Qz(9Y0%BxoxpITZNd3n{qAZbk8SjJB`3o0TM(ihF zf*n3ah8*bYU9!Eg`_yloKBw0{g5=TBa%Y@_RJ?9Ur}0*`ok| zrw9%`2y^WiL6fJ_<%R>aW1I~)obiM!#-^ryARkIGN`R???imGL-HA#wlDgLbC!C4U z&4av5fHz+VuqlS^KURP%~q zoXZei+_JcrRbAgSw4fC&dYwy}pWZ(#LMOE`AnZ-hq*1=McM8TICn@q1pq)CW{6t3v zf#Y-*)I{%cxRAlk#l@x3DlrE-gG=m}RJSTn$fKPZRu6T^iiU`Y$X+`hTsu1dSevb?qYa0MK1#MxBxV$?eSChQorqeX zWmL*gzv43Ssj=7KXikJ zps6t>_iRHj!N$|SAe#2c;OOI5?AL!DYk=>+6gz0P-TV?tMMZV^r9~utZ^C0@#S9w< zXJNH^O;uk0E%LY<^>yXM1VAePIUR63IucL~<^ zP&4~&qI^PR%(b7WrJI~5{_P1@L(3Q^5bMgv9Ac!<61#BuK{eCuf!t!aeY?jXLfc`N zOZVr$`D3U4aUlN|1iG$qmiAt&ns{PgNOGyALC~O}kxnU}%(ZKGlx$XLmQ{ALnhe)L zP6Jz9wun;{)ve@bx*@(5LP8HJB_EImlSF-IOyQ*a^Lcp&LrQ~bJZpw&QJ_8>0j!3y zMupv4PtRLmq$Rh|lXUk@K!Du>^_yS%*gwBxB>fL6^+UH~$G~Rz5=+}dcED}%lYGCS zIT)lR?BNcyJlS`NPifFBm_+#2QGx zX9T<3;32?6R1ElPMryhS4dte%OHQ1uI_>~-;Q+N}?w^P1mtGlaJ<)NAoi+NHQ&k6R zHe$bkfpe)Z`=&aFcmhvun~IIw_t4qJ)wQ?qjYL39Oz#{a1ix^nx!2j)g7nfxG2%8W z{Qh~eb|Nz3>Ax-o4YU+3c#bvjF%Q&<4PQF1*PbqFN7o*>+yb)Aq^7sR_rEz9e?FK0 z_00_dSmuwzQWhf&SO)^D2Slmr51tlhCflm~VO0d{ z7%3@!YVJq3nKTl55UT*}P{$=weErXByW=uoMGRYCh-ewww-rS97FF9uu367wGx7NDk3F<$OrC}1l8jIJHDDKFH zI0LZ>10$nMVPWCo{JgxqeJMa4Zw)L0TOj^x@>edWkp7&e)UZ86D==GEny&Zyck#F!>ECZGsdUAT5U z#Oih;d*3|`S)KT*ZUn3vdBSt54&DN8A!Kz%hWA#BJ_9%+2MeE~6N(;xxO4oahE-V9 z@JC(sU|?oe(bY|mIjUK$TH4OhJ&^`Xl-}Un?Cc`2itfn}Juw-x z>wjo!YD!+byX+%+{0$FGX!|sI`dR&-cO;wtdCDMyQiojzevv*#B5?NYfz6nqbig3I z<6W=SOtP$zvBG5P?xb{~neJsR-80G@t04 zic8I)UN)5_N7WR7Y%GGu;bdoM38AK|0&Yo}otrmrx&Wt}Quh-V9K7HJ{2IL$+%d-q z7|TxDpLyQDZnnRb8#L6TA71d?MqZ>-u4+kNam@~mr-bXeud(ER{cg}cOMip!{@vXT z4|j)>ZxP=!0M`mw=`c~&=OvfhZE^kX?yT_q7Q*eBTezo=XV&sPfn@CC9C+h_$tO?_ z$v4QeCc^(Mw_Kdcnq`WPg@x7P=u9^mBLWyshIRFxsaK0ZkB7^$1KaW=rj5Dk{yJrC60o4~-*TnhLk z@O~)7_R7x94IQ&ovu2xcPa(7UM{w%(i(Y!%%!czy08_(N;|Z=#E8kOHxH7~=rI94)Rk+rbb7{bBD?uF|2w1i7A zY5(^4!5ww%QvuS7AacK6!m0gx2m4d6-Cc*$l9I>cjzijx>Vp9)1@wr3UcP{N7z*m9Bim;!1*6WN+RJx&dCc)K>-g`J;BG z&q_p`#}sGun`6Gok+rg4i7)iXR09G(zyTWw=+x`R)iPgM{pQd6(&wJB`{s3U$!D?Y zr2+KZ?KwrqDXoI1)OjE`yPfaIy|!BRNT#c;Eqb%3wY4=|`0((M$?Wzo1)V=5)Vc`7 zM~1By@&_}7QNbgc*g0mdUA*A8gS=Q;Wur14>gt^Qfw#^Y@p2?6ARx0aW#spTLc|L& zw7k+27j`S+hB| zzW}O1TVK(zEXTz5uu-?VTO3TPK&{fzli?jO^9I|0<3PCZ)OkWrR|2jl3j^`rN-1%d z>?R@JM2?J<)OfflOr0r4{?CZ|ODAZOz-zqjPVLbhxHMpEKy99rk~2ZO6g#oKQmaBv zM)pq7eSGJ*F5X3SxgaO6v=Q6wH{7sONyb@aqQUd3!Od9!976Ya+pF%KD5F`Gx7a@r z!rwk$0}4p3K9!@LT9W^9d;hmzA~UO01^|3n1J27Lk29_vt(Rn%*zYcY2hYqXn=oik zSM>PzjF9YD-%KF!PW4<<;Vq7Y4AZ}6g>uXix~JCLFb+x!SKk#sJ@vz1sfD^QC=0oP z(9zs*AELiFyPl!TJ1b!fB*MhFGlGD?t7T~~2VVWWIWS$%^KBxV0UfKdj>|htPE`hW z0Skd}1DsPI#4ru|beo06CZ~S*fB$q|!?)S^^oLeo*aHa>1LyIAvSzRvySd{9Mc6@q zMEAUGXB-_!v}X+0LkP}rFBf#r3-y;e*~q3tq!YN}4(R=fC_e@#%S8w;{am*W%d^N? z;pzm@YN|IyJwqbfr>p|0?Y-Z?=^TtvAV=$UDQh~IxfxWW>2ashe$1LC^cGg038 zC&C3j(YgyLwrY?B>{tJx(=(0r#gB?XRAUVrCUE80F1gV5n4)Gsn?P%36+g4T)}pH+ z15dg4aeA9C9ssVUKcTHsFyFd<2L!3k9y;1%`s-YMu+D|QP!*O0Ag^H#;JWjVthv|| zm-7&T0n#tB;}SjH(khn=>5gac;AN=$wUrU`p9>!3KcS&kT**oS!fYF;@QW5OqZ@0F zu*Ci?I{{dg^MV9OPj=HbqE7J&Uh4GFs=FyDJ{YY|Yica9Ii}3tB1!3j2da zkrhB>cgx`l;?)1*=b`+YuRMb>KepVPmzN09JujAa>&4^8l|uA~GBPIw=*g3Ur{ zak~|9vyy&#keozHQgYyn!U~w80q~GVC2qG{UNvosc-+44-wv^Yn|*J%Se=u+(Ow)s z)m%P%dcVX1baTVvpSgNHJ%cZiet1wvjArl>_)NZ^`Ee>k-;)|-jgL*xYfhh4;R?^{ z0~`mZ^~ff*$L&|x{wkIumqE>S6$V7TdjH?2lLUv+l#uWdRBesl?&4-^^N+HnGcf{+>y;%b_kAsFPt2hKgI0A3B}scSgc*z+G>Ujg$C z`#?L!LPBJuvAZrhI=a)d2QELspuf4fex%Qy5g54u97t?9gCi(eqo3T{^2+~p4^fx; zkl@{%L1H7u>q7e5jVNKkWozVDjrQk1upOWX>Cxe~tgY9=#>P-oeOCA5BGO3>*w@;y zeTLk$8~)L6@q2zgjn$zg-m-7|96J8%(r0P6jh6=DF5i4e>^ixJjQi6cQpXfmR;mMQ zoBYn5aoC=AX<1oNRXD+6v0*1@Vq7T&gSe_gqoV~(;{+c;sb@)vGVnEx1zur!v{FHz z*(%o!(8ZXwD&&`j@`qN+C-Ol5g`$_27fE>KN-?iSrG@PrH-~rpkM@kT2%SDK9|;bc z+xRJiwivoAR5dG!{9Ry%9!K>S^Mo$_}F zWE($ry!8c9A|TL0x()B6GWJ%+I8?a#j_wGroomkra{@M#rpz)xbV^jouwU5s| zr)Ok;SzylyEOfqe!Zm{9L+xi_CN)%o4#s0u_H|rEGR5TISV2v|f>3mM(x(<~+U& z|J%FNlL8$a1;qHcd!CY{q;J5BA1!acZ>-`wiE#Kso|u)ds!dHzrTVS!|6s#^1_ZX} zUb=Nfx^l79rsXWLfIEKumB{Jj^;+2i6$MO=8+Zm9M zd;8-Rxp^$z{l?V8SPvyVy%gjWD^ffk3{XGFpxdqkLc`^!9lO8_UOXx)>PnSI$up}G za*M94cusL=a7lALhb=g*%X?ENma`9H_?b!tE(@N*KQ z5s?vOx4)T-`XC2!R$u1VpFfQjl*?=++gI+DPmflT;#w~F@f<_+H~?!1QmRfl_0-8k ze2-!BqrYUKyq5s{e(U}SXwUM%C@Cp{uv3tuyROyp!eb7bdFzs-XkY`(W)6;yOW+$P z2T}4r^S$%1Yyk}A=vYq#A+fb=|1-?0b|-u=S3L#lp8<}G$j_y(-PR>@UU#gSt^jt$ zZV6tVU0mFc&}@ctqE!TxJZciXdDGyl2Fovo&poV;hyF^BRK2qoN747AlKOM-~EDh`XD%o z0fq}NvG4L(bT^sA{Cqjq+U@s%J~j89{<6}aK#h!Bb8E3UD?hh{n!G3~547xcz)3FN zVploMXBB{IeE z%`^t`u9qTl<~coJS4Qq2ZLVFtDn_fD5X^hMQCxDx;Ae3G zwFXyieay5+I&grVUm%&`JD2b#Yz`X&KX=QSi)c;m6Vp6fSXgmZNEUxV)kePRR?BdFZ9WR9@Z)_xtH4m=~i4MA;NQN1iAyb%35JG_qzQs1Df@Qja^vA5gEy~VD}YE3 zVpTf0)GG*?9Dt5-shPrxDC_L#-YRjrEj3Po7DPEHq8Nxh1|%LJ4ET$88>!tSe^#V_ z|J4(oOR=k~LxKj}fKu%`;a*&=<(vPE;bY^7p$y`~i2d;%gc^g64~5pip0s$ce$!Q< zBwG9G)N_g*vG<5Nb@LHE6ht%FuYkJ3j0}y4k}z{7LO-$;bQc*e&WoDyunlAY`DL~- zFwPk0wKW~6JYLa%Z2c0bFKs|-8YAVV{HEUG-O1@f!ZQNu%}m4dEd+HV>U=zGR~@A1 z8kXCIE!6$MxOE~>!|0{>$;t#04x;Hc&@MR-L>JQqHl*nuPmxrC=hPifzO&?U+~N`7 zf;x8-r0yfNv3UbT8o*1(u-OomTEjl3x&yS*=RR z&Bj~f*XHMYK)bnF`)Gc}6lU#uBi5`f4cw_SntOH&u=KsqtYSuf$5k*#ksUb0aXFpB zJpXpYVv9k4#N}Xo6whF^t`xRQ!=LrE<#WX-uW7wg%i?o0ty-s|Qt|rHJtagbwc}2B z@Ic(E1p@T?_Egisl$OAfT345%_XFKVX46zPeE^Q zAQd9pnD5G=UGmJANH1lQvE^yHzNwzC!%|Rrf#hkh!hYPCd(kl6X){bsN5R7bRiPa)nGZw+rX6Q8 zi+M>1389ELs8)Z7D9(^PF?H((o|=jv|4h-DyKy>oIlX!Qt0#}VK56YMM8!8g4@#k^ zZ*pZ-WtS{g3gGHMi7)-&Rdf$*js#C|*(K#z4eEje%qBM?0_JpIF!)&VqM5^DQ5d80 zp-w)B(laQhhx&ouNJL@cV%8W(#UuN(&Fw%#Ot7@kzEXF?KFVmDgnPaB%DboRkg^D< z*&t;FgV3OqW8)VL*?ci$<4j82mP(0p=U9=jnXeM#KfjLbV}So(3VD7G4P(Dp?Ed{% zJmy~#At7~sez%?`6(!{W*w@TQ6lX;w=YttpQwAB(7s0xyxVlzPO;1Z%94zMK2SGQ<@Ss8z@MerQ+{yIIA5xN-me3V7yYZtvhNfcv-`U33Mm+Om*!^7RWK zv<|=9%g}x573@hv4I?8LHG<_9ksFpk-K3&58*|uo&T-yXM)bMD0x+NfWHX>lKxZd+ ziJG?ZbTA#-Id7m(&p+M;nZfF)rLWFktCJ-=B zps};Fk5oKQ5kYPQFZohf%A1?#Dk>@pk)xG@Tx@4=PqT5{X*w{R+XyRQIPiu6Etg4d z#Ji&7-BBy3LyxTVBSw9Z3)zzFjiiG?O{-FDFCbK)vQ-BW#CDQis{nI@Lt_Hy5?J!_ z$;eE6&1(0j{HMqD_OGCEvk;J+#>f6{=ytoGSP9p(5U3S_(K1qMsD8Wt~rVbx%kE>Cb>9Vd$5DoCdZ z2nfI!dJa6=$8@Z$a-bYl(g=wmu=W#ERShS5sgEwIZI4jFymaZglxBqM@_6lN(cu2v zOqK|y`TFJ?7IZ;BoF3ncRuF@j2Xomo0uqQYK~?Ob&2x#Rt7}n>VORKSl9qeQ6~v=NkK^5 zt@`${H${rMnslP2c`f0JbLEt-f&x1z;O`A46aPv*=CyE{l$?AJi1535=#19;sbfI1 zT!z8Iow=$tn0!tSOQ)@KZN#{T^g}9cmZ9pDo{4jb@AZlR=_CPBxuB5=6W2yKw4JJM z`>#dNSAgP&3UZ$mr~y!?%zQW*?Sq4=#Sy)06UdbbkhGJwMHcpSlO^S zNgU^+3XTv3-rh>=qlN}Ckd967vgWZX~TdM7sPvmzHZZjQBf}16w)N6JYTUD zxSJAa$|H7c=Rd{%)R;k=%S{VFA2c4GtJL-7&n$;cM^ z^z(Ap6ls6y!Qe-FH>?-jooOKls0ZzFR(hQtXmz(WJ6aF{DNbhc`Gskg^{(Cs1n(D4 zOB@o^ZK%+9`^}CorcBnhG1o(9HP~_MuX>|(ZBQJ!?G0DLHcyWB%m8MdotbIvy7rof ztMM7N)8tu;@f!7Rp{)Q+Ow3y&-Y)1h8UQuwwy7&9U~7-Y!t#Cr_t13Yqg-5DQ*(0K zm$KX&kZrzx?bzEHH&IUSY{DYH+Ae>TypY;*;BX+m*sr-2cDxjm zn@AjI&JP2StDaSue0A;lHC~;a~=?LXvD0x(86|Ub}rmq=9(kiGv(xuMD$3ZK6;mlS==B?rS z>i*2FBaSALsLCd}5B^+^H#!3hQ}<@DIc?YWM7sB!wbf%_q6dz!gLd7)o*sEI_#T*u ztWoU%2NM$M#~em2&LSu}ud-b^t9zne{FsTl)BqhCtk?(vpSbL#{2xk=+|Kx zX3^5xS95I{P89wBr`zto{aJ%7dRN&_W=ZZK(Q!$B@|~2FpeXFQJzGaUSJYBe^(CZNKXKQdY!chC zV^@GGX!7vp0d^%n)h}FG4c%zX!Vkl^;dwn7R0iDH*=Ad|7suoB%96B=x*QnjXrDHE z1;;r%HlA!*7(KrKaud*?G8{t{)UVl1wgNqO{tD}+gKrjfxpT!Enogg1F0pwhPtZ} z+8LURtfQj34*EB~^Pd9?BA#HQh-px4Be>;Oxtn|C^fODRYmj@`ZyR&P=;Uyyeh`}X z%4PPPj}oe?fg}||^f-(U(-ls`D1W`5|6A^AHl-m0xjH1E?&03WEPUf*cBuqb$yH%w zQ7JGEH9@uVatjGK4Hs5c(J~0B`g5w9JZL6ocJLMbbWdxQLv5(5>aTyu3QFs7K3%1p z26<@}$OpXhYyrOk2q76TSW@}fvu8a!a!sexu>V^b(?Q~e!)QATrrNm4zRY9N$z)z4 zSvY4#%pkzgVNbS#TOBQEs*JD`Yw21B?RL|#^);&gD#P?XfQ>_djg|XRqUbS`t@{`g z{^`e#za9o@V&CJeKZk~^<>cg;HEWZLhGt}BFak~Y#>NIFpbYPSP7uijZGT*Re3@Nc zkAS=~6Wrg;wu3`MOI{bLd-~->g{KQif7B9J+0N^@ZAjqY;2fk?0>70vW{jAz(vN z`!YMBBSQzT2nqKuV4v4u`?@E-O}1 zte--uNIJCMGIj~yQ7$$@N?saGroO53g&L)N56m#%hrT;*NlYm$)S(hlNC~ggDBT{) z)GK)2rF$pm=DpT;{L+DKY8uM%&l=yODrQBp3au9Re7c<3i2o`(?Enlza|XQzn%t(CGRM%j1nPSyM-d=W{U9cNIR?;jpF4dlBp7Qn%aOLj-j0~^++rl?bRy8 z>>Rc3Gpc2WC z4kC|R%I`h!azTlAHkv05TBqG(Q8u-!V?3tkxZyxt^+XzdHr?|%p1qS(1?cE#Zf@oh z63XrERR|_FE7c$(Cuh(L7#;^+bs#usnX$QW3|_&A=?xjP^r$ClYMbvah*>byV$~aK z-f!movj=w)4(7bN2jfy(SZQnJ&x~s2MEVj}K&tim+vzkWp{=3L5=7~j$syJ%& zxJ`Na^7ZKtr<$W0-0;~fV>vB*C?*0t1dYC8x1Rdt88lnXCl(66?uSPqHhgQ^k)0M# zB?F|x)3cpS-fVK9NFBJI(I{%7$l_MARrU6s;{@$?8Tz|N3DR@OEU^2B5meu6HuE=_ z^`dt-iG6^NaoH;}k`!pzZ}af%CO6!eM#8u@Eop1o_g!`LM5f7i_x7sC<&1>7eq-7i z64085Sf*X^%O>$Bb>E(EvEP)Ac=6%|r1uggUjhT{hM*?bVAEQ4eYY)C?Ha6m=Wb_+ zoQR14*ERX{dxYqjo&T%RAxMa@TR zxx8ns|FHS}PbruF=j#M*C- z|Nra7$z}^-LzT#DoBQ`oOiVzZ5xGbE=?-aIjl|koZEkMvwfx@wa@%9Uv_1aQQN1CS z_;Gcz>QNBSzI|ZQH6D2VbxqAd*1qu418JF{=baP1qsfc2HY&VfO9R7qyQ~@fbdRFp ztu~6`i(NVO*_rF20v)O}9sUh&L~N^OoQ-jto15N24H#;h5)FN_*8(P(4fORp>`2S! zD3*9u>$#{U@00v~9zNW#-kw(=Q|C1Q04z_Gt+nX#?G6BPke!jyVwVddN0;=O0iQs? z)0~6^xbH0w+kRVZ*HKeZ0lWAPVDKWS^e&=#F+EAeq*!QNEn9G7=YCHrJbVk2lEZ~# zG`&BBJ)I6FaqpfiAMC2e!!PsLj*rTaSblLq^%-+c94MdK^?>$X;Yd^`E=9N+S^rz3)JFZ@c1&}j78Ns?Yn0NVylvwE}YIp zX5WKE#CE|nI(G(Kl(&d%(h896^~e4uars!kH;8|ehc8^g0Pvj*R{Pa z($a0%Y7Gnj?)0C&^#8~A$ubpuB5T`@L$So|Ph#)Qvfj7z5h1&yl!M6`xA#bWfU_s7 ztvN2+l10s4sZx@PmNWL|zMO7sA;Uh&2h6`@kzV`^Kq5+6TX2QBJp?}!&tt!dfr%-0 z-mZRa6u`uTdD=Ok7OsvPeLMsRZH01f%6nIP(EIxPRTLDymBGO|FK?j}_%b}q-@EF+ zo0gWQz3qPdP*Spazx_c16|Nr`yhH>b6O+!gi^~qxQrF}|c8Ff7M&$KUtO;*eSy?4) zDg&1@>Nw7*dMP@kUm{>VDZI`vY-q%Wq$(MuyL)t`HI2kxlgo)PXvGKz*N+Y76@ABm z?2Ti33sH@1J)?2NLHw+YT+Gm#tx`AxyW!MP+blP>|IhRUy2#M(fdt}AB|g%48q6|J z@BeDk&YaGTWU2s?j)!*>P0h?shA-P*nmqd-q*;cVt>d<@nt9 z@3IxBzBozU994Lg-CpXNT!IKQtx%hJOm4nd35RC8U65;0*X*nz4jvvO6H~`Dz0g4< z+lBV__AMt_%wGY_(WDYLv?8SUhD@c@;Uca}8WtKB?rVMKb{>J81twM3HSw8Xh*$xO*1O>z2H-vi`_>x(^l+xKpkCG7&7b+gwAgv@dT9 z2x!1yuq)Iw6DA$(VfeS@TsWHJq31zfzCOFPIn(~okF&8PKt7{2_ZE3#Rh3Kf7|BXI zfH2L?mnDu)C*1$>sGPp9aszT^C&Ue@OJy9 z6-lC_Nj@>wmv5mM@Bh{>s4`?Dn^3o_z%0vI@3-D~URj{q!(>8^L&PtS@SS)&GYVz`IWJWPHuc z>rKxnl(_pW1{(cvjF(EZtWeeOz^qK0ZtJ`DNd3U$m%93vJ- zC?O5M$(p1GV|Xg4@$;VhiW0HNDon_iKN38rtTT)%VxZVHu}-xriy*Muns~gauFgD| zxD?|{fWdRNx+yQZ*v~#QPnnLq9i&C>9+&jUT~VbOtjoI!J9s-z8mq-sh3qX7dU)77 zT`szJQ?7X6aJlT{4OjXPTGm{`3sk;iSUC9Is&^^?!LizDaQBY_pa!xK<0X8d9{?O; z`yT<_x;(1 zTmDjbH~!eU0=Ubv3aum$L6k^`c%lx_>H2rEyKVuNGdk?Eww#x#K~*L_F{$W01uo%{_-$!_fB({6@f9yI;#YxzVFZ>?o?l_HHGw*r5C+qNfZpZ11Jm>qFzy3P zS`#uElM>k^5Z?HBC4`z6S6Nxvc&;ms11+VaY;)6QWvqHAS5~v!GOpy7Jp7i0(}j!J z9>w^Xa|s?got>TFq;bV9KsMteqoWU(An)6L^1t4=r&Eu=T*d$MFWi7BT2&{AReHLC zLGePo@Jll7rsp`&+a7liLmFFigO_(K)XVS3sLn_aK1GyM4^?NlXSH#zWm$bbu4^>; z&`}(sI7K`^YoCx{lw(+lu`iWaJC_SabT43gxJa$k8l7;`7o*3D$qbR5ByVeKgm}QW zwpQi{O|h^ls7?Opx%QR<;eN7Gm2;H~7;*slqh$-f{bFc1tL|!9bHYVnw__qe^?Nqn z0Q4cx`|&f&$6aK8NXRWlXB~)u&w7tPYu$WVGv9t;Fs#F(6UZn+-@i{TE8{7*hfFIW zO;fQs%-m=Uk&>6C7#2ZRPnHhW_4Q-Y^4Vy|yKX(cTR@x|lqRz16X* znQ%?nrDGL&4GmKP6;spfA2BDn0Z1j_el+fnL3w34S`9!lLZGpxu&R09SOwC&C2%h6 zC@A$~jSC;!0W9bdU-N#~+qd|z-QolUnA8%tp?yUpA}nkHfQ)?*nZ|hQ)@%_uZRTeT zPAGh_2s$hTK`6H2Q%f&ngWQzTQWf4YyAbSPtnxQ+Fi)mP>aBgAz`y{9y7jlCLs-O2 zJwVN|)5e%z0Y3twCi;|JNEvU=Gip!@vckM2C2b3&e`>c!JQu3{%) ztS`g~47*uBx;XgohC#7!1={j00iKwnXV)(q+8$n-6X{n?GjbHos?K-*VlTS(Qeyef zM*pu6qI5^N@qR`|&hQwQzZL^(YY+B*<>VkkzPhDrDTxsH@XK0SziGExxhSmt&IJ z=7Z)+!%=8##+nD}C|wT;30~@8N86R%ut#b-&S_wX*}(XC9>@shMZv$cUygKUs3nkv zth0u5Z=TOa0E+p!I;au-ou^*Yg@%Nru)1&fldL)%9Rk*n3&)G&0-JXg5;Xk}5iP4| zY2^ZnA7BSiICWXf${6VBnRK9@exjuni{Zs#ko?E(`CkOZe|<}2p%~0I14?yU>d2{t z#&1+NmG7~Jg(DWrC=B<;t5=Z@D!dVZmI*Mb*0XC4@)8oWF;Bn@qo#!V2?>F;DR{6l0=5*sRbO9Ug+I?VT zZ#|xS(&v+><&<{4F_~pdIA@KyC{usKzIdcbKZN4W(<8asBzu{>gxr@Wf9%(WRB%`y zN^|bvhK7blvMhqKg2OveWaG75prZq`!b`iAJu`k*frts&I9v}xcPO7qabSGY*x^GN3s&r z76XNe2M-=7@%w1}3()wt{u2+Kvg>UIEu!;>C?bUOIIApa>_u^nT9F=(P6frwe>3oWv;U(>*`Tq3?_<_qw#BiJio3*=^O*sGe;W9 z^1}7>_VjMxh^J`YY;aUt*o1*6G^#%5keP-xNKb!j%p!33&NTJ282hQn^V9C*Q7?%t zMK^N?zw0tiZ*lwyBeD^By{;FGYQq?~5ZG|Ivn0H@1F_@)XT6h_p6;T&=AR68Okq4? z9|ex@r8YCt#T-Gj_yuTyNC2as>e4P!J2hkoi)?oW^*A>K&=kT^)i#r=*tmlu6eGqL_4m^8XK`) z_6RvMN zS=0=ljT>;6=c0}cj%rG$9bN0JfJUv*dMO}76RMOf;*aZefRCQnH=R-k-psr-aX(WT zMDNg4Zw>6Ow!rw~bWoVND$5c4LENxG=6Mw#g`8Lz82b9sWFWk|JjcS^d;nfPo((~O zK(}vc27BIvA#*S5sBczAmf6alY2C(!HmKd~-?=C{E$_dUUc4reg2 z4N{iG^?dXm&k)|I?QgOXRYu*^+OQo& zo;*2iz(-(l5+`VXv{If^Svj0Szg97g8@za1{A$m6a1-ffR#$6+hOR0#O>HpSXQ{i8 zpo$A&(d3KcvO8?F18OZTrAtB_F)cva2mF+>^6HT;#eJ88FTO#%zZrjmCd9?1j77o{ zYt6jsK%S7Vsj0b>#tLiAxg#%GF@eQeGl_ZLj#sqh?=R8Me{qYLPhoTs6E>otn$s2? z{@Ney$yyMMTL&0mQY@=#%lKr+CJ{sBDPJGWnVclRTNBP0%)7u{9yd5?UB9g7;4{(~iXW34@KKBG+U;*i1a;=G3kk%Y;9_lK z+bzm#XN?lwx|d;D z$Wj@Aj8W@&s6@yxP4aRDou-CHv}KivL!J6ws;DV`6g&3 zlKYbL{cmDBrC+y4BW~XKl-lm3|B<1UYP}tiCAtc*qDcdoU?t{ z${3Ss8SWzC(2_aI$wO=zhInSLIo|pz{){K-5ZtT$$-BmVW9kz~JOZAO%9SCJ?!t%jDTi|&NTVTVXj&GJ<=5LWowKbA zmSzEk6JQdO@X2;RM99R|V^BQT^jP>%55|)#-9tA)L{zPdIGq>2dv_;==H$M|u8~K2 zzS|5VkWNJoJ_BsT_WpD*Xf0li9z2MMFLB;7RAYr~N|B(0>@FzoBElo<$~EA5?(t4j zmkbOIqX%P%(7jcZl|$VR`x6#{R5WgTOU=?U4>ENto~aN@g0pp<}s2y8l}Ytx<2{X?B$j&si6(f9qPuz|Z) zUF%wFl~r7=pQ;(%J71c$yzy=`{ZNr&w4oaSLEm_8GAPe6={FgYnn;1CGFtkbn? z*9?K?qrG_(@r+VT_et!lODieo=_q=P3=QwrZ)as?rDtVz`C?NSfNI6C zu&|OK36_N3N|4ipS~u$J5sPT`q0lL0N&(7!|7yiWeU|o95XZv2_{(W%8q&^on|pS* zru$$)1W2nNRhEO%bA@L8Yy?;C4GjR6{<_8Ch6U66?tlv=P=Pd;&GggNs#7DD+${k4 zRe!0lIbae zr1^YM+|ud3-|apFCU{B7%XbN}MpQDs-w9~zfb<0)pqYUFTVtOu4gC%cNsU~pyXOmr zD=mFS*A)W?&_{OtFme;M2DP>saHvAP_yM46=LyQPA~GfiE=o+nG)OP_ue~_qe?wGK zX&S>#C8SK5ptT!V+dTv$CynC5ImyzNd-W&&a4(-9Cn6-CvG1RX-ZR0yh&NGG zAA=?$?5+2Nzog$(AqGLe?NfHCip1YvoC!o&(YZ={h)dYt4JPr$Em@T|V&x0Xx9~rJ zVsZ#YfvV{y>#*gZ(Po9+8rHm29P<(C!@EP9YH(S%sDOKuud4y)!N7J9Z%Wc0nqW2e zU~i=peJ?FLTLI9`LC~RAB^BuB_YigW@&i-ho@W<##^%i4d7$p}byrS0+1T2qB_`eh zIZh@%hO=kS79Xe)!Y%8k2$HBN$XmPn!4-mYsl1f6c=F_it2>qcoVPu6f7F zXD_AfOtv;+K3@9}*io1wVp+-3ySFCdovc`8|3O?24*6jb;b|+9s&6Brn>EGXZua|#mY-A(x;dl1aP3(~}@8-Lc4M7d0&f-u>^~QWJwhd+_ zU~mE0OW!$ti;K?q;Bf zkBRiRkgN{;9^mrB+-IuS(9o-yD~4MnblW?ir3QT$LG?`X}cTq(e&(lPQKAu zEYyAS7Dpd91L!zMBRbsyi&}_%ATFf!V3vs>xTJ_A+gXJ7B#C*1{Pfutv7i=9$U1?x ziO%;U(PYhN%Nt`lS`9N>EAcv3J4US~jv7McTS#X72yz}8thK4wlHB0krdAUQrKLL7K&uHboX9&wk{Frzf8@ zMw;7{Efj1K(>E#26xH{m*~?dIagEo%W{*N_B~>rHnJZj5oNJfMSj|~Z4Z8Y0_8hqb z(D|F4k={9q*=jQw01eui>+0(xe1jsK0L@ecrAH`5B7}dsCx3ay|L$Mtd~o`?m39A! z^QVQzP%&j~TKaf!>Cc7UF_G(C^0VuT3z_vu+&5w+FvgU%6MYG~TjFxY;E=VrdUbe= zUjv@jTa>b(D#Ox^Ku=W(lX7J#>xEQ6YaGNPAPY;Jz+@}VT5fnZ^bRC2hl1lC_~Ryb zWC74eW(+S9A^qp=s6oEf>#M6v`?Fmc@){c1+>7P1z;vDEfJ?g0%0hXDSE;F~kLCis z{@w2WA?Wi^)oR`m`hy6JT+r?+O*A!A2~8Uqa4S47H!V13(kIP&VPKkSutE4*I(f!c zaPY@Vo;}4Rh$jIMAGG=jia}qx+_00&dz~?E3NR?s_)FKA#r{fp#tLeOBOA^DO-7fo z3VRpIZlwBSiv7@dHXg3jgO{X*HVDY3k#MhO1GJPXiwt%F`(jRT=62_nYW#P}gkL|i zB)j;$c+z~?0~Hgh^_d|W-tvOnJCNdT+tJ3x##XNw0KF&jhcM6mXL|O}!~EHAe3XE* zHs|3Gkz~j7tv*6?M~8YjH9hlG$>&ix)7X*@bLVF@qn4AM3GOv8z>4Wg&U{5l`{4$d ztJ@+P)Djsd90_&IiPS+Uqt#?lZ!e`JQ!u<6X%zPsqt&i+rn9%k+#9pdGedXTdjKK6 zsRclJg7X&UU#^6d?x*L_uramA)G*W=<@TEMuuL#o z0Wpu0K+L{#G<^W@%C2*A_EKBlqWN7RKY`-K@6(bFQux5kQR|<(#?JhXVDEWW-8PnT zjbwWIj53MYTu7gO7G;7YF##SDcrG7k+dreu0hn!Jl`{&0T_Yt?%myX6+Lb2l8WsE{ z<{}10j%1vK{d}@YXq8ZXx35;uGwsi|{bA>6nHd=__q`VRCgSK)V9zLP??39Rg5EQq zK^#bg9InzlDiw#i7K4IvK8r?l7bcX{fD@^8{`k8td{qDNp#P;rR9`68H>=bXc#)~# zuYj*_jKpnhYFd!nwDdGq)oA}Rnak;X`v#sr)1?(TQ1J?B@3kiboMj4lkSadqQ4LhL z(Z_9=y{dGpaGelk3ElE+Zfm82eCHw~iM7rbg^uXno1pbnNc&V+Ghh0rr*C@Ay|=$T zaq>08G*Ihf{#yA7cPyq|n8wP6s0lB}P-l3gH zR<+Z=J<4F|?nu+9Cq}L^tFgU<94sy|)kg|+vdv0-e~a=O`|R2_dneN;Fe0mifS#k( z7c02qeZf^+T|5)VjnhI;giuAsY@pEvr%L$2XiKcFu;^^4{wO0P)Hiuu znZQoZ*MJLVn9FB+rZty%f7%@X2&o@6syA)noj(FqYH>Z)p@N z&-d!*8?8>ZWH$(!fwn5=u&^rKX?S?l>N$UZ&#S6hh7NM-ypI86@XTEwUy9(ooi;E@OE)oZ-k)p;7YRRB*Q>PxY1U6k5B-K^fLTf%U!uF2&*fS^7Up zcIfBuW)coEY?>-OJ^SQjp1-8@ug8gy%gN_uFsbHCEN#7h9+!n0U@O31?Exh`G_GL* z&X~PNx&}o+>oAR7_tfo@e$WiKfX5)I>{1RX5djHeXciV3RFYtOLCProTa31E zau$!?8OsUUSEk$Y2FbB=*I*ymAj-VwSXp?|m)=yDRWQ=g-v@&QZSxNH5(wkR<+asb zi7H&X&RwRc^ToT1Q!3ruz-*KFoEt`=Cmr?cPk>8s)gXT~@flUfgQKp&)utFdBuP%2 zwRFk!nGD76a&8NunG7o&N(J{)(4MB5*G7YGKbvo-@CW@ZWBb(dJn7e< zf$Q7NuGHK*AAx8GB4WZWkfbP0zh*J&p*&QkH<)-&;02z3or%zcCo;^fX>GY;H5-4} za>SlH|M6i5Uo_uMrp6b~p29XX|iad}`Qg?d|>HIUE8!onr@Z z>t?m&z}Th#FRGpZpNrR3pwD%w0Fr{ZleV)+vW;n;qz@d_mI5zU);q0t)l71g>1kD- zy{g+z^1GKQz4_s0_d(Fb?vB`9SPZKm=Cg>@Tls=QoJ3(bF^p4)4<0}a;3YS1v9To& zHfS=bkIt5x;NUJ)JxV+ZXPjHjG(jN`Q8@4g^o!1$t2l+W+QlQ@^}QV|V$vo<&8@fxI$tD`S%+@)sBb6EST^Ug z)9`WC6_^)cX5ZYg?zljc)DS8$0VYgs)M>kYb$>j>AC1S7muanL4T1^2nq~Hjc4hP7 z;dmeqSeo_acU9k`>dsU3v9~2eB;&0;`H(aj?P<(NSt6}xvxp5oc|<|Fp;g~C+=IbE z&~g4mDAEP6m&kH5CW=Kt=+Tw%1*mneGmXsy&@$*agu9LCCwN# z#MG;C2D1#;8VOJ8eVSf!fZDi!!&$Gg&`PN!oj+O`!nustB!Uy`ijPa`WVzQ z+Lp&@AazU!n%c9ctj{R~_7TPqyrX*$=SP#lt;* zYU%%LW7Tq~D%?A2aLLztg?-MiGz4lk(y$mOoG4Ds=9mw&Ospn5-x0C!Pmjh*EiF-F zH3UOS67bw6hAuy2{A#e2szcwr9`6zF?nZT|9|L^U`*FOw zT!YE)vyxfiau@V5H&f$Yw!0ai%f*Tme$Szu04cO(#(y1F{Sbat^I)ffza?XEp#uv* zV|9(zZXji<9s}xQX(%WW)Yid0-nPl>PXPX%_l4JXqi{zzo}Yl_DvOG6k1XIx`aNW$ zr5#|;hUaPQ90~qbD8H(GC?L%28#P?ZVdJU^zk%0$O{AjAxqCT694#>LZYPM*l5OE~ zpI`%H(%e@NQc!B|J4@s&cz;_Q|5AnggXYl+!VCxiqxF=M=+=w?dxVwyWTd_`o$YBU z*U@@;4U;727wTOS6WI+Q|4C>Y;9xe-PfWwf*74PG1l$qbLZy?r5>7`8Tgt6AH|Y5s zx$3@9wDjq2R%`!gc5OyOz*3#x&S0`mHp7;WDPqKz=Q1gYjI~JCOep|#?+Cn?nEPwG zKs~C*6nElskmGN4x%zfWvheKq+eq3RYg8DL&q2lYWKHtaR=$>qa!bIS7xO{Vtnsik zNO8nFN}BOX9GDI-T6=QVTm#i5X`jH<2;dU|*Jy9(B=!;a`&9V%h=EN$Z<@=~TR0No zn4$GOaj53QDrXYm(gQh(JeIQ(fqg9kRAWgpXAI63_=~{4%F3x?;1~Vj1~)vKIr;rE z|7Gy^?Z+fhXJfcQ1AkVKr+M5EGj18+=#&mRyvnl_d!!{&)bPZYfy)Er54k}be)-mh zrRf(9Q>S$vxX9&Uds)n~TM4X`WcA$mb@1Xcp+y7DxKc`zx zMLoK>VpE*ni@PTW@VLJEpcl&Trc(u+vEmWl?D*n+`8pq8=97SKBPkCR!I010#(O4H z@8K6rQuAan0Fo!f8T0&dC>wrw9pTaWx{%-cnr?~-DmQ)SROQCotGJ&oJTH*!3IVeP zR2MLWg}vqNW^F5Mw-j4#&pCH~2F&KaZ|(x<_Qj z(Hj4beA;Og&z@tqise+gd#XqwJuYzcP6ALO8UCfxS2CW$IjLq0uz%;BaP-4zSgx4{ zWd`US>8ULl$dv~6+o@f6*gU_cg%-!@UcGc~zW zA3=;A@&dFX#MKQm?Den(>{en}?YE~Ri^uWMDWVjuPok}Vyx|QcAo2`SyIJ(fYKxa* z#A%+N8pQ*^tBD^U8-7H6^f&cT-2iOa_T|Vg-yP0>Q0G6XQ6jVX!Qhztoh@09UX5QX zd)5&h7VoHW#<#V?pKFur7u0b zi#w%h(&zjJSMZ7OOsmO(g7Q6ok1X4rZ+&FoCoV1x+nfE7r`7lSHkdr}6i7JX*j$mi z_(cb(vsKYHj+}x3EKje#$G!t&8nW8?AO{2CmI|PCZ6=<)4VzUBj){LLw19?#{iP~L8p5NIjaU8IaE1(3@Nb|-`EHjQ zZ)bt@F{hGBvgG$IXhB=p8vR80`&)3tyc8CCF^Acp4CSnjcw?ihXtSxTuzs2L*lCnZ z*kVB_;5&PKgxpeK7V?zMh6p01zdqu~#7EQ*#Oj{{+80kmzX8hy)O(Y=Fz^ur-~zIgUIHC>2fjZi0eIu#RUJQR==X=c-?$v~ zfealdoi}@HxJ;z^6#yOHUU&Mq;S@#oO|?#jL$$GUPI-jK_0F$IJe-r}jlNOxsP_A= zLGJO0uDZHST8g>gcamp(UEQKO5<->e+34?T+@M%XqcH##D|vgIyw=}n4afbZMKDQp zoe!1a_q8Xw_#ZJKZ{oMQ*3^p4XY|R~mWH!Xj3CPWteal|;YK4*KBu9nVB*SW#nf9F zwcAP=OP_ptO}$J0!TLmiQ*t5yA!a(h?w`8w3#U*(@%M@`x|~vv&u1ns(4V5-`mPTJ zIu4IT`&E$5ScTAPAcGUHaGczj!hZavj}pL;wiRraj^AF)sq9CC3!mO$w{U56B$ioR z$NC@)d!zr7Ikhppy?w@;Rj6T&jv`|acklW00SbRg@(3FDTI7V|_VqrLBxR(3E zB3?%7(nUkOYKx8*;f$1&_Ml3TiK!nKn|pe@2B?1mjGzw)F?p8Qiaw7x&f zF}^=B_zY;D&&vDu1c=sj?Bt)%DmHX~QCx)9cC^&BTFFXYC`Fr)S`4(Or!h3Z82+ zm>uEG=&ma9`L`J_p3xwPxcz;ra%@#p(yzp!yI+ zr&TTmj9LU1n-G9g^6N8f9aI62Qu)up`|`{~;vy&NADS4k zT-HMXBq^;@@Xly;vP}mPTIJ0j7#KKXI%Nb+xJi#`woOLXJ~0u!J0EB1{svi~Y^L(G zKoAwYr^MJmkexqUMEQvUg%2CFv3$DB`M?G9%$M*HE6hKs21hR(UP4tZ>0*ua%Om~G zRci%d04PXaqxTxc+6>Z_)!2AuqgND^Te56KkUg){UhXBXuLihbx=ETPDvZQpAO&zD z5?a~&A2$e59XU3-co~+rec#`?s$WJq2^zSeg2q!P^BAK4M6JHQ-6d!o>=3)2;BPMY z#2*Z2*4EA$DQ+7fExnik&?9Sc_$ETw4koT@P!$ws-0nv?vw)dmRZv*g4(IK`HdnpSeU-Gcfv;h~^38HDh45U=EY=LBeX_U<#6>$iaKxX9;2 z457>wX3k%9)~lI7m){foC+-Q}mizuIZoyDbdkGf+Z3e+lkmW0C8wlk9+VA_Qus2|>gLPG7UqmLaq*Mg3(^{evw7!A%g zU%0LIH+5O{-X%k$#je(UTkrtdhbRnKc(l(uJOWB#FxPhdiL2d4VcO*U1HhV{(_OtC z*yBx|8AHb%s?7wYsrAEXU`Pz*EJR5j3*sNDK))?=@z9sutnU@*hkGbFKSmH58tTe( zvgUWVzJM!bgPK08QdmRqs-T8o&t_h(*~UAna5bQ^LEyzaRvBSi0th6a?mVUjgPH3S z!M*}!UjSY(3mAj#EA!|4f=4xOx)(DaAJz5kU>QK*Hrc+8b>@sTXbHokvV7P!6nCP* zG2Y(8{Zak7OXLfWaV#Fa5+or{YxdUN_M5mCWbizon^QDqUblfcElpI9dZ;negeRt@ zC)DKGbL?($JN2T_=VQfGmDXFKS+B$5#26S@Z|Jq z_pDiz9p{8a$Arzzl5767dtXvdpy}(=*KuFi%y(}0 zl3q#KHFKV@+d7PXabXw8Z_S|?pm~k@lI(tdCn<^SA$u&4l+RZqZpOjJcTsa^ zh&|D@aVDxy`~EJKpP2SN0THe>qrqooNe>>(#kT5L|K~|m2T}ows<88DtJp7`)6wa| zi7^7G1)lM3SX2?7#P_o9VTGxLJr5GH7IT-A3mR|W=ZsD)EvF*c>1;=Y?Ui0>5^b{X zR3`e*m7IW^;9fwn&C;Gd^J|v|erkjsE||!%P>D=-SevP^P^TNWNKf}pd|k+((!ZPd za@iELI$kjvimWJ3<>;GLNPe7vpf90Q!K0_&6X5*gKc9(KJUTi&n)>`_#$UfJMU+i5 zHR)>5BPcg@!G&urtuTwh*cfffk1=IeA)>q9FI+V-DT@wqV+aRVPo5K*L-3!gc*Jjh z>Iwfuy)^z0C-%cf|I14sc!RRo`-TVfPy`J$^T^L$@?ADG<-<<ow24|HGTO)qny-eCv;olN+FT*T!X9fUK!_qJeKCcMA~b z!*lDe{_LWDxa6UmZww#k4df=%7ggDfBO_L4cUSyE42>kb_zE|LWSLIThR_65r;GdG zzV6B`-=3Vl3uPS^_4#RL+Y&{ zsGod5QeE()@`(TstMo~Y_Z)6dk(sKEy}d#CK4|^v1E}Evta!ir{%OSFyKGrLu$^Y= zNt><2L|;QwE`0h%aLXGV((l==#8lI#?iH>yj^$|v|2IsI@I=;OYC z2FK%g!IQ5(qH1qqD<4en3utfvZ3Un3>*3_x{o$~G>O@~yqJ|fC*X3;pBQsxyj znlWT~yFo3Gu#;FaA^to(sBQLuIkz`)HrZxcR>WB`)3k(M^pYEnB~|SD50^In_z{@+ z;+>*(`j-Xc<4O1~dsaZz*1`q>?B2SEfBRvUVpoIm{WN7aRx;62$f+y8ixKl_zx9F(`VMZ!oAKMrR%L%%MQvgdaL zM68>m42@tU^jZ7T8hmzU=Ti#@F zf>_bT@87S_9tzFXGqEro>cc+V%yFgR3TM?2@b0}Xam{axmnfoPlYOx?`y3sucM?#n zD$f?{lk-FFifbq)xRe;(7Bb5;R3;-aBTBSh_DZG6YKmz8c>9N^`;;2e6XUlpKmPG# zfAT99U}!vne^w5$s25+w#XXz6#rLYsY2>LcXW^pf91fGix^YnC%cg<;zm9uQkh+l2 zx`lXvIUSQo;}N%!18Ob;ksM_1rYAV(#n&Fj`y+lKz98G{dND@mFVVhDk|#H0j*)Z?wpA` z(s{)kjziA`jh;s+2wO=n7Y<~HAK&omosonCdP@!BhATg7qj&t6Ja zG-{>9_w>55P^GlqFCCZ4ndc=<*)c#Z1v|YbR-U)fHJI?kBJfB(Yj? z7*rj59u_DpX5wB@kPtK(&MYm%w2V->;?^OT&vE46;5J8F;Hk*?oNxcQDmr55ZZSWw z=}=y9*vJ91q2k*fl5B{sBg&ZxsA$j($}!s>G$}Qp@;h7?af1OT#5H4G6L#Qi^=fhb zEniz5`&{?{NrFS0g<8@2rw^W6V^zC_&DolZo{2P~XfyQb=0Ww*8-_WWVP|{Gwl4+G zA(4E?<#0S7sBjlIE`5rh(#=1J1;5r1NDE5V`j-V=6G#|BcjI!HpSO)$UN);(ZsRLH zC|!tKkkA>^v6{BApTFnDhveFGi6!N5y4Q>-sx0PEB2CMoZ?t!<;m5&MmywYoYZ9f^ zR_-XZS?VaIw6A;L}m-!;x z$QJsE7#8}U)ho#j;=E<3g0W>Cyr?=HDn4AbeE8RXS5<(Bl)IHcpZW?vA9^eq zaZrjz51A_Pa~7#d7a+Uty(T_u;2qGFhcN zq=+9_k1Musk*s%p%Y*aUH;-t1E$H4~ORWwhACZhRf08mE@S`~Y+*dx9R)=6uX*U-Q z^^_M<7gapusIRAf$W@ciK;i?$Kb0mSiVQm@GSY`)Kc^v{g8TC##Xhehue)@+e8i;0 zgL{cAkS7SdL|d^1zf#l|N9LH@=mQ^6bjFf^TsT8do9T(UbAV*P@CRb{wO-jnRfoMU zi-v3g)v?gU{gzbC9wz~34xVEsaaBZ1|G>uLV2k6|4P~~d*29NP8m{|-KpBA^(7999 zGa03ePX1mZ^%S_LY$0{qKbp6D^K62Dybc59V+A|G%`Zqs7N;+H(c3l;9tf!eZTosS zqkqBYvtX|aSA6~j8YOt=u;gZf3q8r++Uz@nXYLGyH+u3f^K7C(%I$Emy{(p^>JTO* zi+na<@q*&mmQLv_gQs`naevT%Dx05_JNP!mEfyZUwhEV9GA|`ni`MIBnDdF`@*GVT z3+}v-xlPy5HZSSThvdwoz>1#@Y8=;GU6)sT?o)RzBZ-+{ufVQRyxP1wb$(*|sJ8OY$G*b~yKU6J-l6Jx`4Xx?a}CaP}i2VEj=?6jg8E3F^- zk3Xkkyg#l;btk_5o9aFOf&U|*h7)Ow$Q^n*Xzx}+uP(mVksRF1)W`8<-6c8N4{52X z)9D762?+v*uo*;v7qR)p%<93`y!^1Lk0&PGthUhH&a~m2s29x4XXSI#9UiZ!Gapsq z@4l?&CQaq`lpHSG&_AA5wT*}kSug3XWH}*rC^o?yreiLRUt*TI(9xq4Bjm!^=Fq3UwTS8zgbX07 z*iy81YXxz`a^*UVTgRHGd;jj)ptHL3?)FoUBeE>i*5SI3)VuY%t*yS)Mm=7heQCg4 zRH`?_3%TyORQ=j|y&i_?>P)uEOdl+DO*y=T8EPTh#kAdBSYEW?$};4<3iJ02v+aHu zqjG%T<6*!&lsgf7{U=88v$mC-?38drz0R(Qh;}m~UUy5@f|%3lCRu&-Loc~oUEBgM zWZ;Cml9hPQz}l993D)_KQ&P(9E7inN1HO7?>bd&M$*HuOw={z$iBF+_d{#|#MGD4F zQAUDX3?!@CgE4Jr4BCmXAPO`ULNVacY2Cm&?kcMe4{o<-vi4$xvtTB_`kdSzs?2d z)oRZ#4i6hDb*E(|SLO~lU>r zAIftX3}t+>_@ZJ>+P3}3nN5=B#o#K((epZ{+AMQ+Ib{9UPtj&PLpgT6D?lg7@xe&) zFa96mdR&qHwL<_iC83}N$NAE$I)}cKtjHZJtZ5v1)krXCK^48oYsq!6+I<*&IZ%MP zozKE4JVLHMZ!(4g6k*sc$K}3xC_VI1{{mQz+PxQtd%h0qBg~2__cbjtm5DP{ESi-< z=ju}I_SF&L`6S}&8*mx-9QQ-B1Rt`>B}I!Wo{7zdolObHL7`*YLC5g}ul0_j%;a%- z{x|v-^mo8+e3}xmK`iFQxBlvJg46kRi82E|mYxcG9);{y?|Toq1@=9{A>DHt=pJ19 zIST3656IJ{7s;P)qSOi1+-6cT;?e2n=62I=eY^=Ez9gXW zh`B1L$1>CEzU2Dfd67qZ{n%S1*MxNOTxe8M2^gPD5X>9CeI6&75D7-!$*Nb5_kpgO zmqF+42({)q@6GZlUgpp2`1vzG25&!eKu^g~V%~1l+mkud!@L^Xaq8;E+Iw8sRQh)k z3o95z)D5a6B=lUF#`OfaruF3K%%r1fV)n1h-+fdai+Lk6)1MVUD4otit2-b(ItZ@j zdpotLW8-oy30dDo<|EhBZd&0bjHWX;2Wu);r+sQMmVRD8Gb*cY)~_$4kCrtb|M=}- z$iG-RJXb6=v1+qG9q({;2+SmF`(TrBu$E%*Y7ulONld$!co;BZEP7039sdQ#h4hEb zgXtpxpSq}dn81LyXVlazl2}p*1A)oFL#j)@z8>7Ni5wS!1xkVt$D1Y0geHBM3ujML z@sr!5AlK=}6k&TMhB>MP0Q@U4;c18iuXD6yL4@hPw3-~Teb#|*0BecHK+$IVWzves zTqLr2(La3!mB2*b#LCo#Wlq6&^cf99+DDa?zpZ_nmi!d0WBX2>V0TpQ($q| z8wQa!vPECRRqJFN5KhOs22?^T(bkI43-_=#)$gWca_j`6v9%)M#`chJT=2Hsr_VNd{P=m zsUiqbj~l!X2EfnT-n04mBXIh|v--!o;cq>)FkEoeLg0m{XhY)b*Dw`e=pm#?1vu2! zZ3;>0E1tQDdJtmJG8!nYT%=W0RCFFZxa_QU`k9RrqvPYc zJSB6BN4M}UlQm!Z77N#9SkK1mOx4$S(8?5eo)+H!8%F=zM(!yyeA85kw60DFH1vwf zkMA&&lXqoQ>Dz(8Byb;81~K>FmXy4GZD$0WH_BlmZKJX1yPj4Kz^aZ2<2yX;w$Hyb z&;hw4B{!z|^62l!amVKB)x0mVd8%2TsI0`}kj`DXxpzp^&_?>9v_Dy%42q+a=Wvdr zVB`sqDqQVgB>HJN|Bnmd#7;<7lLJ!=jRs3%n^4Jc|HST>lkU-Bqv~Ed^8!SP^s3#-{#&H3*Of^48)ZVRsoV zoPQ^0Z_md;@xbkI%~ax4kC-CQ=jkoWvsmYAee=B&H@C#B=06_LAu3(3;Hs>{`(fJ~ zVwKd#-Vpz740Jf{hX#Ga;}-q+2^{S0k&#LXI(^ntv>}&+lP8&hKCU!0LI$t+ zwa*ba-(S4vX=+noD+B;zt`oGQ@hQ-OWb33DIl%v%hoK^u$y@nZ~nJgWDsj7-(`-+jBAMP0Mr!j*k#1(}e-n`9UY zg+rudRlAxEKoPoJZa8~>wE=R zmS<<(ROIaWKqI9-xB8ea1=eJN1d#sFhRoGNocH>QLUCHSJ*s(1+(t#aWkV!0{Wn^UqZh|2|B^EplBw!}QqytL- zuf4Q+*k>{*I-ShoU4Fliv}{dQ$$HRsh;(&4V?CUVyBxNt=8O(VcZLN88z}+_(Wb=pEQTPzGuv3G}&+v<*|Yo<*hXNL7qz1GnMz@Ot1>Vya$eftdDk;?MR=v|)u zQ8++PvHLgbdRGM$dA_vEo@}pJ)U~;3QL5cydgE*`R$V1F1BB#hZ-$sLg4?MulWm^m zD6Z1;yLgmX3LHtXHz?VJXZ@(Kl+!L1wT8b?yjNh>SFRn@GJ!P#% z=ifz&AA})UIhZcEuaC9WRqgl=2hRFMn@%8(XEvU`g|aRi?U$iH_%9CQ`-iIsFo}6N zF9ci6Nua@a51U?ICWr<)>1R_f~oZTw-(M`2aB6%!Rbu6f7uG5hY27Vp(l-*nn4-)-(+y zzD(4!F^Z7;-nywWljJqCD}72;Yl9_yD}Jthvx@bws?A{ytNl34$2BLH_lC&o$8ucg z>Zyc>OI#YKtnqh87wuhGNlU#BB3PE-$2aW=rlQiGC-u1aiw^R*e>L!(%KNOZ7F|0T zE3-JbdtYPI5QIFnIS2FV-L8jSLmXUum(cDnhyoZ9Xx$x#*bh!7vdvLSl`Dn?*J@r6 z#r%Fze1hjy1tG*qak72 z#W}ONLA5_6Sqo5ns>caM4Ra0l`saKcb}P~r#YCw_i?ZYGHCp@I0$Ke5z(;~q&n=&s zqyfz&`<2fD7a2|@^rzR|fK=J$2T3aZtY-N?9q)chK)30bJ#Ua1HCG1^o3Z`3Zdcv5 z-fQZS=jGkl-{r zgMYsE(^5~sd@&^f?fNHggk;&~R;>~-S0Qz2u({-E7}5vZ;QW06da%<-Ja1U}ir7Gx zoMrrB%BtmPmok6)=4~1{LRN}XB20+!hgJY^kMCQzS60o|+{spwl8%9F99#B|)Lu<% zRnW5UazN5ryaVM}>ECgT-%LS06@bSJxPssZBrPGlRhWrdyIwQxY1-U5IAL)Q89=_G z&bs!IItN!?22?Bqm!XmYEooa>;0(TgoS;xNfJb=-cEB3ZRx<>S@B?r{vKRH~d|T^? zlH6{KAxhP70aA+x(6-jTg?Lt2R3Q}^1{hlpz*@Z{=z@c~mGtgup+KR;9vCn-Mopq5X{^ciEJiW$0J8$5IB%uI3 zJ-F){-`xFS+JUERmA`zkRbDoTk%Q+#8hN1HwCrXGvhjrXsw|=<5|U`jzRPfdlvW3__WU$7L)TA}XNziEUHf;^4S^^5jXYk!tw#%#1G>7FZ9)8FgsN zHhxUl=}ZAJS~msrx#K_kle+3wGm6OlSm4*24fPTAYw{B7jA7^V+@3_T!G@jhp?Bso z@9MOLK*s7Z2-tO3a<+G9Z_)-q`JE&)h#4THI=Hg!BsT@%!0K}pmIdyb^!lQ;TO1-02__8$t-))-WT65{W+tXf*x1|aWF*P1H0OSDqkoSrI&kjpYa^lL z2*g2ZQFYc{gm-kzbL79Z=tU6akUHGdfF0ZcQ2>4G`M9Xwb5q5;fb9YSIA2{c1TJ92 zS@VP-A-91%0!0$pL|P;#UwO-1eIs7YJSMk0&00je`^ywF;uYTF?AF~^C>WR%VBR#2 z_x;62E^}!dA^B~OOk0fOtA82{xgE=CI3L78KTdHMFa;)%>|n_ol_{6;xO?*Vt^5}S z483blx})S>r42b?gAoN&3xwTIAh;1(vzAbH`4G8y;M~a`%MclrqxVT}Nzi0)1*8lV z0VD#5nGjSghFJ*xZYgpU$TZQR#kYcrqb@Q(z=D9(6OSYAJ2^ediQEShyVivhnQQk@ z%>yL^^aeAEN*O_x*_4a!`Ekizk*Tg~8K2N;Tzn}kA~H6-U*?Xlx%D3Z>bfdL`SD}; z>Fu4;7lzV#HY2jMTi0<;^!MPuY+3@;oU{1yO(PGL33=D4!ktsQKoH_Vt0TXq#<9AD{E3+VkJeaDx1 z0GE5YQPNH3gd8c1BI#Gmh$r3BM}M@@!|lbx3nG$OI15*p#N+qcpI;`DfTB9D^gR_? zy6eFcAY=oX)|O`y(z414nkS>ZAf2r*|J}()@GfesLKqe;z;!RHZXVE>l*!$d0Wemk zEwi*`rn~0e;HrCk?t0Ib;n_enpD}~;wIIdCjHtBE&CBEh@$qV0ge%ptH^u)B5FBYE zMOZfTV@!9=MB z_r2gltN7UZU!w(YQ@?RC5-4Ert_9?b6^r7G*wk0LH$Wi%3JS+}6EjWsXcuG4@*v`2 zUjZ8zKvhlfhY4&kqo>R-b3>>A{}9GJ1Y~=o<+a+kMF)e>TmD_omJ(0L{(dGh+Bm!x zINo^(TpRaD@Z$@!tS5%zF?i;?~k8PeY3ZZpe;VIYRkWhrXaWFmzeA{91wZ-22eMRN^UazvmP| zuIwfmO?jwR{dt@ES#$*+P>vsS^HAZH2T8?oRV+0PGy=x{nb!Av!+VqLI);XZcy^^b z^y%s8qeIS%ftow7HT^)p@{B5J%3p9k_{oD&*)mOnQRVfl+$^uq!^)s!FN`9w&)9hds`J$v6~=~bPN9AS|LpW9BbLb`N{rWe0ss7L`s;PT7}Hpvf%P@oq%!oM*uz=C}aH32K$^%w|{=In9~a+uQT3eG*u%Av>^a zx{&r3!&TrIVExnYgoYoI-&ip05fF}n9?e3!LUjr&Q-Ui*w{VMc^YmOnHqEDXye!@2 zf!ldgF6dT)s8HMMomUklnO$f_GeDZ6N!m8RWy|sx>{lf=1tsKNfmOe8*OQO+rW$dY zR0D3KJ0iz?&JWDSaUt&d8|~s7D&4wxda5lU*mZY9k7pdzBa;O4?M&mIjAI&d@vM6- z7EwNe%+IVxZ%_A)gM~9Nb zn&pQu-MAuvu#vQ!*RRLA`;8SA=or!hCR*63A1AQY0$wWx z?upT(vF`vs;-sk1z<*m5k?YOO-?kcz5 zXw2ik>DUeb&rO3Z7Q2I4HD3Jk@|xmDv~9Uz2exNb_&hn9>zFElS_cZ2hQhEOiF%4d z3BMnIn-(9&R!18EaI77YU%q@9Flb(eEDD%Qdp<{3aT*Ox2UMIuycdS+*L@SVqY`|f zNrJKn*E!W;zskzW)SR4zhks|e9Mf0-@aO(oKs7h&X@;O$z~c@y>BSutoy7#5bcz9lSvEud(Z3hl4HOXn)I;?4|+^S4R>Pw zB5TR6Tzz?R^@$teF1Y56BCMRFy{@^f`S4ytNj+wKqB_aZKR$i7+^XI z4N`X*#ZD+TZ9zS*`G0y#F(}mc6N2gcope?a@eT{G1sJ*5Nab-~FSo(o@QI2sAMVx0 z@apf(ZNJQKw>AaUcfAnoa{&`C+jsS(ykdpfRYpzdVaoqbSDP0 z*CP>(p&_^;Wud+OD+G;l$~R@3x)5y@G31anRjb-Pyr*wz$P({9AyqyXq-UWK-IW94 z0?5>C5cghm4qEU5*#)&ccN1sL`N25OQ|aZ}9N=Msh|3sMaQW7bQ}V9N2z;e=Ko^T= z;g=j;x6Kl-eB8(Q^EKUqY8@t@R9LcQ<>a(+%r`Ss#adN>X|>yA=Pxnx@wq9h)0WRr zyRT3)lpVBUQczGVmfJK=RXA^Itn{M&!o`l%<6I^nfl4FXD7I#{hdA2p`RHmvb}F9` z#nx*4&Kj}%4Dhum`p4ccmC&{J^o3)bEV?rJ{=HwR z5+E`Fbgyo9ybN^Vve0KnKc*`Gdde%eA^IDSCr|u?-aI|UvsR*k`}6s2_3)Eu+B2VD z%f=^yxyqp7@z~b#SVYz#a`?b(Z^PoyZUx4od%$qcc|&bS^~O+dYE{)?T!hQ<$mVB# z5AzG~18e-TM&z#cYSEAbK)tN24A-LXSAt@eJw%nuNc2JpVie%9l^1txY($7TO2FJ% z>g;qO^?m;5VIcQ%_Z|yd0w{fZ?E#9T@@Q8X1}f7hndP4EaE5G@%D1-<;l?3}5-Z#$ z&LUsY|KlM{dIM*6dzsi^9tSX~a<%LHdO$DN=gtAa!3{ZJfRYI9ry0=)_vsX~W0_7t zQ*`_-T*ipN&`uZarXdRr;EPsmPrj9PBO$3MI^=D=`B-a6?#pT315v1uz!r%1)M?48q|sTv zWVgi)-bFWhyc@-rDPNB%+IJAk$%oMggk`-2AU>NQAl%D^F+@JTZhDM5|JNPC{~u>p z8Bk@~v=u=S6hR4TlvNs(5Tro`mG16tq&o~S2o(_Nl5XYDr3grabc0A9x|?r~Z>+nE zyx;ErS>yocIrlwz%{4QhAn;Up+hU=Z?6FZY>|TwV(m^#0`zpT~!BVfW!2PK}lstqs z<-?w4r|s!vmXhe=#Z)PGdRb(_u*Xg?Qfq%Xbgb7XCdy{6>4k!Vf|Uo7;n34QRNel2 zHrLK!)YQ^^tDaBD-P#(`Uu;J{SnTPIjJ5$|XW}ckc9sm_K@`K=MXL1t*af|tVGm>m zw<1^T?1O$Z2$eBF!`{1GHeij~mOb(KUDAJg+rmrGgEoj8=_+X|ZRSaiw zXrp}Nb<4u8H7tzV1nr+&bp>brYQ}bdxAWl5bA(xU`G>@oLr?kWPcC>I3ssc_694Ck zNmo5&y05Oexvrw*l4{E{t_Rz0?3|2W3Wvk6lfSL$u|VpPFWQnDs%yu)(xR$wu6XQ) zKXyM@aY&bQGqxt~;?rPhn^GT&w#VkcWfOF*ga?@rF{$E1uNg{tyAcPlyHyWtUasfC ze#XYRGiUTOosB-iqQISIG5(+Pki%~4Fu}&4dbxe)YxdO>Zvm|if^for-aT|*ccz#3 z)5Q0EfsIi$mR=h-U0$51p$jOgw`;&G3LsK{&p*r`e6{|oj#O%EGRF(f>iC(OEr;8a zhtkajhfc$_m59EMIpGc)^2+ByPoH`0PkQq6A8fTu?ph-dhzG;&8{?(fKL6lWDjPA+ z1o~CmvXF}d2;bXBO0&b#*eKH7V-L)AJIs!)IJVr4sTaAEQvu(*iFBP-zi~LwXDkDt z4%xtr@xt>ErUH?B@p(Bo+JE5C|2Me7F9cJI=IQl;W533qbZm_BzzGF;m1i+|t|@tP zc)(h5fts3{EaX1J;Hp@~s+f$TqC=uV_dhYHytq?5JQoA4`iUZfgv44(ov3_X3rTOS zyEshlZ+_c2$mY79!#OO{FwOhy+wFml!W$6$7%tTqXPC5@cs3ob!y|nlO&Ss1XEpYI z7ztE@)%(FHyH}_BIl|49>nw*h~Us>V3M zmXGaF4;TU}1C;ctzkesgu_aIo^cO8HOlRpM)VlM)JETLts}?Y<_yqS`kVWu8>loH7 zdd;C4U%3lY12{ZmESjbAotD0B)+?2tKgYK!EiNvuZhO$;xxcbBXXVKw8N=`1v>|C< zg~}TXr;{R*RBil!@r?hy)SR5AwSZef1dG5tP{M_Qvd~?Dl2v z^>LgMg7tHWx86Z-ec%;O*V zZg;?h^c`+Q$7ux6K*xr_^25n?Wy_K zOrp2sVz%Y;3JPjrH4vFmb7WDHrNEk;0D;!-H=?oO-3U+HK_O|!L3<@PR04GS@&gH1 zhJO-SC9zm?n0NMc!f!{-9=(L|NZhOGQ-zi(p)g000w&X4+I}yourAv?^M;) z#*{q9mEs{NOj{5m*#E?`E4;fEbKoqmprFz2{`}CteY8e+Ph&?C7ha^?DbSMG*PWW) zs4u_?38_MLp&J}?j%5KuoUrI9Zihiqf_TZkY+I_4*%cI%%~3Bv`|f5A!;b4E?{jWcU0E8gOqrdB|MjB{0NZcY&`Sur z8Wq%e@%)>NApY2!UQB*o|B6_)ct1b!*|PGGn=SXIiNsD#N~UG$zTx~&41u2x%Da>Z z@Xw7vf3n0cHvCi%s~aKOWS5V4>>{N;m3wb}YY8 zU0tn|k$beVAAYUh6-V|Z7$$hp(|%NecK$WTKufHl#@6+Vi30-=0j``ShYu5wpJkk> z#@`fR`7^otcM101qJ@Eo{QR9{G1ClMTlaVCVs`5YO7=Hv1p;7?$H8-Cy9(46o{CaJ z(>o1Kc8_6g0D10%u^NUfwWE*s_xJeuBiJysA}k*;rqI7wc|=`g5t*o_&?%h1YO zQfDP5UsuR)*9%y^OziYyRDwY_Y5Dz^3ATe>l!6tSzVL@1u8)3tgKHwiqY%3u~x_U;Ns z=)esz%IER4Fcavus4LJ9wlu+iKY#w*(8RI-zpnZZUsSrGh$EEI-*^RXnV6W=TquV< z!NlE6hkvlckMKXiSP@a7)mT08Y47{L)j)!1&b^tMn8`vQc@Hd>2dazA zv;WZb^q&ukjyX^X0F>p3uNJ20aIw7^U=c=Si0o{ZfAoCPchM3O#&Y)%RZ!8lG6z

ND3V{T}GV90Rj|+#wY6V83qfquXJz@oKKcyF3##D>{5T)wGJ^HVJF~A z(?<6+y2<1QiW&{D0Fr98LWRr=4J zJ7<(B^AGI6vdYM8r8IAmo)(ImYHw}4n)!fI>ZFkStarrlHm}-nWJwPZ0G@Ag++Fs1 zUR-CrVb|N0ikqB%)|(zxxJa^X>suFDlvA+x*y9!a(a1QAiz>SwOHqseZM4>s*>!~Z z@LLyb#@IEnYrcn_tOHdgx1|?ajogdCd}8Zz8^zXoY6XBbv00YEb%Gf?phn%DL#4_Y z1&T^a|L~Jaj8wLvS~#M6-Xl02X1Rf96eJh$RyD>SHw9|KTmuE1z`}q517tiOtm>VI zJaVrPPpvM$U>Nq8e!U!fwZXYESB0@7-#(hz>a$sY_uN67zy%Uc871yQG z9HNPEizVJZ7J|H06wKKkJ%!rbojj<=eRc+kOtvj?{rhoTWd`Ou;b;d3N(oAFwr%==%fK$%1nl8hm8fX3U=}i#)e6 zS~@$4s0na7`-X_t&Q8V`Zq7M2&TSwR{Bw)v0;c|*hN|4#8o<$aJlstdj$+sxBp-0{ z(s}g~D!`3>2g`9# zM_n$^D$iP=p`jV;33LB34gxiwAZCa(sIDSu<^B`CkYGP(Fi;H|m4Xi_nXDnXz{Gt2 zGt9VYgE4RbU#sM?C+~6mYB)(dDy!2F+Gr`NEE!tMx*7BF*w4Cm)YJd2eIpXyW67sr z0ERoDZqas5?NdFQ5n|nY9Gys)Td~lznK0SYY&X>o#L92Gt-Z)-y)@G9K^v-XZ&DQ} zCi+lpGzb}>KlVo!3d0g63~@_-Vc@;f!5`QY+`Yf7MD^&}Mq6^iiM zhg1}ik&)3M|MZJ?3hOFB$BK0*_jb89Z=K`A&&xRKR;DJpmZXqAlw~@1;2={_?Oz@LkR*rp z{-={F_mnT4e))Jd>YzxHRQ#@uh>hawZhL`d>d9>nf#EgYdxexuOYZ;wX{)g_c;Sqo_wI?ruGVBX;^X3^`IPcz=?l2!n>VCZ3`P5dV;t}&X6}v)4O`S; zGx(xEp<^4u`J&*H0~E^RlO=Hlp}F@)JgK_7@U-2`Jh|#-f}ow}jdZ{cjlc zJ&Y0gEl^JT*unIR$-FS)|BTrEvn*6nqHk(_+8b5SJ|~IY7{#LmWY`Y6a{>-Fw^N_N+Ts6`33?9R=r^U_{ZrhZ-13q#x) zuayj&@eDDNqjJB=_++zVb}TeI9j~+;{WTx|9m*0H{6)`6ltyXh^>0fh~&u3 zQym>0U*m?gte)=(ValuPzy8`J$KDs2niu>G3kPdS$#VVI>v+6sG013+IlEyqjxcz$ zHFMp9d>$vXVy#PU?R$`vgIa-ZHC9s?TXMe#_y6Bc{86JDF(1+4BZpc*3J?!N+u?!rSCn6WPBvHx6Rn5vqWMLopDAi zNhDEH(HPdVE`&B+`sEsz5eAsL>vpB3_#Z#g2%fiRMqiec;zCEbzpH}L*@rV$N=qNn zE3my;!^BA+J8c{azq`5B=sP3f^zide&=7MTi`lbm)adi(aR<<8VkXZY(saEMiiDwP=V>n^0rY&ohKFBICYmRurM-0TB3E4Y$y0_c{JQjeAu$#$ot{{Yv$jdFcz3q;dC%4J~>+a&DCf454Fi(9tCh z5N-$SGRxh)8`M$|X1qpb(cEDymgGa@uK=cf>sgBy{*7PWW~&v(rq<}roLIxhfBNuK zq-YXN5`EGyGqVd=I^&+7z~R(TaFkcLLz>sWtz5oje^&BhBC%de_vu?wvR-tRH$0m2 z*elL@V+E$j-8T)BWlFp+&YkAvrRZz4t}kS*{ZvHx1OgT>oDq8i3ki9P$ZU_8>Mh6v zI^L(Vhz}yL7$Rt3kQg4$d+?0U+uQs3vuD$5*8|CJ#w(;NqSkTOH#D>ZLZzUjl$?`u z4Uj35&rk8fxgNwDHbiNuVZE0oFnMgUth*g432|i0#vfpK;oEO}dSpzA86` z@ymL;%*F6Ux0jtzBOp~W0OjWhRf#!3x@XO6R3Uo5Q3r)sM(ct+zhc!C!Nn)~&m8$f z2x2U7l9Mkh2IdeXl5A2xGQS8E#2{r8K1w(9wr{n zW)zTBwyw&goKs2tQOtFHwk=4C9|l-?YQTLAS|r|bwXu0`Ay6>nk}eWJ)*IKNW#u;< z$5OiZP!s;l_{0P^qG@DAyD3IMi*vyGQ8WLJd2)^P-(I}!Bo-T&D(R`UQ+k6i!>0dr zwde-yPjKbXo898J{Tj%@&Y`&TJfPUZL+L6Orm>8!O_6L+!UQ> zdK^DOQhGYWoc?ZY;Un(XS_6)@yXj)v6O@25kxgV441KK+VNg(0>%XHgnUs^0)9hc1 zxM+LO?YS4qblV3vXw15BjuUNXeQpK$Aw#oFwOhk6s^Pe-$W#R0MFU=sgPq}iw0DLr z(Q|J;ki^|}9Cj~Yx|@~Th7fof8XVkH!?)7NR=iY0x>kAO9PQ=HpO|)*KmTS`v6pXI zG)}0N63wte%1r6D=skDcc0!zQW5AY#$0373PWo?exN*828=o(q?Xl!_Kfq~QX`nCVA-tLSi-=~n>;3VPNgCdY!-1+CUimgB(s}* zyUooK z9k^YlFBVCCBdfM3=O&yeo)lfb>JeF2y!ldyxb|U8;}W%t03F*Ad5xDS2JeQ_nLx{k zx;>$h5GBnvi-N%|I&_OjUO@V!`C8#6oV0|12?)b2a9LlH>}Q=EEHQb1CLO{V{1GuBO`B*-@gB| z_}^AvE;{LMxoFt+TC?i6Z{HS zFq(QLGTafg(&gJTn#i=dd#~_P96J@1P!9gSxcF=xzTdB-@WYSkIAiPAuW3ys&x6Zu zXhG&-vQaKO=(cr6!@Z5Ygq{>z(F9qZJ=f)wX4^3(Oh9loW%X`GyV&j9U3AszY4S)t z>2ET~o$e@#ne|VtBi8=+!&b9LXv{X{f}j!W@bs?98ee z-AWTD0}oGsm<_SO221&7yd1*fPFq`>sf7jb^AyyAs*Y&a2vSlrG8!E;jcN#qh>+2)^KDT6CNio|{o9AhEUng^!V$5)&$CIVWk5hK zu*%s_a?*eWJ*_-fkuux0+UWV!Aa*)7DJ{kr-N8p|Ne$@=OQ#jUp6TkE6=y6f>d7|Z zs!hrHToJo~@hqQI$szAKEoA6- z>KVK*XZwiR#VU-Ta9@>i-(9N3nRJ>jnzXAWcQpR~w>OQntZE{Q zEq3spMKs4T(-;zxl7pO937!}7E_ouSbMZ8U-Ah(3D0EZKy>C(T7fA^+_@YBFFU6g& zudV&zd{9G(DKo-HAcT%FV@WxtAO`)EkEwn=IwW+)wn^VEUnncX=(=^NI{oW$fD?%} zeqyz}Q2*Y&XC8YlMKZ*ZILx0@CW(0>>*Hn6T`B%WNY}`U? z7h+sy=zPr{KYrXJ94*N+mj`dNqhb`5SY$g{kJ;q1`0*6aDy661V+Ud+-+GT;t>c5h zUjT)_+rdA6bwXadHo7XmH5pR(zT%S4CQrH)O%x=(nR)fpldnU*vF%bT8rA3C_A*)~ zh1ypyo{`}Z%Hr8ahZOM#vpOYoz$Z^f?; zdLHcEIDlm-dG1QdliB6lX%x=i@sJTjXgq6eha@O}Jk>va_&u%I+*}p2g86R1;aDHW zJjYR5+ib1b%o2`uAhM8Fh}swmv|Kp$U`ZjI^FU2y*m63E;TqN5svyanC;SG=@7Zbl zW`o>wXpe|NHv_>~@2{wC&PhTG;wsSQi}NaAzKlPmfTqaXc8I}~q4A@beTcSv^V4aT3&?tlHWRRF!7 z!ECL?-qk!#Dm_!nY=&geCMTP7UcvoVknk<;{ba0R^35)$l@hb_EaiM1`QvY&3Xxbu z>$>Yhp&?%Q;imz6{D~tRiMIz8ZtJ<`N<3c190(bRg`}I+u?ZQaA*+PdJZr!*pSJSc ziP#?#*sltU)96xnokM8k7jQtS6$!d*S~AXn<}MkVlq3RT=dfY%n9v!y4Mv5W8MK}8 zEG7SdPLvETC1v(Xi@>nX9LiE;(lIi21d#I-&siZ0As*?A=H4|$7r9yNXZ=nh;>JyS zTeAwbeGx%T<+rw;n=SRw*|>bO@7>Z)_z@8Fp()^YLFLe+aTkMoSy^Qp&HCwel@3Us*U2(9tfOGC5E^cha{#tf1urSEGie;}svgA7s zEoAMHH*LwfWW2ltux1BKx<|nPlh5STXker(WjkHXU3R`;4Jc_n(83~tYUJdh zpKi6*qLF8&A(zzo$Txe#$-}t}LQ(KyS~Jf_Ile(Nt6G)wB0nlCVXV;+;z>i2bxwy$ z@4qtIi(Y71R}9Yx#(H~vpP4b819@S7)N9_A?XkvC!cjI&cGKqFAZ`lG0N5~eskuBN zcic!g7VOkS2;ao`cxq~D-KeTwjo$<1_HrsdzOp@wf|de{{w@Q|V78C)S@U&0kJ)xu z*S}{rgxQMRzHR*FL>bKZgVdfB!mwe*_44m_znjzyKehdLm{y1fu$3 z%c1O=B*PfXw2XM6m~nGyb+-_yjnQY-RS&5-=H7yDF7oxG5k7F5{S| zaLBAp$*DJ+LZC^td7URdt9hU1o;}gjL}R)7U{J27QH@;zjZ*W1c_!PW_`Q1%wl2iW z%E?9bJ$#@j92*%Qk9k;(_v`P)V4F8E03<(7x6|)t!NT=@Wsj`FJO0ySj|*raWIXc3 z|0-IXl^{#-^70xBY~VXe)>=_z6qoAmgHxfQp*+0$ii&Tl!iVs=umVyuGv}IC{q9sI zqk)qVzZ!J%9EExRSaTE)!fsj?z&d&s7O9#RBQvvf;GVMUf58O@DamHk+yQY2D(whu zQIM7WX6<*~N=kTGS5I$z%BTn;!-ln}-q7n6>7i_T#r;9)W;f*)8gKdwGAXebHJda# zFOCQq7#bFH^Q|3mQHKQ(BN*O3HCNqBoGUn1>qvHkgR*ZWc4{gkCdKf{{knUWTwVFS z&x!h=KY>v9w%Qab+|s2!xn<4C=zo}q{rh_-gn$+4y-2_D&951q5hBl3U5wgW_Zkc*U`h4(o3sP*9dK5H-uAFA#*h*KEy)FD>0_`3raS+q`ea!KS9d_;_t2&!z|Y zCu5CQa$A!}?~UQ(fAKUObRgR$SjEdmFn6mE+`gLdr_lKxi4*1MwX zUS7wNC%DVMgyAO6dxScfYBkT+afwPu_`94hw=4Bd*xBB85xoKhIUKV97VGiI+Q4y& zUn|FTVLu2ixS85%!P zPlabi?YgvrLSlqfxe7`wP*^txM(;t$f_TsC+8bt3g}J%84byw}S{j!5ME)nnBg?3C zkBU|QY_>of_9SCdi|&w?Gsv&^xgn{g0N-AZU?U{hw!-WHJh6B=k7vgpdMkdN)OnXP zX$#=Jvx@N_ED@r*a;18dQS1$4+oN;*v_W;>bGkYo0)@6?^yMpRQSOYmW5+%5X<1m5 z0P(IZSg&WuPAMu<$u3(DA4HI0Wc8BIew6ea3@r(qMcQt=xI4PCQXXZ5{;JOV%sJmz zGF?^^71A=YHa0TXwLLRUO*m8paFx3@!Zx%EeSP00iZ3o=i65D;ak1czxY322kfEZZ zyJz^JcfJigQxXbyv6PVFj&WPQ2X&CP(N>ga=D?>K3H$-d_vQP=v;hqL*$kPB`}dw3 zY?hb1_vg(o9t6h-F7_~f7=M~1KA8HUWk)2rdFWf{ap{`$*Vo@}#P%P|zau(Aj!Wrq zzc*DYr(0p^8ye1xe0=1<>4^diKRrGj&g=YsklTnyU~g4!@Ifs(PQwW_G`p|&Xl`aC zpzdYS5Ht9}zUzFED5QaLtgNiNuO9JTY8BE3%!!_fX|8o5HR3}HzuKismng#a$3wLg z&CN4JCmc53W!BVYk~>-qm2MRhS{@NnffTtgw8DQn&tJbjpS@?1QT1uyC#f7c0WsYK zPI^~&w;5H16rDN~)>#~RcRkTioVy+#7YH@8gF9psRquzqwUhVujv9V3VI}$$>z?%1 zM;GLyjW1jMW>p^lF*koXqX8Q2U2>oF1;8LW9W@zZ=Ins(3)8UzXS1|aR8paV3kf+z zr$MNOVL_M7Wc-~Mc6)2x$W2Z@2M)`L#<1ssfeUFbZlLdKR`%kiEG}BBIR!24!Gs0t zJ0c1W898QwDVl_R5>6PmH9srPCg{gi5Wd(&#WISmSW#(dp;m0LeJ8g)*> zu6sjS+&qM7m9A4gt#H%SQ38o@7Wr&=c5{IzTj#@3a>;cCkXkwM6D3pMUa)k*`~_@8 zUS7`=@V%`MF|b!R#{!Wxq*fhUlQG%Q5MH9!$L%jL!mL%U?zuZgiAu;296Y;y{grgS z*wS8Jr!caLXXyQj*+hDKdsT~U63e$|7&^PU9F&((2p2Hw-U&TAX>V**vIQ+?>7D6H zNU#ThN@%Lt7HAE-Aer_S4)(Ks&JcgKI7|LFojO>CA-uP|+#eTi#iF6927|!RL3{%U z%BL#Q(nn+73ZEXvbFdr6pE5MOf2i^Qw;aVzqRROLmX&_u5)uduZg;?bB%_?0!Z}ze zpq!lVMd~^#11(TL02J)pn@)?daw`P{)ipKsm~ErNvDsLUeSML6?UEn({Mcac`b@u7 z1p-5uI8GX+WFGq~F+99OkTtEYb!#e|0Gj?(Hl3a zN-Y*QuOXt)QUD}S=sl8>DCz*IYis)<-*V7&qtJC@K4fXC&K|%^7;KbAbBp+qmlhWK zsZ;$ABmJ%=qk5{D=dOb#wJKfmk>p*v?JJPn@P>&Rz5a^EbN5S&kd&vFmk&{k0Wg$E zKC!pA;F%3A)sDut-A#{AU%)3JnaP^w<(9g2>(Tnc@UV_QjMTu{_}Os$)e&H>YQ`28 zDnn#cdlfV{bZJERcORxrg$Z3~sqfx>?~(D}kOUxPwP#PzozCPP(QOzm9V%7tq(|`jcyFL@N4;NM}?b49wCn z=9AQ}52o90IoOpdgP}yVYnu~Ym`K!AC$|q=>6b^6(b|(F+IP2BP%&}|kNO8ge^pgg zd|X`Zf`Df4E8495B8`Dr*vy5MW+hu~fgbOm6UEtr&r>4>)UPoKrj7!t0zT1;KlgqFImJ z6O=8HHQnyM{*w14@t`JW0zvrS5Lo>LYQl6ZfHKcjJ)$8O!(%x)&K>3$3Cs_HR$ z;Fh4ccq@-7>Dj;Vc#G%aJ+iAdal|_|j#ywN{7Gtlm!E%srG)WNom_CQlHlo!7brWQ z3h;1Retwq`E~@n2%#CD>$!z(Ax%r_I8VYAS*4O6;(^pqlaS%1h4cx_m5Rix1^$vgi z`qjX7rv|*@dO>Y8=?=IOUWeh%py=6#Fg{{4`-1GO^|pu`M!QgaR@eMA+&sN}{Ni5( zK6Wcm`#-Y){(Gs1W6;6T*_Tzlf{cV+>jbAVwtM55+ z6pw^zBpdAL7Ls8sc_;k3p3jiyfjg>LnwFitvCRxHuPKvyVJi~(o(j$VPNi-x7ozj^ z>(`R@K%9x{3$U6pIi_xJzqTZ2UmO?^FyCRw{v+V0`|Y>eHC3dcY?B$OsSlhPpr9q` zKixc>`Vs#~P57(CdfoC~p1IZMY+dF+I6L?s7wj4U)Vzd_2=j!Ad z%28%Q#^Y4X%sYMZiSO-oLz0@yf4RM%Zj1uOSRMPkXXWj@GBFK2m^O$=lH%hpXOl0Y zysw-T-#B7T8PmI*78gdo?%5?BLwV^z4<#MRy1F`t+KW*ln`&U2f!<2qmCurrkzwxH zKfW^G#n}g?m145?t|{?YIW>#^?xERK3miue3BVS(qM)G}PMj=D>Tg0!?lL3RDv0t% zrW}hsU!#DHK0Ln-TMF92MDngynlTym^5lMme`V6pV?hjbb_Tkw6j@+$(xoP1VIT0r zxQsi4J$NK!S$V+ zQKv-9Hpia4E{ONjsr}_fN6b*Qjy)(Pt-(a1OaR@dp$|Wme}7XiF#0ktueucP(qj*p zXX|W7epNJ90Q2G{i2$@%KihEeB*|Kih^DB%VpA^b5ibC&x)hW`Enwa+;)$qzZ*Nv^ z$y$k-6{mbnw0apJ(d0M}3@vp@=alF$kQWQ0pu7bkqVSB(oY@Rb-$`$^2$=}o*(;;n$7VE5q z^z)*|ZCb=h!4Q;-_kGIEmmBnCNd^`4F!t-+UUtGbS-5^F!J^^fn4uq^R**>J*O`U= ztVoRpg<<*ruDMkuYNwfZew0(6+a_rlfBQ830RKoaEhhvq?amgY)EvfSRLM6Fd&8`e z2C$wzOm;1$r#dk)v1LreX*@8)fWk|XGh9ttNoGh)PVWB{2P~4gW*KSe z5n0TRERB-c=I*ZE!IrJ5wuJus#e1f+b)5}i?)CtIb=xEjg(0B4V)y+E-dPg%FJW{R z98l9>WE6)AmA#AP-z1zba(V;oH*U$02WZ&z3#%2oMQMX%Cc-!JUod^|Q%>yeEOJYH zy;Jq7{!xA<4fUo;OPqzNi9MfR-KJYha0lz$aKu10)-dOtT6u;}!d<)DP^s?P@bDO( zoHjb1BzRV$F~^5rqO<2xI|psw(kboY_sJg0d&)lwz!I&>ykt_qm-=V;%o-WxKrnxwn1uP7_ zH)vRr&=`VBpuK6}^XKm|jsuiiAg8qhqJP5}MoP{_%<>QCnFI}bB(QqJt)hZ`5k7JlZ6YrvbD0=T6o*16K_}z&E|9|^H5P$ z>X84%Lrtj3`axyg3jBIMHu+(-!pB}4YJhZ70 zZf%J8kA#5n?xg^KgsM0pfS*r%i2)|EL}6iLm+%Qf-VB^k4)<+4wH!Y@T7$d?ms(>m z{Bk}3R^HF2>)?>z@KNLd#u3al_jdJa6u!da9j8T~fh~WnO`O96H$tkiW8Bxnsz}Br z1+HDoY2enA3dn+M`GU!VVy-60s(ET^M#IgG<#dH1$R)q&3)$J_)dW&n@%WZqs@ZtL za7nsmL02@Cvt`HmDQSD-D`N>0-vP0RWIH%gHECqTFm!UJ z$?JPh+`nwqU0okE@$$Xcx~HwQVPowbxLfXDl$4b(9+xh2enor~7tsTiziO3{?dh=r zs{o||7J>N}po)_I{=KY-puB}eCe!Sd%a?^=e_6h$8o3tUMOrky+5mf}4uB&vflvv< z-P5X66Jikv1r#V%)dx_$X=|*IMpsibpPDBOsVSZ{GuwqsSV2=WtFW*zKgqJ9tE0mh z#$pYw7udsKh~znw$@Y8kx)6i6hWT z-SMhv5P)z<(*8;qF?8I{OSD! zKIlL0ylKn9$;rUUspRbJ+$GhI)de-U;5Tn38e4HVe5z3PC~j_(bPUD}?QPh%-60|$ z?wsBS!U2ZQ*(b?`AlMvLe=*(hMxK_rC4+#1ZqfDgQK}sh#w`J%Gdvf zaq5Yafd=6bs4HaYQ>gX`qYJ92=M2O?L(VNPmn^9O)$u_D|JL^ldjLKK;hi*CK*$@G_}HcUc+ z4yG?DhPqq$1GW`N?juOf@1~4mdapI}EYO1Wu`o`~MzI+4V1}0DTrHmfsnHAoN8O8~RXln9B{%n0+e2Qzj{o*e*VSNqtHslDXP@|nxpINM zh*QtO-td8a|8Um>xNZ8YSCi)6WdFqq9jO|dw`0HG4dGa%8(!ZuZ0F}-zu{_5DN-n} zhAHp7rZdM~*i5(^tdw_8g2|sktvmEWg#PO*mn0IHRO)m1{U+6qezW~cxFeMP+C=bE zm6aC?fgu~oy}ICe;Hm)v+E@5MFL@idtZJ=+cocT4Q*DxawVZw^G6>hJG9-}B^1CgN zX6EL3O)J1OONY_Ux7_9Af`;5yB-8`}lu+<3&$6@xwa5zR$s?X*Tc4j|MGI?& z7RgOV^SRnhCx&vL31U4N-elN|-C!EAn;5G3^_=Hs74U6D4wlCoP~8X&0s`ep|!o+uHWpotBwm=W|ner$cd%bv7)h`5ftz<_WK6``541I5Ug|O9F@zRmUJ~7dJS-peOPDu^YYkCaE4`kjcHf2B4-rlaDsMw|8n4AyHA&E$? zw)LGn0I!OQiX=Uy8o~1bf*Lh3yKweuP-+!kfqAd)PSH#&*D)(U&+M}=*1Pje+WUr= z$&S3b%Cl%S!%zDU_*#LN;bi?&etZoV{ka&9iOTv(iyoDtC(Hj#f%YN=js#8b-~={SjCQ?Xtf$xz=Zv4MDMbR0at0*49>j?bk#x ziz+=`UAxgIL^IpS>~6rE)uUd!pl|zCq8R_c)>GOTbc_+*FuxIjFmAh?+aoE@b#3xyEK zmf)k=1Nqpj?)4SbT-_&fW~=YMY?hNgJuc0ct9$0C-2KnC-OCKk6{{g5PRhA{|ArQ+ z1DU*-Si0E-r?}!m+K=0pqDu@|opr8rvfu%eUnVsu^P`jsu=(Yctsh4yX>%W)gj=B` zP-4y_zY7YKb`TOE!;GS3+}+F6q>Ru@fzHT~Ukj|zcwb5nW#A~-AxXr2yecp+>hQw2 zx%>rNsaZOHeoab`%{VAjeEYhJYJPwiCr*lg>vQ6RW_+m_ehpNk5?dmUmVNtOGjbsyFm5rXW zl+3aY+IMR-2g^SCQ{;E_KLOhUrj%Mw5Z<=_V}Ut7<#}trZ0s4Nu)HQ=;^t#|EvxF( zwIDs;eV$c1U{|x@YhziPlHK_qUXTT1?Sl#tkD@97JC&d`S_Tg0`_i5}kQTx@%-&=b zO_1>IZ;sh;h*yu1>K{iRpjvFbbq>{Cw>{3{nF>CVvvD(<4&cu`z`Xs*dAi1LK6>|z=sXo{DjRi_4$Gn3Lfd>Tp{!J zpaxgYX$*Yva2`j?>=q~x#G*P-VA%hZ4784%^ozX;5G`gZ*4eWbX7g>`m6d|1Qm<;c zThX_%F;{lc{#e`Qo1!Q>fh70zyZj^yPY$&A<-5ybsOB5yrt6jiTtA#y?EqXWj#)9o zwTLEFglPyWB+>M}L7Y7fhBh-)%R@z^lITdp_s?2<3@r-_pFFGMn*pi)80Ap+^ub!j zywfbDYWSW;%o#7%W=-|CBlO;)NWD)Nf({Dy_*rkoLy>AUy*E&yApJ-B*98c~w$k_-&lCYT$VXjPv~3fQME4 zg-x1sCi6GV2=G*f({H+P_xu*rc-jQKd@ltd(l#p#e!=_(J3ASYeTdD>P4Mbb*KNm? z13hw3ZX-vEi;2kzKlF4LtjTR!y8T#iNn#|=3(wF7I`OzQ7V18YR{27D7OKS@ecRBo z0{7a`W{KODje(Jo84ysR9arwoohNgYeP6!ZB|a`tBajEo@z&Yi4;)rQynzQoB|zwf z=BZpJKZ~&%|N54ef-<4r&Dvp1H`aA_-OrdP;`Il^rmyEb7EeI;sPjPk;`Q}4sIHp9 zv_ozHbkow)4N8w!=iHZ&=uDAGKoKBJ<_C&wj0&581H1~8>e6Ck1w~?mmt0V(3<;l0 zUVn>*s%je0bnd1rX1;fqx@ZO|QCfMqCJ>pRxLDvWB_7Ux&vv3Q7hbcn8`;*Z)Ms_~ z)N>cX?5Nupr)x07NpMN*H6u3RPlR1RD+xF)gXLsoIbkjq0nLEG8 zoFX8c*!5*wcbJ5}bdoSup?@2j%T>BAJmX#$r7%0ZoWgGY_AATgMT$ZXsgsD3E~D3D zf7#P{26Sj+Pz+NOlck6;b?A(6*_kWYGZZjKN5yinRw%;68uFjTKPKK=oTVx8RSMCX zHDfCg;!O9&({fGD*_6?00xuaU@&R9l5sLYiR{3ZUvSXE7g$mB)L1pK->_wxWgFy-;ebF{ne_G_s#aN zs|5z6JKymh-GchLY?3@orm?wS^zdOsnN0v{48~Q%o`*3tXQGonUwom|Wq|kVX9WZ0 zT~3F3Ubn&0VkA_%?$=aNX8TgOJjJE;bkpg%@T<%ErDGSYz@0L?B>hST0lnRQ6sMmN zu!fbcu-<-r+bq}_;7yafn{qmr@P<_0D*SF95BqAn6rdaT{VS`x$2CFPDVWZalV?Jj zXa|(W$YbncGAEn-)1~9l-P~W?LEBHEB1IaGC!oR|k z#lAkqquWDUJ%N!sBaod~E4d7?QZf5R(PVUbgk|wPRGXWyuy7 z`{@%h#PqB?)PbP^iJq!*X~)miX#l567hE-Jlz<0G2)D{dp=Qs!mcGvlkH)P8V^M8L z#olpST4>ED1);_uuI0IhXf(9Vo`#FbvmQmUW0W+V6RWly=9Xim2y6;f{w{?T6(vmR zp|P=v(pKurzPLcSI8&a0pudsT2gr{p9fHqscwn$a|Mfj+Xsy2(35DHodNE76qje|07+PL!Tir1CXtaWH+xFXw?bfKJ|KKsfI5^g=2w|WZ^B9 z3V8p!U>gh!1escUkQ9vd$ncgrz;Xwtrs;jW6fqPu)OZxd8k4HE?3icj%#aU7x**gy zZuzb>^R(zlNMT(aY&_@ox|Hty8#Opr;hG}epyE)r-^>y_gpsdc0i*BY~$M_k;!=meon)fkw; zpx{cne${LDF8+Gx*AMIAbqXxj@-b7-a2rUGa01!vrlfs2%IL056cC|txNR;DiMJ^K z108m|Dy^cC-HF^VHjpguvSR4ikzsxoc% zte+y%=&2*qthDr55voVBEkRkxsg5=~6C`9=dV>?S?Au8zE^S?R4=@jq%&Pmri$cY=vrW1=LGonTJR z%|vw(@av|j!ouOMc}eMyUhAIhN6i*fAqi&}Ii7S(DLM9k(Z;h;5Q4&dc~K8gATv>I zRfCr0@7}#beKdT6-UGWhCW!M^wD%J7)$eY|shuJ8Ko{YV^P zoki#Z0p8CG@rw0Bij zR}@qvl(Yb8MLLd3N_a#-q>*l<8x~zfT0}aIf`GJim(nTSAt~Km@BDOk|Ld+G`+nm> zJm-mdChoasX3?S6kZ{9N=+5qM zf9mOVFef(pgOJMyM}5L#`wX)tHPr6>|2mJ&Lb;QDL_tSdJ;7#Lu zx!Fb%t=mP&!pZ`@kKE`Nb!;Da(&*;k2I->ed`EwMOoM-_@$J*6{f@YL(h7Dhg^!@i zl&edEu|^JAo1k+!Z4Y-T!R9p@^OT#9ZrXAa9^6|Ko|r}QR|O7O>qYC}#5VxUmOd;_^zAF$8;1XEANznme5n*Twt)>9#f-RrESV*FMn42-`s}RVQhj zILDD(-*SV~>g`>wWs-YkvnA6ap6TT2zwkhwLJ73Lw&k243kw_B#6k(4&#kMty!!St z58pxtyHr8f9yeL5O`ps860i7~$EnZ_F|zpAh|Zuj*IgaQFaK~I10l+Py%kd)pumJlUI7IM?*u`ZJO>#NW4aEe!}g!Z48+*9VBhH-1GxQ_PwwE zEgH*0?qF~Vv{4xDv@__Hp7a~OCpS7lb0{`4T>x6cbjk+Ak+zl>r`To2k5*muNu5`? zJY#Gid?%psE+x$eNEKhPiVc6Sx}VF`0t>yl)-o574^X-U0>C3YAM@-uXxNWNf=Mhw zg^wDAaUOu+eQ8Jmi7N)@sOad4Cxl9$Q@jNKA6l0C(#Q7Z`0j+m@x|yhKq=9OabruZ*!c*SePkEQC6<`n^teLH`}D+w6{j> z3r6jG`?N9@g|}(Fg%zZjTi9^OxQPlExC-&0HO}NEBdGXuVT2~h#seiTHRhG!mkT;N zJJ0(1`bx&%6Z~!Ww%Y&tr-v*@Lpa!oyr$WoA9Wb7vg1ZEamO92pW-3F)!(on;aV`8 zL~Yx-qup!A9Tr-+Pr779J+P--)v@o2(tfKOoFp;1*UVd$UYPDuOT6{IV>P*)%ks`lD< z2lwo@$NSd~9JXB_d~E>Z&-@at#!IJ(h%o_8srJLReO~x+p0#k^$+8kRbgsQja*uLl zF&3X%YA|k!%5HBZ+pAyh{5=RddwGL4&rBlYt-7JMevSG0t^f5G|9*?>Z3j4!hdzreXHxX+*+rtl*0OpE%teDVRa@~pU5UY z3Gsj3-1WbHqjD0O>&pF(+v5G-R&%0)fr*LzloyXZ6W;pb?DAHFqb9|MxT%!!Kzkz* zG3_&T6FX0fnjY3CSOUr6b2(D8l0xxDU8h^{j|YzA=jS)HwE5m|t+Ghp7S!Z>x}~RO zLCTch%hr<4)?j8B(G_ImI`IAlt=QWh(-zwoXK!g6BG7- z`u?8}FcT1wo$*MhC%yHA*IeU0#xs}Kwh$6pJAW&O|Ja!h%)kmF@K@dI)V_0n^AfLiCa-g+7&ua{j+LXy-JOcP z?MO`Jn2#UF7jjhAXl|}kekLuhAsMfi&4CAlFpc?6Q*IquPa_N8^Q&YJ*&SxUey;If zmm||(-PcLBlQJCHp1q;%?u8zjc_qW%ksQHddiH?ybLwXr<`Z{eB1l7T>q!sMr0n;V zJ$vfPA}P2iae-UM|KmMbU!L76UoJ_t^%@@5m*JEWD7}6yeNn(v)Tp@?Q_#pNS&$T; zb+z?cU%;;pa&0+Rs$6NK?2>Noxm`_47_p)0FirWtXaDCL+v_(fWLVSfX}Tw2AN=A| z9b%s)P4$fEV$KGbH>}5IAcMt6OZU3* zGcT~7B&W=+-}gW2kdt8`DJ65QX51m@yK|uaJOw6XjEkgcPHbH7VicRe`utvM>EB28 z+~~l>y=EcN4=+A*7_;^X=4yvEHon%@9jiWsYGGRzy2@cJA`1n%DrYU<*IDl_-ec)`{;R#*S6!2ugBjU#5krU_HRM^y=z_Ap z$@b(6hWSgad7;J`hoO+yjb1)O{n(~pL`Q2Y=0B{G$?^bcTc&`utb_s)QDCmPar0m@7-k?;dXA$7S!NMTK8!&FChXiuo8V?Li#QG*d181WX6B99dJ~$}I>A11wkBOsJwpDiq3OD_CUi|u-2!`2Uz7p+l^~*rPX(+Z@ ztq|fp1@i<8)7pOx@!IR_{yNLAKfLS5Dl01=_*ki!9BnI&Lr($N%2$U$2=0b%^0us7F2zx_o-t zAjS(_+F$WR6sNVlo6x0%Urg*GL$<>uBT8(ch~yX)-a2%uR93iZbIBme4s9R=!U6+bG=GsP=4WXgE1zU2n^fYv}Y} z_S&!Ew8+?RS1@4}uN~uAHUC0ZgTrC{A?2cu<;pUajwW9bYvc}Xe)g)8?gMy%aD1* z$BsKIKn>Ccq~Xe2%;H7^1`=k6Y^zX&d5p8xO%Odn8uZ86sA^2L&rScAe*C8 zp6f0-;>tF5bl*1GbKCO@piRS2uL9jy4m-{s_Qv5uQZS5NBes1TN_itJNib7+0B`OS zt|L2uOpNlmCkP;t4B)7S$+LZ$q$^FBZ8I|q3(W+{??~@oV*Jk^vVsoXj(ikG)=_}M z+gx02I;_Toe`u+yZ6smCW-k{ql~C(Hp1|2dIXIyrPk5)mKj29n)rOg$+s$4)<`ja0W#MnBt=GauwYNPy zq{*>55Jk8#DC1bW)3iS6vM(p`ydy?^RCg32r1hbO_5Fwg!&{zIp z)%JQmDzC9acu=RkrrXJJ4Y~-u^!vz18ICE>d9xaISVI4U-c%tqbiE|(c&i-akvx}7 z+dC>^sF1PA|MI1$`XdYt&Ecdsr7x#%y*U->*OHFti(<5ROLepv=KP-8;Yi+H81YKW zLNGVV)@NsmwCD5$av~cP&vFg>On0W!LcidG8fj%^a%6_H8q`fUbab47iQN(~9%@)g z=57|?dF=)Ewj)S0*n!v1WUik+(QY_BMm(E2=%w8zAvKgqqga{x5O@Cv@^1DJ4mlSo zE?hsNE)D9*Z(&^(9~wG=YcS#3w^78XX%op8hJ)!2{nfs+C!hXy2*(Q_p2W0UEsfaT z1)77>_Y_XX1sjQBq0FNaJtXcaa1eKW{_Yg^Ki5g2bUgKN9vDM{vFG-1Xte{Q7RD47(Ltl18jgth5h$lxA2L=X+z*#CTE+!%( zTFst-(mG>cA!C%BaY$XE3p=Zgt!*^VXq=HSrO#Yoy&Wa*dV^PiO=a7EaQvJEE-{52 zwYHg#R15qVT`QkE5Xr~9BU>Z-GbxWXL%~A(iW-yBGoBtURT!On)1M)IMWIlp@C{36 z7w6StboE4%$D>D&vdM}a_WVvdB#?7k{iHOm4Q%hZo*4%=L=0Aks8DD4`1t&3f*Wn| z@bKmuMHw{`FZVFv3h1>ZdNA6LiO%J;AjP=V?!2y?0=kufh zGG*8|*xINpvx!g5qPEvCYFp;cqv}c7-4|u=5Ye7Z6a{3XWIaP%0!+StSC`vfU*Hrc z2GT9PT+~!9zO($=3p!`aWn|{lJ504X&;xHYnHryW@ilHK2If)n3kZ~kb7S~U?%%M> zBo(<7ECd$8J>(3yXyKI3RHg z^Y>0)$EoM=Z9&e(F#tXXvh&MFNQccFHICt&7SidM&8s)Lbnu!?5&}>Gh^Lj-lIQ|s zkS`f@^X;|fpPbGhz|hIQ_l3|jjM`zv(vhuqXM08K5u&3(RIuk&fMo883?Q5XL)yjm ziEs$-+O_Yk@B0!tE8^tso!q{>=?%yP=Vr|4P=^e|Xz{2Ed#5BGtxUZmMT-=Z>x)at zIWP!KZ|d0+ID~KFq4+RnZ%nu-9{k`S>E0(zdBBbRBHg$B7LBDop8M>%m25pxusSDs zcwn<=HnM7dBtL>D-J0}(v|_`F2{~3lTL6BgX?}`jl6}c$@yp&Dxh@CP;6faS+pn^- zIkx+8w<&n%T+k&l5U_j+MS)SN;_l(GrO;#NC?PLTTsTi#cxncmzX7P|!ps;?qW8Su zZ*M9~)#)+M7R1JhAlH(N7Uip}CHN~r6U1n;B+rJ1N7QsAn27|&Y~}Ob2$Xj9HF_!` zv_UF=^M!fu-t&a77aW0E%KlW|TBgviQT>2l<2liCFLE!$>dY-AIr2bbT5lo}k`RBL zuFlSZ#N26D%lrw&xz@}A7FX9z$|!5=6Rh7I!v1{)p?UE+j&)~t?&xrTH2sFEqh+7_ zdhKZK;2@Uqzb3;$HVlj$lD7|1lCy}jrnVR|;6v~&Dc@Pi6Rd|7d<$`VHC>TYM ze|zeG*AoBQjmm8ZAdi)+ap9JY&@0YWAQ-)cgTv)feUfF!7=$=qZ=u(z{GB&F)1t)n z>BRZ1)abp@s1x^}!5CD60mXfM@V)^4bm*ED zB$vS0RVYB1?2Yq0C?MdO`qeX#WC}zI5rF86huF#Z`^o417?sHoBLRNpmDFt?9=pB8 z;oEJ}NjVj6=28tN6%n0OXQf5{OJvScU?Vy>tY@6C`k%BO-nD?E(A~{rUDY zD3r03`jTmsg@Wf5L}TlOOed;~(SQI#q%@z@KAEAyC`U8e+LTRqcD4j3Ry|kaz$$ z_763EZ@J+IP~#f-=rqb0_w4^->JAZj6om%g8@4|pCD9;!jTP(>=4|jr_*-rLdz*2z zXH^ZVYhe*B;%xj6QnY`(2<#i+M%=U#(flt$d*c4b1C|9A=9?WmHgq#}cOwC`6>h*+ZRDePRR`FUa0G9a)Mm zp_Fpb&#x`AU>V~e)`BM8MF8Q^7vQ}+fe8xSTfhBj%l>#M1TTw%^v1)}AKWs-uyfU$ zqW{bx&}{15YAK4Zk(872K6)$r9H7zwK62f?LFt>C;uUr1$Yx$q#pj2UeKjM)!~MDCh@FyV2pTimAAUu?AclA3b&%UP{|vZfw3*zt;LD8)i(vP z^bQNXqFr_iE-0@BJ}nc@dz`{aIpJ62^zN0;KhN(MDfz4h}5fmJum#k*l*cR zlu6yGH20=VMECT@uIeBaf*4&tmGu8$2#=h|;>5Zg!W(kb zzSBD=qqNwEQINCf!d9%2+*w|E{sk)89Zr`XL*EZ-yvDXnU58fDs1 z%jwB5Mh$Z8mfXx5hoqXKxmUN-e-cHL%S@cCrw-rFJCW<9@1_WcyS+fg&{2@MZFZW# zqwI6V-k^X(#8pJAvvKRt5i!13 zJDZhO%S91@d=EzBiVxn7bd%`6zvHwTH?uREmATMY9+hs%SayLEZx>0)qB*2F%2gqQ z70S2EVBYWlM}>p~T8GzFjp`+nb8T(M2b*A`_DkRJR}|sNF^oC5Z0ZT2A~$XnTQ#$U z)O~@)Akh+S(v_wc6eq)PtEWY?8grpjw@ncAzXdMa35tiq2M&H19bDt{DzDqZ)jBYm znCHVq7*VuZa({EMwa~R;t|2R7E3DLtUu(CNn41!4z;MSBoIOLBf7p~P&O^)OV{QO-+vRu@4>LzFt^_2kK*%(ZfL zHvCI&YSK+l5mo{0i8;1P_Y2NdM0dp9)1xdbYzi(}=&s|mzFV?XQGLrYjqvFb7W0Yu z>2?Sdt8)Vz5AwX*67|NJlJBsxSE|#Z-p%;-&x6auV5If~z~~=l*9*b12FDb6z?L8( z;KzohKpmrkj7wDK?tF-N++aT0(QdCUPq{hPt+M0J5)`x!H}cHIrR1fpw3N~_Gb^bt z3Exoh9@MaWv_e)|9&c;9Va*{_;G@HFz+_b|S#c zR!Fauy1DGuTVF~{b(`E}2DOEvG)RwHE7QIY!>?GzD5CU}8pG$atXmkBGV}pZBH9S(rtSNI9aoZq_gMF>Wm@4z7f z=j##=j7RESI@rZ$vek?y7oBd&XF^=BH{Qhr`&FWNTH)g`y`8I1WlxMnko}G9rT4m+)j|cy! z_dJvYZtSK#E9Gtz)W3JI3cd#rOlecJTy3E>u{oV_nS`Cgz}flkiW0*iU6oaZyyjG9 zNRiMlFHSyl`n1qAjq)K5lDvVgpHd*z(BK0+Y^1gKtr0sZ78V|}j%}swgkPHaBZbD- z=xX9bH}xCyUxUyhgeU>|T+APd>q)Gv8Vkly=(db*ZXNMN zLQz){wZU;^cRhatSH`W+lE!I{^Rw}b+Z$de|2grCDTnV4+bC`A>Rr&{!4uOkoZd~R ze(w-YRJ<8X%lMAqkFa(^D<5#dSR6dt@y zv)Lgx*@MEGY1V$hv^35G$1RP#E=eDhrHL*K!Lde&8)=#5W$UeVZxtK4wAHRG~& z^d1duqPudz659YH+3M;W<|x9$LJa(z&AK|)imh^3-`xzz!*vHFuJ;;MIl7zf{ok|t zOPY$LsUFhcFM@`#PqwZuuu>f@CL$rL4J&QB{jVatnC&C19=TT!k z=Z2;oJ-w4Z-4e2y%E{zUQJLPBbT-p&%*aQDm_>b66rnz?HOs1|wN=^)ig-Qwj2b=T zlh-jP-o}d@-R0)5Zy%}Yu@syit)bi-y7_CrUw;Uoa`OL@F)}ZOG}%BPkC2wR>&{xy z%0QWiVrovg9g9({(#fd=(m*}go*ZEybFsME|9T=o!zn7VT5T&u92}L_pcd@qnUzls zudPh#5Blih6Py+p?PL>|^O-13uq`0xi#X_$F?D~TjGBQiG)V`z>jo zlpkX_BVzL`>PQUue=F)fdivr~RqGGt#s*ywzq0sxm~s`-O=Uz(2wh>aGi;n@OSH3> zdMO2asV;3uZm_ajv7|Bx#OcV@=#i&N2#hOD8F0O9Agyr!Y9yz?H;egO(DMJ2$MBQv zwhq)}J;^9gK#U7>-A_JQb9D7zzZEi_^enPvbW~#}&@d-0kD)bPA0X5^v7e+kk5sw+ z22`H>Y2#CGu`1cHhkyU=+~Cfa|(!) zAa5-LA1jJpX2{*8q<$!J$8|S33qzik&*2Bh{>#(4Nx*J5X~z8$+ z53md6>Jf?^kve~>;a(;y`1Mz1K<4w9t{wXMfGO1-h5Ii$>Fntf+f1Fw+`zA<!IOmzN_i*+F}) zUrd^;2y#L8+?|{TfFKc8su_2|PUi9nBUP@f4n#YD(Fa<^K4xWb~msYj&Z5pBW ztk{C(f`g?W&}A!_?JjovPwK4+hDzpIY0dF9a?!gT!HEh+5ZkpSZm|U#r-6Y*qh1;} z7&MrtJFKN_*pVV|q`98mZqI!DHwU3f_s(|5Me1`Cr1@dQ6Dh{HaLw>xn(Z{-D;kkl z1grub>RD6%B0ObH^BoG_z=*=D+)UbKQd^5gd(orS10&Lf?1;xgZv9DKR^Cu^=OjW;J;w@>b`fz9-CAw3^*G#YDl4-G-D^oCRx|BhTh+7<8Qb3?~4skP*GQ>%iwinCtGjpkB*?K z7kBt@K%U21Rma_vfd9d#)%e@JQu*h<1O;;UG8s;}l*CU*BY5e&Vt)f+sx!TT!q(G( z>Q{Mq(YOd#%BS%X_EEK_&RnX5?8Z{;4_bqVxpqcgp*(=*Hl^O4saMAx52z$t zFg7}Wg)DtZD?B^W@?Hj`2tj#OSsOnBFBna^$If@kb_hu#?~>oCz_c6*%F&!Q=T8}n zZh#T1TpEaAbl}pfCcd|MQ_7E_R?Z(6i!CAVLT~5XOn7|OdGfL=2Y1`fLoC=%;!m~o zrd01Bu|Pyj9BlTXx3^=h*!>n;`u+oV0`DR{HafS)7w?Hk&MMAE)6kvuD2qZUD3YSe zr}4$NO`kRK%r?&F-0w=cf*?TV3&(-O^My|9ZGovp!U9$ii;pG=@J;5z^eX2KVdwOG6_Ac{j!0Gs|XV!;Z=j1EbD$C7$*>FP6+;^*-s8{#af!>xS+gXfm9v@J~6~vOhVt zVn`?57*VTZSr672OG1eqF2&N0o8F4tm5j&a3>P+Xf~zkBnxt7mt&Va9XX{99Fjd0Z zm&7kr8liM*%?O%a--ZfK_r-$IdQ82I9=yQTv%5On{}RyOiiQqYnk6b-KX(UDLeZ1~ zNQB*u#L}XSVh&k4iGHVjOM{|_BY;W-*n_+4i&!0tJD~fW;^g})dvHrWw^zJ5BHlu) zD&3?b4aN#`|2qdKvgz} zgel7rgc1V2T6%klyPVcp6=rA&`3@4qXYP;2Ki(#gLc!5PFpl7g6qlH|IZvaf*GA(B zGSlk~+8bN*`@CK^INchJ1uoH&K`?B$ywr@S8O1&KB#X(6Qps|vYr&YGdDqO65BOt2 zhsqn@pSYi)-|?QP`Q^T~2!$7xuD@UT@d-KA(dn?fz?3&;>_I_*h9u-SJU{xI2&Jk2 z9qH%7WeBbf2w9O=&ZSywQ^Uuw5tVb;lb_cmjvd1UFCM|aByIe`RXHGy(%&sYEn;pp zaYoa?8eC0@=hPK)Y(9^`88R+k+OcgV7%PfRJ!o(Id1#uw)saHaKBwOnz2)^bNw<14 z)j))ede^XZ{w6RAmAHT5tKsiAPqOV1Gqhyf?Y95sd4CnmtXQnZVxq%nKxuO`g&qux zUUplzUV0nLNiGgXZp5$mZ6XJu3;%T4t;0r0(iL(BU;xD>vtAzwYkkRj!s> z;3KUH%0B{fGyW`@5mARFvt-wa1`rBrFu8Cc9OBm?Ap`L{ayV4qvJWw z`}QuVkmF6EUnaBXd@6L9`Gn|1)m`#Y)Mvv~ zFO-&PAt>L|NiBSOD6LQv5u;gVr%@apf~`^X=A?(HogbZWSyZ}FK}pqQ9?O+0Cs7V_ zU9!P_dE>{eiE-NHayfe)+oA_{wu-4Da+CBLQ$w;J(uDP{+{v@(={|$zd@it?wWo*! z_b#6Q=)7#d;*5>VH8!@eQe4^A;fenl!4cUd|O72rWdhJ)Ja6out%F7 z+(})bp*ibPHc|kkB}Ug+mV_QO+^=J$6(cB(Ejtb(0AR1Y79D+3wXBf`a?&a$PD0A7 ztwUHbEp2p;igcIb>g4?69w`cClvSqo$aeK~BCh-z9W@wQF2l7#1fH;Jy%N^B^6Cdc zIs{J`tp_i3j-JNHFQo$3gidc{$iLO+F27MAJphTha2zH*CnHI@Z zA0~aqU}{A(p;Dtbf>9GUO62h2R{F{v<=fo~saNH6?+Jhe-*Rc37<0sh5I!)i?)+{~ z@A?0v_t5qtdgmI_RE##OiGJToNyPzp_+=LY=_=MvfZUaZiG6Nb7W=9B^J+!find{` z^pR%%=GM?Homu3Db?QyBiBEYyYwcu#@+ohq6&1r$jn7kvSwYmTisvCR`*XJKjq~eE zF{nbB`Ud`!paqDWZ%^0d^eG&c)F9P<@d(nPDNnC^pxw)3QhNf6-rkav-CYgwBgAIz zBWhxb6oF!82R8`L_bz7c?;0YfWqUc6U;e3mE7TZ0>IQwCF$#pxGRXao|C&zljjyiUd9sH1e z+!{D5+G1Gjq#Vsmk=7#BPr=D$3%V;~|6xp+k}ArqPs&x7HaI*#o!Q*H#I*jTO7-O<6l#nw-^SXf9sqnrL0NP4OLSv03k9jG8}7N**W@ zcy?FygnP)2iW`^gy`9IM=j)d}-oKx8{1y03rRsguB9uPf!WT_!+!yts*u0^=oe1Hc z!C)0K&UjEY#yBVS^u-s+d7nBVyeqrk zmvMRAFHJ1-i4C&90JSe6KsKM3M$eL;`3nE~XvygK(r356x*Ta)V~y^t`6bzj`4~t< z&RzD9d`TC0H+0vGgUU8m3e``wJMNSLmz>xN<~*)keX*5kYX=n!p5SKfq)2YgI-}gV zX7UzNCJ6R#!&C`AbJ?FG>oBH3n79B#Q~csA7;V;FE5BS?xHBy7A5Th3?D3dN*}X?7 zBl0~aLjaNu{_(*HY{bDEcMB;@xR*pPD&tI0QzO={2?ax4Xvv|`{_rKyFOhA3bX{|K za5%sd%*@kmA06iz>cs(~<$*@vxLtO@zFEb_*! zId+$YcQ%TFD-nI|wzT!s%kk5@I%f-vkQ(2{IQ_cst^eeNMfnjP1X*VeBdgJrYiO?R z-A{x~s3K-toAaa;(y5EXA0td3M@tstpfw(0{;==SebL4i!k6h0f}KC#vWI*XJSO!67WKS z)*J+hrboxyB zaVYkgO-zSO$pp~3LRrGyVl+V1Is*1}HhTPs(pjLI9hyF@3hB0qpT0&0T}+{?KO>Vc;ph-9-NR;RD;sTijMh1plGi zc6rZV^xr(Xe{$_L%oTZ z;qM3X0;*3SANicv=_{^|(lW@%7#;yv8RGM}M+vrt2(AvUNXE8-tlmhj3M9>z$sHNO z+PBb<1P;9HWjd82AEglYd>3EhM1Ug%MGHH}#QuF3yZ&a#<@(ATGh9%jm3CJ_`*OE8 zkbf{OmA6UaJd>08FZ$r6=F}@(L-U_Us%F5aq=bqZ%@zfHJ=`zyM&(4RxN{1K0V&fC zkU|D5S{euxXVH`E(eYD$EeUQK=6b2EwSvWC*i?gsFxuWHgN&Hmqu3kUc=@K0*qd#( zP#K97c0)Pqax+7TU1PSIRPZOQ9e%=ou;EK9QwAMZ9Xcf9Vq+O{HTPdUHk4WfQx3Q@ zSMEcx7&L8YH=S(`@#JFlbN8Y06O=UF37HO#s}(s~4OoN+Bm$RPJ>OCMGJz_MjpUG6iZ8-<8_V^YV>_n9dfu{3cW2sJ>j=O&%DTn_iNU2QJGz-P_fSbQ?+zs=+*w|Mn^+Begt zsO8+llo>nj6njVg-cEe48EzuJ4{Fw?vuZr)F|s-Hnqe}ycfSOxOJDAx4ql19xl0oA z1^qO5R%S;Zh-#GL8xK}-jfiQ=tYolTXK+n8Bq^@=N&WWVDy*FLzMK-_PB|&L*L(Gb zo5VeNyFBOqSVG9Jw(MN`=^Mvpm%sk{H*~k1@(-gmzE+h${-C;x4+qDO-%(jWsx72G z_)N8%e+kg_L4$?Hc(DoT7+d$&XrB1;T$KNW(Nl36l1IcLbbNutosdtcjZbL&`Btdj zbd_A@$D{DrN^*Bn>V2QM0ehNU3=Q{ z^jqo#xg!^XcZIbBAmnJi$nLk_wZoI$#A7#0Lw#`}q7z9ONnDcWTsDbKg788)6we$a zbd89&v*3#dW|NxYL|JpoNr)a0o1=3(_Dd+5sOe;POrFzt9broPEhWbr=A1k0&rmJr zkH&%m4M-1!$L8^#jqMCiw=kzHN{+Fy=Jn7aKz9B&jq}$7QF#i8hUSsa7Q6kyUw`@6 z4^W-+^r_Hb;f^C33eA>Dr$e`nk3Mk5E00rKs~Y6!;W+6crh>4}fM~!ES+tPlHh09# zjmY$0DBLWKKR9%CcE%u>_r!|DwoNNfwI~gQKC!Q&QZDl@*A>=vCQ@^36gwBLWz?7Q z_AK0zeDn*GF`34o2TGcOD467;#?ReS!3|AvBOAyg~_tGA0Yq9E41jECiW68G(`j36|^ z?%=LnU)aAtBI|%to8f%|jYxqjOzb9Dmi1HgqOlD8fU{gn4{;6ZG=&0CyPf%gOUW6- zQD3XABGR(#M#Rnk{aOF~&4@HCCvU!M|ESN2fY|HC5)Zl^2Ip5Bd>F--{5ax7FP23l zZr1>3At7Hv;>t^(d}4nkAL#Z|p-?^E6+t2K+@~PuyU+2P!&y5AOm|e+kv$qdIo`)&`XqmY;4dQlWD)BZh{dpF^mX!>``kkA$5&)uofHJRaK)&Z<_^S zQPPfUbxR)np+r(RLW#uiOzM>mjYHsQz_c6zb8~a(QToO~JL>aQ-`?H4ihoL?(-Rt| ztDGboo+t!QiG~odG|;KHM#7b*3AJt|IxbRE-B3mQ3qT2sL2@>5lxEOt#?Jkc21|0H zh8n_O8jm7|3OeZ$ zcsDxg1v!raOJko_mcXlM3_DsC&AE8l+#HINA#=>Ysc@@O8!#}BUP)=fgxWe{JPAo# zR)ACpYZIdH7M47upz2C%O>_Z=w5A16St=fuA9I2T!e9$eR%%A<2HK#(rFX%|{8HFT z9@UrfWV6p71-iB~cb5CYq2g|P)oW)poW5Fx`N2Cx6y9ZL<)vK*L?e?P4?ZexsYoBRdU9m;gpu_+iI`f#>=&HuZr29*k ze|TyS)1kyJM-F2bhOymRUz&pL+(yVxrvB4BNAo4Bxq0b6HQ=9 zz<7dxLR^f#>$0B*;FD*7A?)@34S81{ogdbZ-oiuQqpLcP;ueb;e^=`>YZ9LWasyMr z>=Q->lkUD%kG^M_dBgO{W-)zFEpAa5?UPd5e-sM{eQFb?LIZxpiMOx_@f9BUV|pyc zCmSHNrZ+e6I#{j@&{8snUa441H1?P9a1)9;3~lbCTt^Ghmd1szgva}G_K+*bBq&yD zfn%2yz0;Aq(4SMk?y!h4O_P6-em{4R#}9m6Rod$} z38K=9rfy!Y&Nt2N@zbgHvN(D=ik^9@TTFCp21ish@b*{|{Y%4_?|<@t{}Jb-sG4*C zA^Ez2i!khRo~AOGwfcp|nD9!-!QT%ta&~$RwawX$hAh~JjV{=QwY52hSx;1|T`|$2 z+iv4Cs{7@JcxFR$i4_UTmI;W<{H&W}n@^_4`~plIpl*wVBhRMFnlbdbuo%DM)^+up zn6ph2GVxiPji1igGteeJ3l5$Uaps+>DDcPkuYc>gE4S>gLn`t>N_ordrbS1Zp>3oL z=+4rjq9SSAZKQ%4muK+~^xWJLHI?Zp6919x{Sp7usX3`B+JkrR+3t(uU^Blq9S{&; z*P&yXZaG47z}(o3N$IoARpRx^v{_8(fc6?^=ky~N3u;ic4At$j3O=syg;u>-{?9wRJhJ(<|m1%-}Khayu! z@k34bWnXM5MWx==$r~|)$)-j^cDY;H$Brph1r9a`7PFfS*y+oq(&TFDZ#)2o8~lNs zyam7LRGDlWjMe<%`7!x|T*dtE&KO$+BiTqeOonoJLb$Exa|VCeCou7@8|!LeI_F1Y zR*UJW3MFUHu;A1Sj&t%}C4rS&ruU2(uv-#yP6X;jB|*=e_Vz4#IhoAJ510%g0i7i= zQQ3V&5dla~nKQ|E%M<|*$K zoUGOcL!GFReZKD%n;2>lk@Atq?VE%NG1~5}BU8!Q8qi8Ji~cC~=1Y^;7^hu%=6{{8 z6Zz5A;rF=oc2&$!Gw*HHx6#X{AycVtQpG{}9U1xFYggeovWRo6{KAxw8gls#<5mi1 zNtJ(7yEI^i7LQkS)ckolXoLQRi+X%a*Qo^1M$j3=ys6tLeAg`G|Ah6A*i)-x^XOzx zE}O$drUPwfS66iR+wgGJ|FFcEqh=xO^Az7{GKONG)PVBEAoJf#3?KiUs^cvE>E!Scy$nt5mKVvz}x`VbR zVb5Ul^>AJvOxiLD)7xXO_zE{v{_F+tjo8mCs&KQMM3YT|Au%th%MUf4Un~mGv|AQ4 ze{mpBrAgn^vK^g#!#n8XP3w zvFcsUHR#I7n-W+1@#eprLWMy8`_hKhI-ha?9!ck>?1j{{S4|kI3G{E%NS_#mawG^Q zuro%bi19CCtJ!}rfXgRM)-acs@^3?@m$3hJl)L^wF@v0O&xMAMW>^SYbd2H3e!B*_ zF+3b!ygsQr;~r|usoLk1NQ4%rvq87%y^7AYXaJU^sGh26^|JnKQ(2S&AagePejd9* z2md>ed5A`(dQs<(E1^|jx=}I=C!P?$vX28to{m%AiGYw0cP%CM(Bt-1zoYUWZrUj= z`s(5F^spG0t3#Wr3D@kvw!5iUBb(8Sn|JdpmNkjOh=v!JF zd43i;XV}K1#iD=wzYw7_yzV3#F$h&!&VWO& zsy2v#g=VuzSB>yA8>JP=eT2d90>DTTNcQ!np3*ij@IVN_A=1a)_Nk?r+767aGiV;f zfRc7EUm^EusE*0*Rc#nwGHa;=Pet6CPammhOjq83wr$o|C)5xc{)+bmyGfy7RAWw~dl3-y|)}YvEVDO1gY4;JAaX?ih4j z7iGjS)fz3)%78Zq=D#E;hu&L^GFrCS?8o~qnumJC^gPT=3WuV+ynNm$&y0)m>J#N> zz?auBWjMS)3#(%NtA58Fa*sHgfyK4gmrUP5)>T`3(hIO}K1rj~gwdMawl`9kTG_f0 z*U&p(m*%XdLiNpkEYE`i9C|^ks#I<@Xl+u%(w9`DY(+~;>lPFggf#Q)wwXvUhKZi# z6G|&NFn4vq&M1j@S5^4!!H)3vvsfCr*E?>Fam27F3sl`x08UBoWO}Rv(~HW=Ge*)9 za-_lX8E~wuEIlVnpF}bzYMlG=hxZF21!xgGb7w+U5YDVR1LKc(`qgS>Wfb*4G$NUH zI8P}OCXA*jSS}Gb6X-r3^jiP3q?)Lal=SiXSU0~QkFow}(uN!xA$4BH67sVl(jN$R z$(Q|ox${y8bBCA6U>`dAtK;N=Eq431pv;UnJH!FD<}2o!#K7z4zwwFo0!u*_szH=e zQlgK2Horf5q?ekWH28|P@q=6X`gs@Rnu^iD+^9+!Khu`7Yx?zXu28@80k;ioAM)i- zyH$1n?T`Ps#|m4ghGcK{#sxFh#j%HNc*e$N{jOm?m6eF>O-q`~xYw?Dym(RF(BAET z+#wdda_0`yl|lNr>WS3XXh=hYwi5zvM^Z=Bv>#byIkb6x@GO(zK2G{;1n1;_edb9R zd-E1Q31Xqc4zxZA>S2oqzv=C}Hz z5uG;{yU@L##uoux;D(rL#hm*4UH>bF{Ga(fLU+(ftfI-qw7&dx;_~7We+V%*%akO< zLXmA^QAuYguPSPl^Q)e}*K%Bta`7@AYkIidB4=t6O6r~hS#Z#_7#P*k5x-pE#Sg4w zyYB~fsfht6o?}>QQun}x3m1$>Y66L%IZs1F1Jbw+`nOTj(|f>x?zhn3R{g4VdgSyz zB~<0cp{GKOCD=XaC_c=xMjxZc@p|39ffsXc6c5M-RKXxt2woBc=2G)xIi^ z5I@p^>DtuFx^G+|`Fa{pYirNIihL76+PA9VyQc@OjDa>@cQ-Pv2!=bV!Jx&t&>1_b zF#F{Wb{J!{xU!<$76pUITN4$DVkJ_l&N4<@Q}|^%bAYcIMc-m-snXV|Z?dpicYv_%O$sCI~=tlhmV5auU2Rz#nR^ zJS0-PXf*ZJS!@NQ(Nlj_Qe5Cex}3#+ygn5tq#G~Os!8`W45OT@gucSO3xW0Ok1a=O@EH6rl-%o) z6b&5V?z3hkUQaVM(jxiH&a8pQ_rYHcf(EOW43>^;5Z8tl+l$2OW^eVB%~s5B%c}8G zU5k7i>Pe!sl|nBf3Z0g-8HqQg+CGwFVfio;*Dr}m+yG@a_ zE!_Sd%ZQ%wRBW+T4|>`q1>avQP2hYeK{^D-pnU+tnR)g!mW4T1ZCAGJ-!^gG0n!mw z^`kHR6}Kv(<74P;>f%|Sp10B-YZDJWAka?zP z%QaXzntOhWZER$c#V+~UFNg1yRK8^1J3n5Sjfu`>W#oHh!lT<$)$#>QZR*UCKOZPIlGDt+4_o1M3283;D=P1a z{ua_e_|4szC*3Ah`30+`jAh8Yq;jvVb}w;`wnO{)FnCxa%KZZs!|hu`$?e&h%w!#H zXCCuKEEJBdSbiRh^=Os9&Cu?U+Vhq|_Qb?^rr^YQMtfr;0So@|RC*nreZ0$}Q&3Kp zeu%vDW@eu4%5;Uo+Uoj8XWom4XvoRk!*;vyzkjeH(O>vzchebL3>Sirx|u9*7r17b z0g@+LPN1=)z?wjv;96VT1sao>c;?xQ@J0o{z`fgPe*Kbn?uhXlbmo!~+K6^sk3QoI z4QAbC@&@pep{}nTx|gdL$Vf@?)A&NK-A{-I11XgfY{tA38!IzjfMST-VvxRx4J|GB z^z`&iN_xANxcu#XfY>(?B9O)?XWbhE^Dg+I}c-vmR#Bw zr9WArM4uiEQzdHBUYhSml&kL`5vbi;Q~NhaqvR%& zi_$3|-QA%Q($XE$4V#V)eCuIm&U1lT2LIAC>iT$Y$ot>SVyF(xg z9X_-bzBzmX6;#T0Ra&+b{0r@|XLL74|MxHd#|MbJ^_7q1a&aqK8j_M~EollL@#!lG z4$k3xvJ{DOZ20&z7jrshmbbh6O9TMPu5xwli_Uoe^w*7ivj4Qx@7QuhE9jo(T0{=K zu!u1Dsl7MXU=-QCH8C-vv>~smdYNjo;>0x~{I&XWY`tu!{WlkqxG=3FIQ(KTAz6aU zyrzN&(N}B^)lbBWJ)NKE!8GhCmvp5mA`PVc%WMZYeeYF<$fqI@cTQY-^-S(cQRE0N z52juLOla9L&)h5vjYfyffPQ~u@VlQ90*%+jVbcUA)x?or5LPFaudWkEn$Kh0U5tN_ z8vWmo3O?dh%MxKK(Xx;v>CN@Gor`Ze#boOnvCwP#3;Iv*4WRu1Jnu(-atyZgC;qz( z{O|85Iv~L8{;B1nmzkNFw|0srBkV`^b`~Se1|?{a{U4!h$DTPjIMBqV;^e{KY#Z5=!_vn$j?TXj}wxJo@;L{&moDR*t_9^P6x21qlZj3HmwP~z6{vD(I zd(<6^656ja$81LJd&c_+o+lAIk=FZdYV)3*Aq)8_%Yq#@o7G{3te+p|53VNJ$j?fP z0@swBwllHt8S}Axt5d!vV7c+dAK(u_DHbQ?8g3q=BU;W$b2`WzfPcGAkbRL03Q*34H zPO}|Ku^Sj_LmHlEGmm73UmkDC^gcWyX9n5w;H8%EMSXeNK z$j`BGa$c2_le1_nWk1SiHCM1u@GDLS690AhVOQDa@@VAiZ!HAot;TVKz7g@GveYjIQ~52XRjx{!)!5wty?T)@*N}qd_cX})670y3c-YHRp0#fO#{pMI4B_4 zZ)xGQP7AF>0OPit-5h4S0RWm^*l-sJz|-~S_W!0mdoctm4h#k$35tb8K3@hELM`Tp|KzUTgK z*+w-2wrEce+76iHRe6XriD}7daBU{S&@?^)ZJb)xGZD3%JP7Qw2XZN5YmXc}^HA3t$o^d%K`m{YuDtI@qtt%c@{> zV*ZIPTOSO{f>K|7c-Of=?HVgtZt`*9aN64)1ncP2utu$fdK$a3`YrC=?GAcK1;oWY zL`?sHE?uxi4X4X$i>T>JQ%RKN<^nyqi3E6LxrJ#*=fQ$aWW~j1PvmY-vbXfc1Z>|b znKv@=fc*#RmeY;gLzj^iA!)ZKaO&?sJCKl)3Q~b3=6m$&YTdTckGlIFpP(itDv=rJ zS#q!Gjp&i~oYOOOD$$l}-oV`fUm){_A;{`90d66}1F$>?+w#-EUUX3ZBzzj%ZX@ z2=^Op3)w@-s)CfTFJWUAK^H`_Khv3s6|utBIL2*hB4 z8j-hr=?nOE@+hF4McW8%4t=WLAetJ|Y4;8X>gL!qNq9psU~{Mdvgvu2JWz(0HF!1h)r! znm2W8hO3=th zQqo%VzD!S!f#QF=r2p}q$G0GFP{x|xbaXbE zTVjOR9gj$oox57~TfC60-MlBFcGnr%?4|`*B*((ASIi(uohmii`oqh}Ud=y(Ok3I$ zoN6L;EV5NGQGj(97kC+b@5Yf=-r*hq3206ZHbFk-S-vxxuWL_P$baF4^n7jT9($DJ zxp&hB+h}p>*g#n7#SK{*`I^^J8Q=s`&54OG)VNk7X%jK z2t`B^6`|>BvRQt-{lR#hM@#!Ma4kRw-IHMFOL`_UuyN3j4q?MX;;i3y1yBCd4AA6C zkjr~vSHp1xT*wGcsTN#-JeGqT>t{<<#$nZ=Ohoh!^Q9~X(?&RWJ{-BzVO1qP?fv$R zXA=&H142<K&Y@_ijT!{v+uuX!oCOUj_M7N;v7Pz+vXeDC*g+!olOd5L%i z2O`fv^Y?L-fNcgn+t1R4C06tEo>b*}Mi#4c%{tpJn)D3?shSstmPR5(L`GY_`hlC6 z_HwP>yGi$;lqaD6RckbHX5v;8!toj!m(fGTF_G1PTj|UF0X{GJawZ>+J2nbWe)x?j zbN{(C*XAvvv3A=e=0F{V23woO6%49;iO^RST{777UWB z_5djD$mNDG7*xOf3@1*j;2WSF155jOamQ^&$%jY*=#Z6GeZh{T2-MxV0l0)M-D?L> zHNQC<1bnjM)~?{es9?WfOi0v6Vs`c7hXfw@?pk+8^AL7uF4s(C{*I9m6;3Dr%gUt} zzoPa=QIx^;ICb$?{6;iIe%Z}rk?#`UhDGH0F@{rZ`H?)A0O%#in4Qb#F)yOw8Zvvf?lO_^DK3t0GT|*7PX7921JdUr(Vl|i zJNSrK)O4mgtu!2A$8u2ibbtd941PA`=65#M0cAST`?Z^f;*EBW*Uo1e4LD|}#o%NL zu%H#DQ#sA~aB0nfg9VA9Yinz57QO_-gJ=? zCu@5_u`qf)WE3|T<*-pU-x+9e!)e7H5D^-uZ*T&NbXPVox2=Znvz+mE|Q;v;ewO77KLE9#0o`_2y; zNnGk$Lu>}r%Khr)%>R05KmOf8dVngfoGQ6o?>tVg;f72(nPN9F<2L-?mGA%0U-;!H z(#Buv(nbvVnN)}-V8;A%@ucY zTi8KX#A1AX9JJl6(v5JiI&|!{=QWJt)E}O*J)1q=cF+(rn2=Dy!8npcY-#%B12YYc zY@0H7Tdj%RxJmW4Ybv{%@%Ofv{Np{FBfOONuYu-;2>E*CeZb05E3U&oQ^hs>4>Z5E z40N4qj(p~`TT@1{OVTWA*+|hcq)_-;P0eu@`pRHOX|pQ~L~e+fVm3Qy)yIvW{3-)j z17}`}v4_#@f`@zS43E?LW9)1iVNGnNhk_C-cRtL~Vdv=g#QF=reJs3{fd3zE`#)dN zKaX(@jv+-F=G7B{`a9huGmM;{@8QB)`gwY3Jt5=wK_L~H`shJkWQdWH6~u{56_ zyo&$$(b%8(gKd8en0$GS?{%@|g1f1>WLIO!h@1s99|(h-#eSw(jLDQmQ)*huC2!!k zHMI*kCONZ%gCwMcp^$~7mrX&xN3m%f8FIJlXhfQ>4OKRTic@T(?#P<$T z@^S4c@?#rFMurmwP^2O2JCey}nTQ zVP4X>c0JAX@C)-3+J%I)(1F!ET;S|0soa`uTB{lmj`R%u@;j>~rG7AGJMm(GpQ^WO zTp831Fe`H5qLf7LSQQ5Dh~iKNr?N`#NGE5yXY`8^k{aEWs6<^yY$PGpIIR3Dku=o+ z@d+s>?xs^=L0Q_CwvL9zgOt|BHY{vckqsHRoNBBbY9g$&sK?+pSiVM5dg3%r|0*BU z=1qV<+=Bd+KtQ^BY-}uyqraVP1C~3|zXC{}$^Vz2q{Py*AXk{$`G`aJZX|N|A!VE_ zdw!fDtyU_P1 z-T!vPj2xtPN}bFd9*E#Ir;&;j6i9VcNKrh8^m#$!8uYJ)clN@T!dth3YuOBNc{;5} zNSM5>UkIqN^g5wL*9c?5&Egkmdi;+A7Qnh4+ayw-+s$C$g{Ty z{Z#6epw6erI(kv_0YUt?E(fL=8_SrNT8?@Y&@oYB#Y7trnC1zvkg#~i1SuJrZKj2c zj0})OWt#&y<|6?C6%Ed0xj9XVePsBTQBM4m{^@od$s;iI>N85~G(sBKkqr ztxRgN1c!No-A>(BLZzCyAmw^Di6kQsWvliwrgW2QOd*S%a$W}D>9u-wCDFO;Qrs<{eOZd|Bf4I7vi9MulNO4 z?!Z`bB0Dtdz4m@rs=)E)m*2O`o$}`0cSuCHp>#(A+2?^BO*6A)cit7pn8ZTv__GPb z_ zRY+4Mi^Pu6;t5IK(!qg7eEjX0D3e#!=#EePALWo9GzO_!uQZfMoNTWg6AoKIT@3yT z%a6+_fbrC@9=LOgi(^gX8f);G_YbgNtlr7U@~vtvwL#Yn6PfE~-* z+e=W+oaue%h0C3X|L1W8aulLDvs4o+P|qLYO+nq$jVVO@-sl_Mi^?VYD9x3W8?QE~z9+pc?t zOR`!BrQ|){|H=JrouWoLCqnOca&Exv-~A>7NnGED1K= z|8lrNfm+w~=TlQXNau~CWu>`TQ-ck3GQUn=bh_P{QlI`)0@dXN%lg`TMhkVvg@`YF z2yMj3s{y?wHp?e5cHAvmWB6$K%J8esC zJ8rgi&v~fFehO~uU}g0My`N>em@Rt)5mTKMg90QPYR-n(M}J7*a1QOjrJ(~UN!1}q zoS`NaR972@LHixcyYC@7IIg)Y;Nlw_6o+lBouTi+_w>PRu?0^X_5^!uHNj|qaMIJK zGm)l41i2Z{s*<2O4d^h+Oq^~4rj{T#*AbTxgE%@pgdy}a+qzhU^g!gMI~qC6stgs) zTy>0TO)DCkYIrCyCy0omi}};S!4GZ%+J=i6Vs=Wn!m)M@*nO!avf%` zG<-_W7Q-N&PY#tAI!e4NqD1I=eB0HBnWb82tKvN_ovQInXhq10dl_l0VlEB*y_z#x z28Tooar_Ld5H!T+(2S3YVSvZO$6@R`vRiq_{UR(FAB8>kPx)8B-c|4{{1peIlIeW3ZioIfdvndt#7ef=|tA}Y$gnzAe507sF; z5{BYpSjg?Uirk*xm)Uwo5kcY0d{zv5bIp5m?nI8mN2y)|+g(FGv{_AG-T$M!^v~$m zKXv59?IswYq`-KI5mvXPeJq)Tra-j}0Q|3=Yvd#cx-`dVcUMNGC1^wN60Yu26)8&S zYxz^z+1Sp*E|2yw%ME0+@R`dzXF*qXACnc5C`UKA9FSjmmr&pWh+faCb8!u6Ac`C} z8CErH%fdSj$1Kc93i!Dw?IU=}5cIJLX-l(b-Ker<$&N3k$pg&RB;4;}>T5vUkfz20 zqxn+bt%dI@i2dD+(d05hW=(9)Gte1tx>&lw^yOAfO-=UIhrw48TpYq7S)7@jB{37X z$HC{Not+{l^%h&FKyVAq4cVH})~QWm^ME=p^qzYG$8VQobpj_-#sRoM+yP|Ajg8hK zY`$UHz4$o%cEn>mP6U`xIZ0Y-2$zWD!SvNMiyL~%3{3R+9TSj&-_ls#XIIX_sAZ

J?hi$;(wO1iIefC_+rJ)Fk64lXo<0$iv1wc6Xb&{h>C_}Je4T^E~?$6X~0 znTl2Q^kx@F?i&z!`d<7_6|Z}E%+G> zfz{P9m3J@7Q2|FP0^men(N1*POk83d`Qh&Ej|YZULKpz*_v5&jUA;7n#Z%HYQ`NZ& z3YgFFvGTOqciiQI39v=!xb#Z-8j=}ZQ;edvoxUM26-X1wDDwEWbym56*w;b;D12PZ zPj|FTxtZ9YAC5z@^v82z;YL!CY8f16uzx~RGqG6cU-6D`;)+;&=JS$vqf+siMIDj9 zpSOtQyvx+A8t-_2VZ=ij-pepe*Ai6rOhN6nus5$9y*s_ACLYsYS^!Awgof%=M!x7P z)3n0oyN2z!Kwc0Kh0=~Vee_RmKOdg2+ZeL>iKHw;xVSlW9wbWoX+m%a4x8uDg9HKq zo867?jag~PMzMwTb%B~2-Y@6}H)z_PNWX-iA=f0Kz)$7nw9;vL16ab7V>Gba)6C1v zfwNNEpIT~6PMEs7IvXUn+w_{kAprpu-_b-2Psh$3NZulN_dqozGSX5(l5ui$ID1xC zZSoo3LDx_w>Jos%UXrCUs$oUFIrIxi6L;@6V@q%Affe`#rj=-VljCA_$p4eEiQruO zWVn1pp~sYXFi}(8($-wPqBakW5!N98HWjvzXEuH>K zChcHyH*6*VMph0`A@TAd3Ku-5OMXOQxEiXW@XzI|%EAsRxOBv*Y7Dnd!A87bB7~7H zna))i_3oUQW!~I-I`kFcl%izkTTJbor3`f06QTIOt^PFliCb+#8rg#rs5Q|)MtUbz zn=%h1SdVt?Q-<$pT;x;Zx-{R=*06$0JF8Ty)u!D)IpXDpth z`y#Tl>^9wlWHV(woCi?H;t^obfvE_GAkhEt2=irBBtQze`Rwad3xzX1>`@OFcl=MN zUnZxdaF5Z8aGR9QtwFyV@~R&VgtnO}!sMCb(HT$qt83DB&BOj204}p@`}yI4Ba;n{ z6`_Zk70IgWXNqGo-rvN%CjFPt9)aqPg5K6p@flperhtTsZX3FFiYa%s>BsS}HY|iU zdI)4A#)O-N_#HRzc}h@v1Mt-X<9{@5`^(?_B}6&6K$!U58nZ#X*?85{y5p#DpOD_D zdpFd%92%J|=@rCiR;Sl3H$c{6+n|a4f1*?WD=>*8%AKuFP6VyHNGk2TVeOoM#a`0- z-SYBsLQ>L${nuHn3k9RzFybUu0=x9S42Bvrt{i$q*yT=ox#+@>?Xuz=j9Ce6S7wr4 zD&wC&jap52CJxOH3t2C6QT*E8O0E|Fs0#2*Xs$8gh${~a^qI)91nMS5spC>=GK`BG zEPBAWcvxrffp$4A{^UQ71k;+UtE&uO)U$iWkVet$dhrJjvVMJ7zEfOI4t=o|h7Xj# zE0z84jK1NwVPR=!M0<&Far!kap_RxpO|wVD($-xQrYGCCHirhgNuVVwtV!{=halor zM7_{!;S3!M3peR*HvNBu!VCwbG>g=mzA;2XNuqIHYU)vQWossP;tLX@zkXah`ur;lI3(0kz!s6oM>meSy9XQl}2ag9G3ynI@rUlq3r0)(x-6+~`*`;v?I%k&ptBjIj=7qD_Y1dBifTL)=mw`RT`vF zaSyz4mO7K@=GBy|cmK7Vb-~pyN+*s*$Tg6p_fEa?8TM0)w^*L*l5XEo!k1*97wV}y zf91^mdzyTqL?-udEEZFnWLZs@ZnWCWDHbVY<*L?3hMM~PK4zx$l7{4TT=IQ;f3GP>4h zQZ4`)!wAKqO%1iF?yEyf_EE-+)cNOhtNH`mo!$L*PfG%+*6d!Lc~n&)BsDgE+RLs< znB-9)=Z!m&%F&6L)B)}3Aye}y(m13d2d|AC=^r^&=QvVZ%D+>D2O_K&b&2C%gl|MXjY2AcT_H& za!m;1)T^`0JY3`EPm_|+KnR14!D;M~V}O^A4DA*>laO;8dxnLD5h#AVa9HqV$;R^z zpL4gT8uUF0K>v_n5xbQ2hR!&ea3MWn^=v|Gw7a5eIb8_3%ED5Z=})xay&8Pp|xCGenpcbg&RoiDCU(@lWgIL84^OC3Qmj zrN#Z-a-T8wtxcF6zkm$1#DE*>?&UR~s02MSR|@j^D%p6cDuIoDc1>G-ef@r$QLjV{ zG7$=l^hjcSggcDNotxyhzM|xeDAagpU`Hm;*dBTzbUz5`DVSi_Tf#GqwN(0xO~o#w zIt1pMZJ#@@6#zv-$KX@7mK=Th zEX^Ry@OYB`H$z<1xB_pLvZ9a3GP^_ib9P1PcH)@k(AYYnE$_x3*O zJI7@cKEHDANJoj;Mo*yVZc$PWqUFnxP94yxagLXlHzwlyJ3Rg?f_VnQZNR(@`!;)- z--bnff4fUeTDm!dwiLTuKRSupw}o-5Q+$G*9!UEDm36({Rt=8>f~U24_>JFSPxg0Wo3E8D229ZlY!jUuzo z8ySg6i0nIe+DlB#F|R+YB#3)5>*+H?w+;f6*F8Fx|6ga6@PjkDy$ydd>x6*0*TuX& zZIaTR){;jr{o85_1eS=N&a3x8J{*iCzM0tKwU*dOZCt{+ox&$6igT}WQeO7wI8y6~o~y{Y<*8G|zb861=3E*8>`k zYaJN-@(ugz-Wm-Ru+c4#2e_3NBeSbT>xo-r);ju7aBMwR5(Mhjx$0u!Z zl6T%W9mj+m;zf1+D%@jpWmMEIz3StOA1E|4I}TbSHW2ZQwe0;GW+*zy==3^1fFS%O z6;1um#J-{L_>yeDwE}%j?adEW4-XH&Rx4ljPrqzBr+K0?MTyAY z-yi8HdlI+}bJ-Uow;J@K3ko*FFkzKlj!t9Osoc82QYTfN7eWEOM{T^__*;7QKOXL5 z(UYf6pRW2iTCBO%dhd53NuTS)7uH1_9mO~xPS4Evh&X)|s;+)_Robf~P=sw&e=V9k zx+q0(^fK;yPOqpyGOJS`{$)haY585`hxxR^Lf-k!s#7$d!vdH!D9O1DNv(nrzo=$C zK#bUEGkN*eM5q>=*mQe!dZjv|G{bzdNl#<@{y*LEKks&2sgQ}Y^xW+d>b&^@(ok#E z*>C|`&G!j(rfKp@_Ia&YX-|XtBJ3V@g}ts6!($V4CwQP`I#BmCqU${U#T2)Z7fNK4 zH?OeB9>d4SZ{mHRu0F`SSc+rEsvW{rsBajw7SGSkdD2EYu+5jn#>VQ`Fk5A$q%h2% z2PZ$20k|Gv2^`cCQ6=-?*&nNYcio%ppXd0mgstt@;mzro7Q4r?hC}(R7zdu?$9J0b zi^2RLwW_QT$~#tdN@OkUT{(f%GyY3D;{%hce66AGO6Eu$2s!^zI*?P9tdO_xx_^<> zV^M;!$w{&0XA|iL&vn-sFd-i#<1=FLkHMO||8fl4~`$#JA=J1#(}=U8@e+ zE=SFr7vN|xg8gLPHyLhoCp;@CH8(ShHeGLpe>da8{F-l zp()#1UtjkO1yvMktwxuNtLy94unxUd3gxi5^mfU~8Weu0bIn_KHWDF2lTpvZ~nA6VoHroOsHSdFQVRpC*r8--uckXZvYR@zbw4dj5 zv>&$L^RdvXF?)> z6_IK-4tw{G$UHef81Z+X$fM(;F+EpNhaw$+<&YKBPzlexEgsk6dw%!s=L8z;xW=AN zPEJlGgjbHFZ3&UPckd#5A#1dJ1OgH{fd3OqvBhF)#4d}Ufwivj6x@gI1h2mKwWE!M zS_Cg9qLuqAc`>1h2V#3_gq5`m-ZIS;22a=$^!e|+$M>*{DT}&(>3ozl+^lOTXBI7b zkuB=X1Y;czc)yRB6p5%^g3JBrMx*in%jbMAcvPH1a>S#?Bf^wVzI}3WGnr;^OG``3 z=>vqN5*bI;*lrCR0qHZnkx)VNw$6BRLlYApAi@VsC&R7d==s?5G^^a%)0>?#ZC56` zv-O&L0-NkhpLHayq!s>;AoX{rFG~Lv#X{<1JU}wrnO{BNHc2db8rt71%zLM4i@il= zKT2jgF1pHX%I*tk^loD=Nw{(u*=l0&7!@8l-6n?8;BAPSBb%8S8?)>qRez{jAj8rz zk0h}zv<;?BJk$~Y`=JB|fh!T8bM;v(sAY*KDH+Byg|ts645X9Ep&7;byvE*gATomd z7}20)1qm(Bl?cXeSSXGi7}`~HSTK+D@b+$T&Yt#AUyara?t%Hnu3vq0$T1$`8ThqRGgGd9TbvEloRnKzTc!h+T~$f) z$zTG+G71VLIyyRTgnj{{Y%DXd48(ME>6;BGJCA&OMdj^9&Rq>Db#}ey%w!m^`3UUb z;z?is;(HzA4~SUFte!>zrkXq^BV%b!T>UUK9{mJwosyq&!_ZLt80Zsz4&8`*5#dTN zaa>rS$WnuU5ky-YwN;QG8V&9Wg-t!9N^@vU@1Q*rt{fS`Jy*81wJX#06{_=@dssth ztd!&9?xJkwixVU@(Y?e6SuB$xdA569Gv)eWiR6FWv@f7vN_X_lx7U5f#f*E{=6(L6 zVs_``$sYCH-CfiD-HrGNJq@bnE&yTppr>-Ugc`MkhFkuQwz=48-6g+lEmg04?a)6M zcJ7{&TMP8;dfJ04nB~f;ZY%bVAgf;;fi<0|suzvq(I)S?Wv4&*aC1bEACYHE<~z6D z9%0v%C;%luaCu2$P68WLTz)5=lm$0zeFJAlY2P_rH#*c&{dH7!(Teyb-Gxp(}I zyj%oj2N`+N#e=LCynLG3$ApRRFIWHDpF8*@Y`{C$8Gg8SbqVQCf*s`6l?wyK+0~HU0soF&4kXIpKZE|2Q8a&zYQK}=eD-CesfhltO^#tH~d^oufI5xkr=x=9H-XO z#`nA>9yOqJo>tmV*tW{W<(`_wQT>Zw8odg>ZA3`OVv3_aVWSlli?0Dfq&fa67xuFI zntFeC`RIQ?%Exn%R8$*V-ERU_KhH&{jbcn|U>S#)km5Vq#(pn$ThR@Y`2GePgM;={kZT$t|_LlcoxL^Ny-pBm5+~m+z{9bbB;h z99fVaSy9&9>QBCoYOZ-eQ`cMcZ)bbh7sEj?Os)-jG z?Xf@6=1K1T@cHM{+?5p8{RmtzDUY4p$3~7;lbd+vm+#J-g1FU6^*a(o%2L?KXg))W z>5wL>iUy*oS*q8sxECc|1XERpF75b)g)QBs>{g(t56wm$ODZ#aa^r5~Bj%E;GszdF z3Or8#w*h_cJ7{xg_z0id2(6T5O)25xYkRJ|Do39= z&!)7&`vTfx_O`35V`4Hg`yUoU^|N1;fO*;5{+Tt9>s0v%$C2XNdjP2s4I(mjY^13@ z7BMEpawo?F4U1P)PBy=JD?NO87SXU6k=|dGzRu0fJsJT>6DB)ym*HUDjmF-%hNy;y zhN{!nIEHY4i7+N4()8$ICFmQ=BgN}f;(xp!Bw;xE@~&*Rmel2b4GIDhs*q2VPWhH1 z_bnNW3a&xPe!dwDFa2?yee3^)v%hk>?|x?q(bnCQEVzOurhioVl2DLTW2;VK)f|XRo-pxE8<2M3^YN?;<*YEG$Aa%JypZH>r>Wfqhi{kU(3p zdXZG(cYY(gXU-`qLl?o3i}nqAr`-OEzI6(nl&ty&@)@XQ37%E2@2scXX+L%70shLj zg1*81*WElF@nv*;aw2zHQZz;-B2XjmT2(t%ZD2>bVX0|W&#?bc>I zlvjEU(!HFqHO?gooDzrZ9z)Naotjwycsq}tczZSUm33t%ky=knH;dY3K}F)DM|ph% zxj7o9Hr4Bz>0i5W28uxnKK^1hrsTsj2kwkZIG^SU%T+iI&F^ci$~hlC+!$O-vyP2` z;uW(JxzE2)I2mCxmsz8^vp(N4G!%L6Ls^+1@p=wyBrBMX3mp+*wv)MV;evKp{C~U2 zzkf%!tRgBVUUQ&24-iYvPE;oC_V$Ums;Cpkmq)%tB>|epdUZD-u)#ZaO+i@p+Gb)R zX6EJ>`mi^S3=jVPo4)4@kP}OUAy{5^=?w+-B8%x@p@bVkY(LEme|%{=T!H4XUDk%Y z`$pNg#T9pV_iJR7@+vCiaC>Cb)t&C!@g62OaEdu2cmCnE)kMaH`@f@%^aVH^S~v>q z9s`!_D%A0)0j>`xcV6YE-uig-zb@ij1pS1zfEUm@O4GWYiCTlRI1PuoO%NW%r9cP`P^T zS~#B=iHCQTtXV;JFbG&9sB;;URD3hlfqT&+;c?o23X@U7BKOyVXd~xlUPo1;WVdeC zF}p>-c+1YXx@svVy+6&q7(Y{WFCyMvN_2m?%4KJKVob!z_}2>)Bx1(K3vv`KxOGa> zfO35M@5{=EZ>qRQ`;a@H;Socs2V?51R)3vcMSXpR_U2u+C0WZxjIiSU?SK`t6|G+- z%@0BiV}T%@u9tux&F&e|U(Vjcw^$E*O>4c&MLC2C?KWRcEO>_T7}@A5(`(?-z_#o} zI}eW>@v1JFHmB(?m^p3QMr^`wZn@ir@+hXcT^k{CULyQ#>^SM@Ou=`?h|+5A z4pYE|0Ghp&6gOU-)2#V+_ZuSHE)W=mUf-HbJb{bHK~FOb6S1^yEleK@uQdFGCXe8D zzT_%&x(Y!dSn>q`LWwDfrXLNz$(ZCBdpbD=xY2Wa7LxD^bi!g_5F?Xdb#0>>P_hwB zc)45r2I}I=yGOM&Dm_z(re@|n$5w_j!!vE0UncGWhORk| zM0(a(^%JSA^Is_49YBt#Py4m4i`XY;Al#0N_Q<+=$Fg?bL`^2Sl+_5A!e*i60X#Nr*+ZLe!3 zA}?%uU)sW&IJ_``k=fnDKHAAxjBYbAZb1>i*jOV*W^U$*W_@7d88e@KT$52!ewR0R z<>wD}EV*F*J+?dLyoe^#8nkq)!^_+|MT=DZL?5?TjPoS#n@C@SDlsFD00pV>t z*GKjP<}Bp59qK;r6HP7fA?EHqZ_iyFAV4Uhu$Xvm!#Jpql23Wkn@2GCD6xNhbX2Cm zIc)nm`E0R;t)2*7V*i@fZF+>OXNq)2>=DH`M*H&-qASl+R_+=0+M3W|rILKQ)fA;i5pW!6c`*6Q~S0g?%^*?pR~rHpARTm4rTadPL{ z;Yds=roMD&OlcE){QN1)8H>5s+EcP@@5kZ!k+eHV&SSpRlfOozO9(S z`ra%>Ehh7aBQdZgur(}9vQN-KRr>Xb0_FnD_4BFR|TthXBjb$y<=;P8pFzop+>4LDR zr5E4Lfd5J$Nxg)mHLT}&R<*DB>1Y}D>^=-z5a12b*Ed*RcD+Vco#SjG*<0j6#=toH z0CEbBJh8PfBHwrKoN|?1#xp-S$sUA{*825D-0?qBK!He%EXCOE1*9+e5smk>}~CY|xw$;Yw~ z$8oe?s=~`2u@yJItZ&oaHQB5rsA&Yi;Oe?{6{bz>{ME-=E2vIe9!jHPzBHxvtaDx5 zf1T&AeBpm)a>i1|)w}@A25lW-R^N5~g?H@h*M|;U zwENb*xhv)b(KUj4kwRuxI+7LfH$hL)j<2LE13K?w zS~=_4jQvm1@IE&b1CrGK+NQpSm_ao!iONF&(iAmsVj2pEW0#kAV70I!TvfqL^(QjW zxM60#tT`B_llV1Wo*>-^m~Z{r)8OJB^W)_$s8NYYFGv@U5qoHm#)=T@8&umbm8GKY z6>CdLNvRgk`NMb2S;)?FTCcxtUnv?-_cR&Uo#rvqG^u7>r*ymxmp|r128UKEZqSG< zlA$Gy%^%ttNTYa#Y%GWO+;)z!qd|XmCS%?r&w1NIUQBZO{;QkJjJHiT2KN@aO`nUF#|GM0gtmFXZOQYoR`(D_4u4slt z6$*exkj@`b(~l{>9>~n><0E(reQyN~Zxycf#;MR`yj%%%vfGa_UFvRHInh*PwpB74 z$H0-U`N$IckpM-yzpJqq2~n0mIeA6n4D}1VcX~>y`ODZ}6@C!d|L;fIZs6e<3eWZa z@LC|*{pi)NZGBXPM!n(qjt^W>6ueg3VLB&{zZxkMv#M-lHk49B4cgv7+_xc+GH*Xp z9euew*)F{v>M9&0J8?E4Dk0^zKcK9yU8~nCXPZ2Y9{G2Q!fqoZ9fY1S@BXm<13>!C z?vW0iHMjk=bg%kJH6qmDW-gC~gY`Q1ZaT#Re-8EfWk))ra_C1AaqRe%!#OIzQ}j4V z!D}N=OF2C*nl*r1ouY5u_zMs7)`~=GQUgm+Vv>^R zpclk7)%=j2{6P1L7U8dL2-JeKprLE5M69o!q#GEG-!O51+aq1M4os-Y4*T>2T1N)5 znVc=Rs>%7#P(D&l%Nk&C=V8V3$lJk{XH8{qPT4+7DPths&N2JA#u{JpZSDymq(Rk3 zR+0j%##hqsyP!Jcn^5L&!}KpMSPgYjg-tttATbCvA=7_)CvDNd#4GQvbex*~!W=>c ziS&lllB&wtZzbG$Z?{uS8$)6LKb_t`Yqx^OfPIiiSWin}lR=MlbLka|U;l)l+1R}j zKnJ{v7t8Fs1qdl_Uw{=N*3oHc#@(hQKMtqQZ=K9p5DbA@MjTd1+sz1_I_9>@e zHP}@eM0sSZ5YKHtPl5JMS)%tK{4-Xx=CGe==#xraFQbHqth(5l9HI&B-Os;g7hPHGDP%Qp)9 z=0r*`UM8N4&igx)Mmxyj2pDCZQOq}1^Yily<6byqBwsgLXuskyf0m9(e`y&fQn1ep zSd6^z=@|;767!K0H5$MWsB2xLO)JY{#K+&!vRbSCQhUE#c07d_yeX28=Iqr8XUJcq zQHe#pfQc#5vtRCGe_o0sakvuDSV{-YVqys)5rK#-nbRyb=d8<~&{~N& z(YEe?N?o~h2~XdE#vsEv!$eM{kuHr?KK$At7~>y4<1b$t`sfM@g#zNj1Q;NC&Beje z$qrEO78JyOBdj5T?cFQYsz3HJ6%yxodU}X91hUwnOH2~me=c5X!@IBL%v^d?@7aXy zLW=7V>}2?DJUdzUTm@5AGq_;zSS*DTdKp$P-rL(KGy51a?fCjh5z4ekfO5!mk>&dS z)z#OJ-lf`pShBynNR{W$>^LosX|gV3f0V8+FjwI5J-a4;KcE)5+3iG$F&*t=vk5ANa)~oyG>1G{2 zaeN0!klBp^S0MF|Gmd`HbU?XzLRp`Nhdp9>zxldc177dQeijU+}+de_n4%F%K6Db4S?_*`k(!M z14FqxsoSB@fHbSfUb@CLe(^z{g49^0r6hNJD%s-4S9$v!+b6bLne3~gNraV5b->$Q z!mh>b6W81()aze>bm?uTC{FFF=tPJBoE_E`d-`~n$jP<#uv?j+QT${U`XLvo1^pbZ zVWP&|pVX-RxcNnh3*TaGwd$qMB0{p7aOH{YU=n}Pyq#epHL{WVd0)^fYv64Zt$A{w zC<1>1$fd$+lBJSC1tvwK`WVm*AIK$gb~qXh+-9=2A-3^$_YDz2VwdeHo~B;}9WBBn zVGF!u)&lDTHU(=6Wcf*9oV00b>vf)%hrEM7vtJ3MPbRVK=z8%vEVRfNg}r5n|G7ka9hz@DrPutprwzS#ee?8x?n;1?GUEOqO%Fn*td$R z`D|~|wDc3{Zb~9Lvc}QKm67lh+84U^)AJo^Hl}i`x1R&vq93T0#<#BLCRm9ygqQLd zHM*Qx;4h&zGdFt`F38eNHBcJf;xzMkj(1y*zjVLlo_hW2y|j%Y&V5$=Wde5pOE{4f zEaOmx#$hi$z4A`T)YP>1?FmX7Sca&4_2mz*rG$UQdbk+UOl*ADz|d*pt6VPYNEyYv zPRoGW*{UkX>z$q(Lbe#BImZ0j=5&@T{dMLpF_P-OZo6YAwc{(a_!iutTd0joK_mAj zWH%2~*dtyZeR&j9kw>McYG$;;NM^v4Gptg+mrM|Td#(h!$!!`f7a#DXQCjtj;N*6D zXolbU^;n6mnnwtUiPhM4HI*%a*k9ysr@@c^y+rUq(|mskH}SSbvuj?r^Vpc*8e5T< z`N&svPv&z>C7#^4uoOpw`sLcM@%{v9K1Fp~F$R*v@EUdBR6wQDU%vK=OgblTTAs|H zI(xWQt=vm1{8by164bAo?AVYT&K@>Jbv8bA`+;Jup8n*lSZYU{GLJ7efi=JG`y%ne z)Xa=~V<>mL(YK{qCb%0{#a1nrVLH&1oydbevqRFsa+~%<}bM8$;un}Sp^`Jj;1qhbn z>iO7-FO9X0iroHqc(x*#PyXp%oNi}Rp`}o^$G!B&dC#A>&J$5y4yu(1l0cg{&Z@NZ zh3*|~t$lx!@P{WlDG^tZwp*h);M!T4%}6;=#0>5EH0C>|1G7s}Nu;D-+)7G3`0j$| z`vVS9g%+(Z%`3jYo0iI4ye{-unV>(P+PoX8HT8(i?Ci&_OTl3DFuNxQ zmfbLtZi@(nLY|q*^c>cBHM*L zX_*-q8WaKF{Nd{&xelqWox{%F5FQF9Hz;;)GwBwlgz&-OM(vdZ*Fr6n?A?x$KZ86F zt-2kB95By-HaypDYh!@8gkGz+-$c5Y>>#Wk zH6|qfH9WV#tO`dJwpWa)>NbvkfejHHagOt;Emlizo;#`fMZ8*)dBhg$mwgroS9%J@ zLr_x33`+U?dfN`0qd2&M=oKjV!^K8End1p!+@vcJ4s%kSa$}>kv=sIqo^y0`)Ka#` zj8thQDg0Ok9i4e{Y<#?WS%j0F<`^9U2+w$nK{I#)Ks_F9``&q)onZwj-|C(k|IvrF zWs9fvO%@7nw7Bx{SBwB4Th*S&4MO)Nt3xmh1t#?;&H;ytFgTGt(^x_C}>~21xUGlPwZQ2Bh1GmX7D5Gc=PdRswV^p zbybZe+mhKjJ;grGrk4vWo2`iE%Iym|8{S{tORiab?cH=&fAYr5_f7?S$(af3D>AA) z^hT+_{@7{!>1Snc0l-1GHt*PS1RzpOR$%E|bXVq&FXb^;B2v;hp8>JFnunO`%B6=R z!R)F^kJRbYr#xM&;ik$XJ^mW$-kZR6O8l;!0$67o;E}Ko_F1L!QN%m5+(r|{_i42? z*|T3vH!6dS#pWW`YSu^d>CG+c-Y>8$Y0FgUv;TDxiI#g#>Q1VeQ+SyU3oE0c<$H2- zi%-S-Y7-Z4iZ-FYTj2;pJ3UHsc!e|d#!jMjK_fX+a3SlGuPS^I03Cia*% z^&ja=_fpo2>(U+(Omm+QUTVuKZ1&JT>vw=Rct%zn?Q^~i(O#b<1@f~;4aH1GjW%bj zfZ`JCbqY7FtvwT0*ZtKTADmScx$&knW;_x*M0PZl(rH5B&ijbbNyZ;K&p*PTkQLAb zb!j_Mp{)20o1dvDM^7Ch0l}m3es>Jb9wp6p#)OjVAwmnxLMti$c63_Qbsz|Qx0N2g z5j6aeX7(#Y6rcw#wsV9Nemf9N_4E+3UYzb;T2&+KDl*<`^bHLS%``5-r|jKpo1&T$ zFIK((L`k)9YNNGN`}1(vav}0swRXy0wdI*(SAFm8323MdzR7F4g2TyvLMNVwd|ZBc zz4R#>naM#qmR1in-@u59ieVunHXJZkbG9tK(i9D^zXBJRXQb|zX&g=PpuQ-&EK1;Q zl$`l9DPWzy@J&n-S3;j3%vuvmcuntG2Ay2u{0^TurAd?~@eS6hRb|B#YCq_G3DT zHj*wy0?ZGFw`bFR5PSqADNSW&gMJuXDjGpZYDiIfV!&yiG|%{VLWAYV;> z{QNnKQ}KKaS)fYcVYql;)$ zVq}x`ADfDHvQZu{Uc4|>>(l(Z@scUAu_q%`^CDKa=WN;f@hL)ngCbH#$dZGd}L*?OoAQ zsK1ov-s^5lHd~!M?54I+liBnc&e`W6UL9gcM=II+gPq@-bvjvyh32nYy=knZkK32CWAgMxyGly3N~ zmzn#$bLVmxo!>w6aj5gYd#}CrTF-jcv-&#)?$Ut{-|U)xq?@yAzxFK$HlUxv5lu3T zpDu=0XE0MLq#RZPjG6|n&F%?TE82-{XvOH3M6)hVb$_JJFP|jLjEpj=jPtdnmnRf8ht8g3zbwF< z7h+#Ep@=<&g;#H>x}95`PA!Zyu(saMOm#{ba{w9@!~z?}tQ+ zLCg_7K?X6>5pF&Nyn&{lK*ak-3~Z^TY-Jj%9hk`ycIbssPaf)hf_f4s>c;`R zlSr)49^V~-v~x1A{7pDjVNaAZX|4c#+^QnD`t*AjYd(IQepoOzTW#N)jBzt8e*5-q zSMR3Hc16VXjPL6{J43T9om20_s*e*Wt94P>64Sv+{nEU??!PA)!XCxD3P`3o*C}Cu zd-6n1F3)oVai5HOqn}eVN53gjoyGDDawkT)E}1vd$|^%<2@MWmnM-D8FJT={ZGPt8 zLJe)vY%aLbw(_VvSEXD~CTdfn{M(u>VlC%$k;Y5DGXU;wp1iMEpDnDgcKU-X!fkAI#^uw-6Ko2R6ClUkL-wdGCtT^%RWvFTP> z8puKsNGv$ftggmswP>g%&vs@U=0DjDe%mZ(T(L9bzSr_YNt<}yp=BGHya6hg-5Dgh z?gw5A_LG*on5hC`iyulG0pTK43X9)^L}qY8o2O=p94}(!A6|ySo)R?OEiwlhnTxzE z!k%BUd#eC#YDUk(x2)t>`tC;rgJtg(<4Dr-Kj0}HaNKs&X1g-QSihHqZjiKh$-JRU zx!!6Hdd7*OfaUpXP4X{Zf%Gz2i$8Lhz zGefVOR0^HNrxh6#^;DMW?pyeWPtUpr@DH}{(BH_@>W&dm$ls?^Aqg&sLhoCg66%${zvbGK1EH>aS_)>(Q@(zz(6xAwFU2P6-Y= zYa=et5s6RHKUvAzeJNhC5J0O^=1Ry_IEG-(!--l_Tt#zx){KnI53~=(?$^U*ZTN=? z@eBzBps8iX)1QL)#G7{*uR~tjXgkCWZK|Itaf@9!bNp0CGW0e>6V9d9B5^}`|7y(j z&5hT%Ot@`Bq~pHY1I3}wankMSz2%^vdI=-yrH4b6p7!6OJcf{IxkeEVEQ?e9G)V3? zFfj0Kd>pxDU-6o%v}3W=j|ho|X6w29Ow$Tk0C%TzaByf_{-VAcM)&}Cz-DQ_=BmPk z@9*fl<(Dp;rO0~Jo^1UG=UL4^wiZ}ia<7`5+wYN@#%X#T93J!llXs*(g;M3+aZMyO zL5DDZuUKTLfxxqRPpUZr(b_|D{zsr+cu(!F*4IZqwr!c8csn<4K>io*n>zj@gIzuy zCxx5Wv;VuN@Dw5|P&~L{(dRH4KJ)4Ek!x;lTd(qy)6#rZOUINgd(HE!*tKU44Oa3S zUM58xCqD26q9LSEiK4ggwjJ*sdRt>BX*b>F#&xNOWNmG3_}gc%gj6xnp0fek!C?Uu zL?)rPqZHUFQaqtlA29Hk7{R)9$F|Y<>Ug)1Omua(jQ(A7>*}HzHv9W)3NdbxTyPVG zv2wj))4w>Z^gK9>A-!-)%d%r=kmg4ODUfH^)dT27E@k?3=ITJH)9^*iimWsAiul3Q zRBu6RCnqQ8QK7*bBEcLQDLFrHXCPdE&z)mo zajEjE!p1p$g$@H^B+I#om5YB=Jyw7GUpL*36DxbiDfT*^#=8gq*`9(*cvG#REdtGk z84q{&q4JIVEpmp9Z&Jh1J4}(?4vGV=tGI&6yx!YR&U0adKEOmjtTD>8YX&vU{;^Z+ zsGnIkxZPE)?`>sQS*z_HUq}f$483po{D8?gr{bxk+BFw+QN|+=5qyg3AyF&&U76HRUH@G^~S+(B++WU8ooN9%36&-t(vGjh0Y&AF zh>Yg73-+JIs(x5nu|4cA>(C*QMKL|u?D7>zovw~5^}n6 z4k);XjOUkZ@&?ZS&E@&~`SZD^YbC116Dr>>OM`}f>PmKKA+TTm?7^Bk3f7EltR6C< z2btXhWsqaL>?=CtT*m098H9{J)!jvY1T{bdJr8#r`T*NzJ-s)nAd&~-X8%J0_ap4& zPEa(}Ngt6z?)QrC1o@g9zTa@u)UMxk(p4Hl10Fh`drX{7QKO+VZ$uS;E~QWyaar?# z0dsJ;y|gTB9jZe_t<>x;5OS5ya_fBO(K!iBbuvk$=|s&w4nNdp zu3Lmm4W6=l@M3Km+ULViWe&IZL^g>I>8w}jNI*%(8FVb5pxe3YmF06q_x!Qh99-fa zxUudnD~?Aj-f9S4eX(aPImWmu;-K-H|0svFr~3Bw^&RZ4iAMh9`56N5F%{W$7Ng0J zYNhijcHJ{s1-dFcyxQSpjpq`K>%2sHe&|IQd*QemtkKnFZ6YI1YpS2o#OEX*art)D z5=q|cp)K_6WOxY4 zuSM}>URR#E226&KcM#ikh~RT`0*@YrDiq(;Hnk;6t~e$i#n`J(t1l*C`M}NgoVwKm zpE*}mZO!xSLG3ySz;L|TH2O`({3GlIPlywCTaN#}svxZZ=YVRVDjLYtuNeZt>gKEw zoBc=Vef(S|^k=HLTrZe9wr*oH|M8*!b=RR2PuLqSwJdi$Zmxv-WM<*QN2NUOlTHkj z`UjNY6hl%4d?Th`Ol-e=9g(}PL;fRP`ahrW-~uct=e6nMp=&G6Zlb=l0#%Z0qZ>PWZwKCerRW&Uo+uPd@3zes*bHQ{Iz2WL-Gho=#*CBAs7h@FH^kS8fE2m%Al zTHol9Er^CEj1e6=6rGrOcEPz`V_>^;TM@0uu+~*G1a~Y|C&(TO!J$^Vy~|Al+@%zC z?PG*10xg|lJd?g*Iv*Kq7mx%NQLi{dNjYQd2pC&fT#Z38mQW#Bq|LAJ;tG#+x8xMN;cqTE$ExAVo%AgHi6GTd?kp zIXUxM(OTHb+5;uTY89R7Xbcq3s&(F5sse7yS5$wf-uZu9BzqNhsO@{4+{1dLzqFbo zy?k?ZdDLxl^g}ZP#w}JQRh>)>1R-K#V^2FeI<`(>CKRAJ&$OFmUzCW1IP)R>zzb-6 zFS2uaeaU71b-?L8fm!@}O%T{qkaS(?mc#1nfo505S30yZW2#cf<}{bHNPDw2{Ztn> za)tUFqr_B3P`&@m*i42b9Z;1PTbdP^wkFl7U3-Rd*6$;TjL;IM+Pe^S#N~|W22MfD z$+!y>#%E5wShOqo$M8`_;?(fW{HB!t+Q73Jr6HriiVr%`w+BB)T$+XAO8}WvGqA@) zi}d{u$D6$BJthgngGFU!bs7Zai~F7R1r^`%iDQ|E|8LEI&456P8q0RNo!a~J$9Rya(j7UZ&GUNs~n;gEMo<)xT^2e&O#75I}7mpsF>C9%gT`4u55J|&oLG^ z&uV{f7KmSR(aBW{>f`9mGAK6BzJ!*peol1oAc>^%fx9=zPK1OW3O<1re<96X#;g3V zeTR~`Z@B+bOMI7+f94J6(JS0&34-2V+h;AzALi#-OqO2f$aTb6lU#m-ljOk7Zwj<0 zTDwlKtw$`mh>2}l<(GwnEV#mcF@v%nk?8-^U8RPCdjQ)@THZ7VGoeO8?MkL>(~Auo zpOGV~GkmB4S}8yJ@a(FUxzW+r7k7&-ba@Z>DEQQ5X0O2%1)EQm5tJp)Q||fw?S1)Q zPTnLVgJD5*t1s^ekzNVB-cYvjsN`Ki>|LpQ&?u2FdK6gF*%Z`2cZFrh10J6mu-_%7 zro?$=mKSAiZeD8G*mwd|Od4niY;mozVaC(iyWQILOb>8%Pwa8+7cCQvP_&{eN-&N6 z%RPpW^KpB)EShel7lxiX9su~}N$Rj>DMiQ97cW{0rDL+*@a6FilA3eSW#$?ieTiLv za;&Pk94D+g@YGEi} z{zD0w##DiR55u2&)V5DfyIhnW$pAJq4Rf{4sd!DkTiOjCtc=ob3hf}UNqv@wdN`~VN zbIOFRE@%s`$kIZ?+xt?5st?QGeL}0~3ub_&mXKI#&%2yhqBm8ldZM#3Z^S8@0|p+Y zBIYwov8J~ABV9=U@rG*F$g%BXZk$B(nfIJ{^5n?{ZfKDOpG`}-E1-_OVIn@~YbR`?K8sqGp(%T%^|-F?GrQ%fxGQ zzxlMR*$CF0aB zp~Ng#vTh3!XqG|iP`r`mpzGSCWj#lJ1LdyGPP(G#vbXg!{Vq~^FrNbFY2;fS=s&Yk zH~L!0Hj9OhphIYM-AbMZ#=VX7ceP`P-Pky(!mWj#1%_osDNMp3KH}W`#ydtoXO=KL z1IAR;rH+P9EP}=%A2l`gP(#SYPT|4LjkRGpl%pcwD_kSW)f$oH^hCmWweoFw;-wD5qXc#p>PcN0#)PU`z0}jy!vWhz&O! zMY+L`l7od_lyr@NA=41~7ez@@*ehOsD^W5VwUMnddU{UZ`aK;oN~P>GI^j%ONbZjj z7ygv*db2}O&C;b)Vo8aOjA6NA|BxL>uTwZjAV^TW*amJw$Xb!f|0&FsfaI~+{W9^9 zBS!BdxX}u5Oo^t}Fht&7UPCU^PJ?drv%B>lS16-rJZtSo#8IGMPPbJS&o2L(zC5#pfJ~AephDLVZ)FMyU+wOvz2ir1`?^4cCqemwuCfa|FZG-V5{kZS( z?`xX4ny+o@cg)O`T(Zk!qNd&-Y`_rvBrr48!6*lY6I+;;u^yWB3Nm2$Ngzc0(<*{o zmvRQ6Ug#u(=D-F4r>zW#Dp_H?3>*mIkJ}`n%~y_j7V> zu#DW9nXm0RO^i-TzmLpB&bO(oE?NYqH#gCA0{>R+cwew9gG-fm;byndL3F4P1)A=o zagZ}}tZ*{8%(pCm+Amh{Wn6uiHc4F^Vbg++*YuRoaTqRHwG<&j&KyH3Q|?KaWjeC@yZOKo4F>)n`*JFi85zTG}pt)3X!9;o~CV#clOnJM=2|VCH z6J1punu+&?dBhR+tM?CU(U)R5lJz!~K)io)k+d_nQ;D05W;{>INpRWNX?)KuE$joW zCH7s`Vy9q8*B0?-{mm%Ji87F#-Ij$CU2XBJ9^Bd(L?kE42UoHA&`JHhXer;IKFA*K zNy_p?({gfT%0&8_EB`J=lcI?>B3Z27TKGC$rMTv#r)OpI)nb019UqSp?XZ60hkDGb z{Hr%EJD8c53H7K8wob1_18-r#ao5QIzb}t%+ey&^lQCWD=3J=-CEg)bw*}Sg>6w|C zu6&eE$?}jlqI`pZsNNJ@pb&WT-iVjCcMJ&qAYC~CMsAC-l`aj?0J@?+>!PvS*)`rf zdpogf?VV@r9Wzgrvz&#;pdn%L=%K33! zrQf%R9bxuMy9y@TqoCr96lVKMv3Ht7xG-J_&Q>8?PA(UcXP1={I3Z zO7ehk)1{@Wl_D@O8*@73dR28%-iP0SI`mz){UtBw!nIT>{C{#M^s<5t#5{8k(ZAr> zEzeYIB#0SSjUJz$vrLG;m}H_5cQw*PrNim*SG3J35fuf?KWmdTm&8kg8{>3$yynPGAk8$F$p08eAZ{#aY(fgQ@Xkc|@{q1RCpkziXpyP7Xw z?t#*Cv;FATuLd>8vp4pwJ6^MXh&Sh`5Ijmr$KW~D){M*$f{gUA7FEGWqGDogf+^+G zHdfR0JH=guo|X67v0B$*Je8!PjLyrKz70k9Zn+wpnHRD5s89Rl1+7IFMA~2lcLQg4 zyg>@I;m~)o=QWM7vak%L6;FA|1amez%|c+jtqX~yJR_+Wck!_qMTDH1Q{L}cF)+8z z7Bpk3S!>eP#6v#Dem#V)gS;D=g!oPIs-oNKkqFs_qOGx3cQ9S490(vr+=Z9|WgFFv zc5xOvA1!`nP?<5D$G$D~IKoh7_=}SJNmBCMyYJm?8Hb+k_@;Bvl=QNz-{>cd97c?#04|Ny5shiqO^%W8_Bt}KueVeb>nFxP!0^4W)93nh$`mT1Vw~wzcr~Mo#suQIt@1i@Jk+qkTWU1!!K>Uy{ z%E$&W0v7U3>u4H*8!*CK{%YjyMDSU(%XhykD!QGBdmLxzq#Vp!ZQi(a4eniBkh>5w zn1PDn--#am&#ff`gPu)pyi%QJg2;?$V7;=k@{uXoE?)IqTE*L`>=x^hE9(CW^8!+g=6zxvyoV*&czI)VASuL#n!_)^ z;@(f`;fbl<(MjSpKya-=GH@+f)uz9&g=x<*k&%qoq|!HnBbqT{42_MA$zSg6640n= z5X9&Yx2)(X0BT8x1Y)?ktGDi`oJcQqlM$pn8zM5%Re7IQU``K`9+Jh0XGHH^>fLYN zauAS~POJP{FWH&v;Z-LC&mR|5mXJd2WmDbeqk31{vaiQR6@H~hLkQ}_&exvpdIjwC zb}4(okFb9}oAOEI1UY&2xb{-fat#O+noe15ZEir(*w++$cBjzkkK_H}nfTU|&-Uwy z%m4>Z*$?4jE;t*QsSm%6VxRe9?Gcj`%}=-5PQ`^mOzT=)$GK5SeZdcjYq3EN`OmM* zUensph3BUZTWfuLD|>6^MN;c(LF&qY8O-FZrt_Cxz4fgmgJm})Vut`s?HGaUoP(sW zdtj;OQu)>fNVlvm=9kkUeS)fscPD)D?tMlbq3_~i{2yqX{m94KSFc}(E=gck7<8Tr zO^b4)8m;8`Vy}i95~1t3D7^t<{LsM9Ln5KQS?R;U!gP#-R|C$qc`99tQm(^4*$x|I zJ2noc{<=|$g#8Hkw)vT1^X@=D1e`ja3eH2Mw^UU4P~kW5-b}whmM$wRYY4yx%#!~e zug|)l+|{~hw!IcX0hBfDK>EH~-&vSTy51cgHsn)$=)67Cg}GOR4o@sihx-AdF+t&; z0;M)Y^4kBL+?|Z*PkWHzg;YbLJ#t@bY^;xpF@Tr%9mTl0Hkltje7JIYYje5=0cr$H zW&iX`fAwRU8lPIf-xpq)Z-EQP&e=%EPLq&~W0C!=j#Ze6|qXdOMku_N8OP!vPn)EgpPP zLdlt#P8Yrfec-&fpiL^GoOUxLG0%M{VZFd{apWf}_CW7T1qzrtyMoohrk0N%k8UlZ zw#fZNSMRUG&~HnygodIq)CLx10A&9(m~kEQI+rE*F`_zvc(GuP1^lG$b43;H?a)Lf z7)I<;w^xi#(8{6(D{PzPSmiEmKoX8&^UO1-nvbb&pQ|fcg6Mn z;{9X}iR+R&ZTQa>9!pX)h9Zbc59uBBngcF2SZ}b*h5dn19)6TgZH2()aqzHExD-6! zpwpt7Ce66G`RGN(Bt0( zT}8n?hr5W`E#H%s3-d&kE}I$m9SivwzldFnDJd{nIXSU&K(~uOV80GG z<{?MQyj{Ys3_(J`0EZJdL^9XJ9$^4x)wJ9%U|qz|klzhjm85WM^gz z^CBVc%Qv-m`OKZab-ox$L$rK$2{4YnQvEa(>yO-EnWiXJXS+dv$|7Mw)P|LUyw7M= z;l;p~k37%kQhAj7UqFr|vtKM1Yu4!Cw$0%EnWf24fvh97gLPbU=V#q^h5{7d1Vku{ zg^Kk%BhnE31{TS>tXhu`4j&8a$(vxRU9B0+;NT~5c9NEjdKMa}pSF=Z2yU?0B8Y#Y zo38mGO{!8_h`TEbiApVv@14I^lP19yf`qHLl(9z_$Y@%pv4AD=6C?3?bGeP+_Jn)Y z66ccXI44e{$Fc|7`39ly`q`Ch9K zL;z`d`W$`yHmDLKzV$JrYXkajnr)_iYZE;Q89vW`tFoW^W|ut;+4wR}h~6_?Lz||y za{cbjJro|p#N{(jD5stL*4%zKdMA=C znG)Pv7_hs*VZE5l#^u|{(P8=RL!p4FN}I&x&-Z!w7Nnq<+0G9!)W8>?$zU?&Y5-5$ zq7=4C4JW>kH6>j5YkaWF)BN!*Z%8vuEHyioI+6K%GvD7IN$z@7Yy}NEYX%ra-MTg}7G&b>{)YHrk^y{yUOyZYqRx?A- z;f}_YuPkjbuX2Uoc^S*XQZvl3JafYLqTR%jJo6EwuVB5c>E=egH_xn)q*~MM${H=_6 zxEO;5`n-1j_V%u3A2_TnNU^oiyXB-wITdEWmw(ldVcCRste*n*LeJjjzWQ8UQq-Aq z)SmwAPui+gTP3vz?0F#MhNvSmk!zNS#n3E0CxZ!?FadCpKTVZelDO;I@{gXk1bN=h z!LePQ*BIS0DwH-uIXU{y_$MAcYpm4o2^E9mIR9>-Ha7R~)4nYWOrm zX1yuIHH%=>N7H;|?Si>SrkhZ<^{-;-KIfU)lzqJ4$ zMaZy1?zBE5eIvX}hqgWW(ROvkK51=Gwf)XG^**M+(5IfB?JuU-LtZ~!t$u6v9bnO? zNdSvhubw9Snwk1F0Yb)LJr`U7nH& z8(lTe(RfEw3K!IqCo%p9olQ)pn$tq0<(ky8)1rBi4TS3Hcn?#^^?&4+k`*8#8gnvC zHmo5fCA}aZFvoEA%jz#sk}tGKL?o+rOt&&52=+TJR))MCmdWW^;AneEVyK>SG-1|D z26ArLKp7n+Bbpu0AI0xe)ktC8u$B6nWmkRt z{Az~%#ORCdrhSoKF%kw*T2_S!9~j0;NkgOY;#s(WH4CWq1>fVqUp^}&>#|dNKKDpgv`TGq$MU^bRiK@v!)zv3fIW8<-i-5rx2qdS03thitwJ2X_g!sxfn!F z7@KkbT#_m0H8-6q;)P}e7PWb<#4>+&E(9ZXAw~h}9XUW~(|{_JV*lcGmVk>UKSvuy zF06_kq=fbmGa3Eb4DM2xoa$HQwuai`aQE<_Fd&x0ms6p0SJMx zlub~CWz{TGa<-L8Y?pxW1%PQJ26*yhSasBOJ~l4pEF@OP5N>ASCSn(2p|iVy@%T<- z8W4Pnq(9d?6KszNhMvu^^h#R1jzXT0#0G;@bZgS)o!mM@2-%_JaIWB?08v=zTFjCJ zH+36E4xC1jJNkNTT%7q?umm6j0L;&_o|qPG&{IP6)9dXP=2N;~H{j zw(Jk9*B4pr(l0cNr@;4S8Pa=w=jXTS2a{e#BoRJ|iG3E%&cbAg?e-R7BLx}&Z$HaL z;O#}5Wkr{-?zHKVbol-l-epQLg6Lv=1;_PiLJ{ZII0K5ST#>i&h8h_?ge)v9!#JBR zMW7u*ntI9A`@GFz z8Shs;RoXuj?$U30_x`?T}hBz*i7 zINx(J?OUyjiOogb!A6+3l-zUe%U>`e6mN9b|LK~`UUEse7o#)M!sDP*ok*|&>7iQg zsu9hLRL86GQPFxxR!*+UTg#|5N&$F^rTv!6-wI>K>=yFMT()+FdQ$ZR2PS!d%z5)Yjzi%1n)e$kCUr^ zh)B~oviuyU78~^T_!ACf78ux>Id%+jMhf9Ehrb%ypM1*lK{yFw{~EF?5?5C4m`C49 zbHxR_o;^fFWV8$W?p&<+;HR$$@d~+&moOsas;!rAvaba`9hRceTcfK9b_rWaeAIPv zuSqxN@z0~YrJzlvj#5|`Y%aDP$C?fxM?fLTBWdpYyBNpgVUqh+OHe+eRCE~r|`U`+G7H@l_O5W zTPrp3`nmhXH@>x$*gUv;SnJ$s%hwlfn_PC~8~|elIcpK&2XX#nM*nSLF#1gFfgE}X zMjQHHKQ)pvj>CmmH{m5qK6gTQ_LZMrk_(Q~XF~~xYBX0ht!aC`i_SFYx-6j~BPDF# z+sskl^RqldBsOz6a6nTgvb0ph^{cAu>9X~&T+QmMUjh`G1TF9u^go1Gd|SRwD`t0| zWYMO2(^~90f0>9~V{nF|<;-=c#7io!YIUB6!h8GiG*OpCz(8g`uy5t#9;e}55xllm zaU8-P^(zw{CGD)BsJPwS25n+tmOe<;!V*gHh=R}sBYbh@(E>~K=awi)AERu(x4j)` zdi9ES>u$D^gyh$onv~~?XGUZmjw%6f510?Ux$_3)TVGlOPusQtH5LLk%c(00&U1-d zn*pZ<%F{SdzB*2m)7LC78#g(b?ovy~cOj6o0TdN=d%PqqHncDaa5mTYW6HW%;e6qr1H)1#c)!Uj%2`=J{>5UbbpOiWCu-;Tm4LZB!5 z4Q@^jM(|DHjRs%o=$bk6_+o@8_d5vTDaWntkx6iLLgWJ; z$Q-lZDl9zP1Dzfle$tQ{R&zkZ1CW#Y%4W!Jz0~0z&3iZi&Am4ZPi}5{Hdn5AvyP?| z4|qzP8Va05`FPlEn!I`8D*Le1bqrrRg?AAcmk3;oMV8{&lZGoQ#DyVR6X;ds4)MXP z6=J+mu5)1*9bLa&W$5}p}kaESQM4AZp8yu`WS=cp-dM_ttBnxa$>JS)n8E0 zbgt<-%ZoOmkwTNSf#WbYaYby{bC1rmnk6^hYtZbWx#cFts z4F6uMF~MHT>@C%-M`ph}4a`NH*4PB)1?hVOGApZ9I_r|~?SmE^J0IwRlV|`Ov(XA3 zjzYuVx;OaShgnf3W>elV{f%M*CHFf$E0x;YiJ=E(wQYLtmO3A|d8&jUVN&5L7EZ7) zSh2Z7Th7XO=?9swRi;zxU+q8TVZ2MZHe`hROWB)rgbP3MCEjBL>NOX=?4X-zn-)=- z!b!ZFiPveUu_14H+MI6Z*MKWU>maliR?S#@Fp;j#uV=Cj4LxvcY94IQT84^@!*I*T zmj_+OysPJpT;EpaSc8pLj;!{^K;2-fZ#fARAVT;s-rhrxS`i$-woHMT_8CGSLk-qj zKFnZsci*pXrg!QEt|A^(Y|XwaK^#93k;gOjw4wCpD_-Kdv4{^%RH!b)=YP@4tmkbE z80l`#U!k4o4s-F7^Whw$8XMcsyp7B$$#Hj2lpsg9sc3D_+59Tmv7Kjp zZ_A5wYMrY!p7VA9vV#Ekbc3ml4UfCKdm3kOw-m%p9i}??J{te#R7g_*6w-=$zJZrA zSkHD|n44TNK=IaX&GX};U;rV|F6MK!G~Dt_gLgLe99EZ&myGz?9?Pu!MV8=KUSpwA z4Ew}l-(vv%R2UC_D#4>XcpTCax-fdLBCjChInJMdPMA%HhzaRiOyR8rVKXX1%A%H` z2?HW}Rp+h3;Tavb$te3?n(5jIduKgi`PX7EiCr8wy8%V?LiH-M| zE%Ba~mu2Odg<@V_)p6l5=hH6%p%xfT{e?JXBo8(p_y^~h@xdH~+ZF>i@eX3rB|bna z=*S60yy9#jZF}5&4ynu9n#XV&-Fu`pbpob#A~c>xu2D(Q#k=#ojkt0Y)2u z;-@whRg{?MXgkz$&q4vv&fM|L7(0JM&wEDv7K%4UhIso*BzcXpDqjCVc<8Wp$#Fk1 z+fxrGC!1Wg%eeC*wdCba4>u%^-S4-zxV#{v)!x--$Gex!|CJB4HMYE@uP>32mWnfv zXG}lO*D*+sKp~@Xs**SEuH-oEs!M=p(JESzG(ymn9sl`ae2|kb_R#L3(PK=6=tr7> z07d>BZ$(y96|)0M)-pN}L3zdj8Zj*XWmG{{NFONd+YsnTzm*PLu@XGQAO3-Dsm)%sMlyfdk!TGIr?a9PNA$&jcrho8Aj2YYrH^pZ(<9;!s?J_Y~?$)xJ{#!L*4(xYn7V%t| z^=BS9*z-g5d4yfX|@uUZso~w+Yu}ehpl`H-WeR_0`FY z^6>_@y<)ai=&9_BBNR@b+H1$UGP9jBgG2J&T&momb`9>XqeauDz9jc&Nq|W5g${(7 zmL>*b48{0{$d9Y*+L#G13cE4EiIgud9x@w@HCQNcPQ1jaea3gz>+|VeDQz|}K5TCM zW6YGF5Pun4op*AOnK=>OwKolgk6s&)Ki^O#uz`8m7){@j0K&;i!8KU*n&!@pAewepH0@rXh>`>r2OwPM1-$TQGHeKMC#A27D-GiZ*&Gq732dLCYjrGrb^{ zq9Xg!xp;YOrFpM41A(&Y07A-riqv-c9<0?7W?cLS&&+59wr+K;@F9)Cl}Ka)eS<>< zW+eo3C+BFw$jAsX;2CP&O@*#3kz>5+4vnwpSa$YC9h`*GL&C8VhVGP?pcyirFgrs-T&&r(s zRG_rvjT<+Rc3X!EXr4QW4ppq#83<7Et@!x$cTy7Y2?lnGwwwJolXy^snC=)~2 z1Rlj7e`Jj2U%iUjBs`JI8mZ>+8Uqv9%|H6cAcrc4{6 zQTtc`7EH$X1DO{1!9y=_TOvh8DMp8r+g&z79x0SYqlP4F@OvGoT7l(w-?! zTO*IT#Dx-u{_JhJpM&viXO=C0Rc85;_&YY%B$w_Bgu1G(M-e-|``O;!#)VfRy=2lP zKqv9)WBg!bOj1rh93jEWo6q$@zbsxUZUbQ6791YkDv3Z0J+<# zgrp?(mjj_Y!q>W`2Q+D?{L&Pgs{k^+^wQN&!RPD5w~$}r9?i_+Hz@4dr-AdcUc6wI|9 zHwKQ4(SC*X^Ye-E7k$Hp3hqm#rxcx#tYWq_D0pxSd!2Z)w`TnIq(Bo` zj?y5tIphpTy5BP!C)xctZpT0UcY96)*{E%d)s%(`x)^;51TYqLXR;1iJAxM>k9x7+m!ya~W=>zzT3WM1y^_x$dlU%Yr>53wl? z!|F$m=-k}E?b#{+^XS;riYL%>RM{q_-#IXaG$dWHap1`_OQ>}h4=?_>#t@|tZaF`( zNp@EW$`ruakd=KyUV@R(V>21whg@^t<6E=*lb4{0r?Erh-&aFg^w)oSHFTVU+=Evn zm3BI}q(u1PRG&j*nuv%9QoRru&$H;K1_cG{roj&qBk4J+s;W}Z@6hKso;rglxzIa) zi+Y`I&Fo~o^8l)#tDlw=(aZ9q{y_gyod6*X7MA!p^O!XW9pTm3E$VZ<;>pV42vSgt!{;(}D@Eee=>fVYy5vX^{$Vq~#gMn~(zKXWyI z6m0_Y+Ce$n3sFHzFHAiGQnWK6&CZ5RMa^!TbKTG51c>mo{#4i!$;Mrch;YLxLE;RG zN;t1Hn-6Wd+8)KnGZ*qql{uQ76|DzTPTo8ns~UYlBse5JmBwd&@Ib*T8+#JsEX@Fv z9C3n{M6K|-Cx4B-zHepQv3|DXwIk`g#@6V!r6Csno_D>xOY?Nn5B7;mO61ro_g;C=D`=2p zl|9SDYZMXvez>UG{?U&;(N4>C*zR9fBHh>68#eK-|08+b&ay)scKgIQ25%-zI@P;i zb&QOjk}$|CJ(Q1a9-03tfeYzK!}&zAGS3^|HjH4o*PxORSf2Jh_M-L49k=KQp10!T zu}_L*S4VTB74caG`Xt*&EH;X`_lQZA+apXJ+xSLaK-O`mo9^PVRer5+QUpTz#%y9w z`^@%khEV>1apCz9N@;@M#ED~7O`ki4i@BAJjjwHr-@bOOn$fm83(w|?e?QdT&Wn32 zSEXJSbu&3!hbF9-pw-ObPBg9W(L3kD|8*xlUGiPZXX-%iZmRW{@reR+WTeLiRnK-F z`=Mj7)03i)vBS}L>Vgv)T+Z#(W*>fw#<$L&v~}H#p9Q6J*$|4kDBT=8tbfS)sG^fo zg0E8b+0(gLmuy$cXWlucnyZJu8jc7s#IvawigsZ6m$WZERXq~i#jCI4uOoOM*WFlY z_8wJ~f}*v&s@c6qI@r6n@mcEMFOlx412Bf`xVFLKQ+%6to)Mi@Cz#rG^9|XGq}J{RW&uH z6J4317xdlbg(i8PczCRpVup!;=^}25O=*b`aZ;{G(t;cDZ>iSAHd)1VZU@#lTa za?F1*gQJHDdY6L0^B>A}cJV*7sJ!>8jP}A!6%&S=2b8QdgIEkk&hoP>#3-^jbG zIllO|;J-dA;^0w&uOXz$@HfOi9-_P~E)#UFr8QPY3oU+4JfP8VPNp2@I*BbnVOs?x zlLtH6+wT>*A3J0_Q$;b3)j)L+QfTU`4`LpH7~3~oJmw+QL|xCy=&X@E1 zABNq&eOtK5D8rkU#6#N=cUoj^B3nJrEJRRHP*y|w`t^@ws+ymBMqdk9%Z{J@tK-mp zLzI)3Yr27b-1L#v=)0n;KU2OlMFrUVhTj{8R+x zTMKu|rN`ehrl^Ib{&eQipV@9|^(F+;EE%u3UrAlDFAvq|@n}&IvfZp2Cp*@H9gWaq z8sgQJ{bOuI5 zh(@kByJY*fErb4-!V%Hs$bF&R%lYLSa?|BoWt(8ua5&F1s-Fr#mlyMqH9_ED+vFzMtH*z zeBUTXjv>D~%QaZ9eViW$2GZ;X9HihZ`X3u=Yj5ZIexvl&XNvOWCqTH_tnn7=3(m{U z#hL1?g@Sl~^x9N$vx_A|47A;2VSG;xOk5XbD=9B=-Ea(t!6)juCV^R5StXbX1v@0! zScO^hquJf2rlt#p%`WmFA=9C(s`WM7A>_bcwh> z#v#hm%?Y<}&)zaBXv@5tT$MEOMpN~BTh8k?GKTQJVNrQ}ulXZOv`+zi_1kPC1Nh-c z1$3@yXVD{!w)2c6H1aLp8Er-fYdhY6F?0MIS(VV{Ct_e^H11Ag@!e!|WW-R*rbSYe z_*cP!iZYiq8!)A@_eaE(l$6?M>K`vI0QEIWVNawls`3E&+B=G}Plt*HhkY*k>Wr}e zH3VnA78#p*r;4mMcl>5q9>e-g%S9fe14=zd``zLvZrDm&yNeoJdXLf56FYd+?~q?d zK@L@2V?*1Q34yF9*ABOcwDSG+FaEffK3_rz^BV@zxr(p;fF!9ef3RE_*GPzY)TrVbv=SmO!Wr+YrUw|$uM@JdvCR*8bD7k> zIK2vOo{tURFZEgb!R1{#R}CUn618K+cU_4{IfOdo;g4{ zeTz2j?87IblY%ePuPQsOnF*cFt)^leez19GJ>Y1FsP}$jMy4!D9F?-Pwde~)6N6lO zqN%*!)B?kW?*wqqL1u=NekxD(hU5-8`Yq`JA|k~rSKfW%t*G1i)&J|h-tk8c--ILc zF9Ite*u(Qj(bFJ$=(yNzw4eJ)^K@f#b0>Rx$xus)H#tXZxZOaa!`w)%MgI5)e`fV^ z*HYJoWF1^1{~%BJF!)*Q?ALl*_|-7%fZd=AJt6aH7M3yS@(JQBU$=TjFVe%Ce$0M; zEIGZ1z|V)ipRU?NV!Cum8-(PXrs~%D@@#Oo&HIZwVpht5UOGM!hQfk0Eg30k%Y$&<;wu=jU?k{qEXfwc0omEv>vMt#{%e6@y&q{s=9WrUkqO-c**3B7QPk?fL666o* zFA$@9vW@(tqHdG2$bQT+sDBv{(59c**QZH^)_{=!6tr))wVqRK6Cg-CxiQvzK%IKx=o#-Ss%wF_od7GymiG5g z)pEM=RKDd+gD`|u5!HfwBF~)+O1tonPbgxhhs}{hd~sZA36}p?1wv4+oewTwud-=^ zX>ucPen}uj6~SedYfw)v0tc3rWZ{zATq8enk-vQTlD@MTBfPafH#RXb;fi}R8Wk1g zIGNiif9TgS?N@<;v8vh*lJOCaNui;kMMuKt*OA-4;%X$IYDw$s>j91eGw+aEQUF0@ zQ70w2g6PT{Y zu2wW;_c-V;zT_|e%ecJv0Qyi_=Wo8?aL|`Y?ZGF>z4t#y?u|_La%dlZd_*(>RAK#0 z%*`jW%VJ`#y1_U%+ws)=#XI`H35C#gYCEOIDPSyA;%Wm}i?NPFo~(UX>bq)wVw(lM z)H+AXR>zX#6niEgx`3eN`P7S#B3&1=F~)Bn5n7mn?5O7DThQ$p*DJX1ujxC4zrR|>O3GVaWDD!@o{BEy79R}h1cg<@^1Y<(yltL%4};Zf+C;-N=gWb zlytX>fYOaL(jnb-5D7&}1f--S4xNW?Q4mDBk?!uUZ@ulTyY{Fi`F^+UI|NdH@L8^n)+~eJM`INvb z7eFFwkv`iJ!>bHZ!%F)4$s|sbSk>msBPYkjbBFLaSQ!XGBp3XjArNfzs)z@mH~q-h zo)a*@X`U51y`n-Lc%*hN)y$4{mpCJOh!{(N+o+F=r0+$s)R!kDpCZ8xiyo>dlM0Kk6@!Gw5CHil!F*29SNwNWR`Z z+&;Z?*1qvkttbV^2wgTIl?@lpQcO3IhVIM+o5j=72 z#7qsNQ?{>8Zm^Md1~nZW@4eAozRoO9hRRFHnwX^fKc#Va-mV;fFMeQfu+V&n$6=|Y zo}pwEsWWOB#iv>YHI(;c*=CLCM`L;BtUFq0FNv+U08Z%S!X14N!SH#zWn17!)o!;} z20FPgvmrDfF9LW|;C3|{ELciDMur*oQ#GNwMRAx^z1Cr^h>3Fkaf45P$N&7R$KzW7 zPhxVp+kY9TCrJ;`0ae3w?V~CPw)|hk&Cbp~wMYUtiPwISNy7p7SI~xlR{v#j+xry= zgaQ~jAtogyVC)KJ7*J{OuD$O{lgA+;fsN}I&p@hZ701QjTy{w6)>0@Eqrzfkvk?9iVp81mUv=T8)a19tbmia> z7$BcNL)(G7yW0(eJSVPQAo`NNP~ zkv+DI5nNU}JDmb?*Uu`T<1z3VSH#DUCYe%ct`<>Ez!Dh?FsY^QFmar{{HCxV&i^1XYl4{Qb1qDyFBzU7rOjhXrU*ovwQ^v+onB6peiu?47CF< z&&*pJ#FKs&2g>X`$!HGadVd16OH<*QG`J{2VxzCc87@Y_t`KpJ$}*~`sE~9MGqODM zeqUFoL1x*Iv$b8Avvp85>~`70g=P86C#I7-a4J)W72O-HS4jAboL#_$2e!jlW9M0O zbMrFXO@MP-3{@eH595><N){UAh2Ds8)4WpLkCozl@>kULlZx3$zNdZq0wuEifY+!ur-y*>A^ zm5O0$j=x^h@Jp24_|n_c2;8WWcIz<(5QwP-^g_GV$Hj#W7e0$E-Mx+!_EwJ0)To?i zK+tJ{kFTPeU{@i_)^Jwd+9ZHAb$Dz*%+V;RKB+~)Tq!n1uT@^bv3ZSt9p|6!#1HOC z5Q@OwO}~a>8ECZtujJny_1p)O^RfZ@Se<*g2^Z`J9(YDeixlLDmyJF{hlJ{5ef_>Z z|1Dwdo9JK^U4*$H149cUrTE#47cYhxc1o1g)CMOGcYUk(*OTD;`_9}%msM`RnONFg zGPScSocnsTH;hL@V$PEz+ZJ9e$|>*lqp!)NgFG<6i?9;)^#_}D9ZgYHc4ljMVd0&E z7HL96gh9^A*3Kndad=0EiGng7a2&*J&I?iwEZ|^UKbeYnfh^e+HQ_pG?7AImcZ7ZDBdUN!stufpzQdU%JjQ zo1T%~p=MDzbvWAXF5uLi`0Oe&=4{SWtVIL+O-_tPlfo#;EHLCm)AX=AoQj{|5`o*t z4<9xMq8Sza@wgYI`0!XGT*cCyjQV`$zi{;XpSj~n~0Mj2`H8LaK zy})%)yq;ISuUeh6a8BiOw9C0dSbWc%>56A1Ov}xR_($>8wnW|z?=jMc1p{TK?nbg`g#!u{UZ_pV|9XY&Q!X@4h?QQLvYAhOB4;VSFXoLLUkl->z^CEV&j~L z80^qq`fS#Dgio~Sc6odM=*9%@DVZWg<9bUbt!5w24S;uzT4~`68LnIhjq-}bzVfCS ziX3VUjh>#ClstUHIQyXs7_g{Uu$T9;D(nPM8PAFTKM-dMxu??k{-44t-J=C;XJ2KP zjH6Yf+#%2agO%or2F*J`qM)n2edj`rs;cVd(Tcm4`)(cSelfvs`1I+0)AmAVV=p#d znm)=Nh2!iUDA*uqw8>*)$Y>c0=v)+pYkK&s5;ncsS?8Pk0%!oZt-JAdX zYAF$a_@Gj060h`GT#p5RYEMy>XWx;Tk`i_wW-60u36Q29C>8peRhk)V?Rc1c@`t?uNbA0Uva#LJcozZgK;Uv}BkuLz>%mjg-B*44evZ0-i#7q^i!-vk& zC%Xo?@rr={^XihiWi3j6{}Y(X!rn}rva4%_*7WvPQcMh$`>ZqYOsX_}_P)QLH_jj} z5nncAOz%n22pdq4NlQUpymdut*!0mqi#oO zKX-VSrx(!Ve=fl_R7mwBb(R>!5J1sl%5D?`J6vl}EjEu4UARtqz7$MMC^Tv(2TqtI z4Gqmcn4gr`K;qJzcuR()p&cd*4yIu!#FyysE0; zMazo@LHTkAPlW&klaMc}M@R2m!J=c0gGJa`sDG^9DO?+Zbqf<0_g4APvh#gWQJ0E? zhzR`NgQa>G)`lS8t5n!$?1(5nRr`2(QInIu-(ot3)$fvB7m5D>2Hrobuo)2g?J9e0 zZ`+?r=Dd4~W!`eIN3j-W*!t_M`CmaC0-*GKcI}2HiY9DtN;42dDBJQXQl#^t?)%q3 zs~ERm)eCB7$%3Ep2qxF|Vk3AgdGX@KeAO(0etT={d^%)WO3IH^8_bQPX!k`#XgN8R zLAW(g?Ox5wW{H||{xSnu#xK>@f6wzz*9*e|Dki5w$lZFv6KYas&Z&d^9Kz)Ut~_PI z1;e!*jzZ^$${lw4JTcnkq85;Ryy$S>tU2asvOWxl+8d$j20z zSXhru+J}eDRXf!@s03ApciU7gp0|guCERonbqBsXmF}0Sed}In_xi)B6-zEY6Dpa5 z%~S%zcFJ(|apSqC3lMZK8N*{&vw^X75n!iMXdd~4OG0jHn?Zk=9Kbhww7;!3P;l}o zx9{_sf*&IVx}Ea`z-O16#__3&01^B$rpgIHQMqm>mr+o2kua#vO3D-$)! z#W_t(*Dj!4@#~g$c>N?keZP)s2=5bmqTDUx+R%?0;v_uzqMwT^ z2kZnMNi(#RKU`~Xq@u5ewF(xN>vcZttt{2PnD@yfkEfbbgqoI5Do@s)@?#Pmy%l1d zoZp4C0o9XA`~7u0li$LH)u5+qD@0|_?||s9?C3=6cxpT{zpjL2+d`3}W} z9C4-7`J5ko9G{}6!3096U2RU6E_x91+R%p^A7-J(jpf|NNtG4h? zikoXSE^T&SLdX0&*w8{o^t~+pYQ`SUUK8bqRXHguJA84Gc5XR4B-u3ErtVg<52IcQ zh|Akd^VV$|tXCp>l|B4UJ?G#1^^PSPCKAru9Pu{WsfUgaq@)714zkkIt$C~0#&7_i z0KeppnApp?n?NmCa}nh~x}8!ugbr?3IY8d{*x3tv7eurG%f2#@xVkUSl2#gNl8>2t zxb9B%cHznL#-$S8&81lAOTE1Y0Mu3O=OiV5^5`}#HyX|-m zyqpf)nZ%R?9X)dYR~n>deD4ChKdT3Qh+vCh-?IKf=Ghz@4b?~Tff`+KQ7i^ z|AeUkr8c`iW4V5D#Lm4&Lb6<3ACOB>uo|A4N?o1R`T^DdriZ^jMo^KOs%knISQTUv zKu+-lsES5J)o&2cy+kUDAOM4Os`04|+G1Zp@96B5q&q5<)_pA9!)f;Fbc!tpMRg#m zFvm68J`BHBdZDXN&Y^?oqVE>k6$?dizon!+?Q%@)aT#l|ED+pp`qK)F6qc#_e@k<- zf?cvvACoR>d!cO_1H;7UK(do$A1FIHXEt0cm)tE}XQ1`ugN?`he%=0H8C=~%CcL#; z-j})Z!m5h^McCgHo}V`DzizMg^6su2zkShP!3Vy!1jGWjCjn5&@(mgCyq@ZIF9W!%# z2#p52%l0C3R$>1WLVjn2F)_-i2d%R5Bdt-OJXyO?>Kgw zKKtIBYQ0lCB1_VEt{cEcT?j&NAL79v{#!#hyP?sMaAp7;>pbxS!PX(UHgeMv*KR`5 zwH&#zG5Yn%3w*oDShp_GdYlVG*E1GQSL=URS$#sNaH$jHUr80yJ%~h*z*qqv z+`fry^6@1Oln!5>3)Sp~Qg10+$l(##vQM9iB3W5kWn039o?kA_8mVw9fZUm4_Mr>K zW`jLPh~62{t%gt#=9;4m@sPHhcl-cTZVoozZX>xyzyD%#dNiXEP}k!g8S8gcgZHua?i z<}n;!-zHOo0isx?eVNmSife3uwaI&mb;A(336eFG=Nze7iQNr!$UC-0ZL9H5 zn3_${)2j&877&w17wkSe?a*;Eu%z6`GvX#MR$0%xcsc%Wu$W0?1@3>{YZ( z_jYbcMkPIahEyz1cs%TN$7!7OlUvyk(T_274HLGJnT4#?o;{#(@?OrA7ZP83Y9Ibu ziOw%s=yHR$_pW_Ytb)TghPaMSw8nui;-iuJ{#?`~KrlktMaLcopBtd!qK6}! z@VDDAfO!?lth=A=xDWLZk+KaY${MZF z=TAw7Eh10MrAeSCTWD`Uem{7C*i1k^PnDSES^VYj0XGhK|NWje`{b1Fl4o%p9U1!Q z77jzN@k6FD-1ZR=%l(!etp~ZmD}RbnA&(Kp1NVU$jEL)t6z_4fsMRvH24-iUD2uCw z43^4I*sw8}IR_DDPgu}+cF%R~w_ck!8Xh~Ls zW*HrwcXNbPxcHZ=0d-Qk;_C#Ih;s(i5r|5$187c-piyylU|`^|Pgyny!sY=fXqji` z6#e6=1Cl=bN1&CF3|Bi7()ovn7WM6%MZ!je;>pR?K&(}-+L%_@(zYX|{7GJ^<32=) z0N7XvKgZDL78;Py(7Fgxx(6odXTR9gamRY+?HC5{6jdZm4Qg7sm|8p*zsh1Trxq?a zv0Zh1nWLCvzP;B%S=?o0OyOcXP1IXERrm1OvXyz(6?<6Rmsy^b za~@3X?(oLbG*TT=t9^+>w+TRTMG&wqn(Zc;0#04phNsb+kZhZ?XOZ`*nCurHeF3?3 z01esj)^88`!3}%0Df!&c%&d_EPJ6o3KYPu@T@E~0(jY!5J?d@D1+wZUl7Zn z>@XW}L|xQU8VVP(%d;Dtw5aNL=qb2N!6nzAtlvq_$f+8@;Rt*Ey2Ea0u@ORW0W~^3 zowhg`LgE$Q3EflRjUS6VY>9iWYOA!O#)_KSR zxsR;+s}4p>uFzyL$e9Txk0*vmn*Ky#{+k*R^(`p1@hBk4r;D-cQE6&!R(SZ(ca8*N z=t2Aiz?<#V57$V6*juOua|@PSL<41Z83Mbb=aen|uvEvtJVs6c!Rk{Bl;w&IDYrRs zEo)w5UEM_gl}pn6TcQ5(Tp8U1iN}Be5N=+vCeq17G4X9II@(36lQBh#N9OOnE&}?J zbda4CpA%qCDkv+%wT{=ro0^*DnbkEk=qW0iX8RW(h>5+%;?uW31j5xFrmUF;E+0v9 z{Z5hK$zE$VQx<%+aAo}Jt*L`fXGf$b@|MZF)CAKA8xMTB05OL ziM*-;GLgTrP2aUGmNx|sYIpW3{KY4wURQAcn!c!~J_HbKURtLcwgF4LNGZWt zWRbTuDev`7OtKuxS1K;X$wqWf7uAP*E)g%OmM@T+!!ZhcOt(v<%d^&cN1K_^aqz;D zAhk_GpI4oKhZWUh{nJbGf2DbE$RNeJzNV3(%WrvgHTTi&{b|nXG>{e^oESz&N8bXI zf8Qfud+SqB*@3Z!1q#Tew;97@dIm9jG4E)fK@!<*4}xt{bk-zCMGW>|0w~Wm9j|ApR9BWUX=)@X5}~Q05?QWRUO(L&If1skPzIp| z-<`0w7uq!vu6I+Q45X^+0m!<{!$(RUu)@87YiJ(^=ps|u@xc#UQ|FaYL%;N-u+3D~ zAJG`Iw+J%wswXt7Y+nTW98?^IU0YCjVb!MHh6+{0V47U&EI~x38&fMgEqQ2~K3jW- zm06WCDp{PRaxas|Y<$n$)Rn))X#}mCv$rH>J2cObt^;?es6NgU&?rpAw)r5(e2zuY zF1FIahzIL*7;q{o_|snn0EeQ^Z3t=={<41SD#YjN3NvQJ`W_d4Pm9 zG%Kw)Ib3tTy~uwBJ*R3vU9^x2cgX$Z=w(0v|C)8q4Chx%PdcJT)JPUy|117k>ZKi9 zOd0%EUr_^Do<%eV0*+=cj{7kQI{ukY0BXy^=X67>gaDm zHk)aez6uN^+Uw|C2Xv)+e&21LnYgpbtGN#hxtakcG;M6XO=M`~Yv``IWpo#)RqZz9 zcTe(~zVLJt%yWoNbQ(6|5@uIt!dxGC-!bFOttU`d9P6w7<+K_$)NbQeuD(Z) zVl@l|_!(bSYxF%3+zb?~@wYBg-@eU8XBALfSZEA-rbO}BrHgKfv=>)x(L;p@s^#`s z4Lgo%j-YF=5g^(P4~y{s5Y92_q0AGiBK8w?pt?jr!bq#iD{QSK;<=VJS1^SpIbUG( z>O3fl!XfC+OsNk!StP{npmFJt1-?FD+r11lwzney%IRU`4DUIe^|xnbSoI1YT}X3z zBMiJ2v_9PqjmPF8`BL&y*4ekoaJ8gJS4#!@*2?ChHqH?hDOehL(k^^GvGn&PC0W~i$9p}^F2taPnQYomPIbcT$KHhB*bmq^oio-&-+v-;SJhpoDJPZa%jxIz@ zcsMvyRwu#1!6dze-Vif%Dok0u@){nVi(L-YM?6i(1QOTPyV7E5cYhEKA4E+;T_HrZ zf`}Y|tuVmLcTk{bY8sIq_<@$Dl!1E30+|nIi655>n{XhzYtuw|He{_EFwuOJy{TEP z-qLLvo3ywlf6zyn5->tLi(G;C2@%vwqbr!)sA3a}9k+CZeAl^QIR6W6M*mC^FU2e^ zo$)h>Z3)Zm^wds&h_~n55Bq{=a%p3>3N?CW#nhRNO!!ar#{d4qIb4uyt?{BTMWS$> ztMNGJ47jrw^e2FB$u88@(K)a0h6~sp7xBqN>rZ+Z%NJSbwW_`vs>Ojqvtj22ddizO zH%(jUvPPcdhkdacm&ElGhDf4S>tQ)>xw>Zd2n^r0&pg`k0L4E61gt3s zltHoAcj(oMBp?nsP$+z54I%GVc99^+WK_7rvfT(C%RHnJ)z7n~U*C^OmuWl`uugrtKx3LiG z)Xvrx{25zdf?nV4KJ@+tH42K3-1bF&NTN_ zs5~fD!Ha}3RshNl4mS39_tyr$hJfpEQ1CJz$h+JYcf1Lo2tU|jXJhNy;vvjQ?lxKK zCHnd!4F@^#j=XrZz*pBAoPDs>3F#)$t{ny{z8akd*ADa;w4`;U(?ix?1W4HW2*hN3 z&E!)?_3eH?SXotMAOO&m>N^h=69DLERW75h@EXvS?9?U}Y8CI|OH7#u2|q6`yS_~L z`C(+&)?2FoEt!A%*IF{|i-8=oI@A{-?F(eC4N$btsoeJ1qtrtd1O`pcX1bX`OkwjS z$Ux-wKiThZg{6Qj>|7IFPAJx`N#NDqh;cvO+Xn;TH=9{oF}Tjm?+)0^v~&Y&mIN`x z0M|nR)<2O0@X)f8XV8f6L~ zknH51c{^bjAG)@$;eNCQxzh^=9A{;QXXX9AM<8oyWn_oqdyfkh9r!nW1l&3;HUr|0 zz(3jHhyy&01M}1%IX8{_GFV>J_X&J;x67oCj(MgFuzpqFXYag#n_7yw#DE&-zr7 zq5QYLAOcTAxPQ|cdeopfhEHPk4L_oPfdUF901N(kwl*Hcq#kGj1e@zY4Jrkky3Ib_ zAevjZ>KRR$Xpn@2M;8lPdpe~B%?7fI$s|**9>cW(Gtqf(DjZ_!L!r1T0Jb1)DDDh^IEvu?f70K%AOA+x}?kuuzUXVHst&S2);JNQ z4>OXI?h_LeXHWUBs2Cd?M=?Eq=D(T>&05WFxRbSLU#^t-C;_7U=Tsk;fov}hF7EWe z0jPl!?p3WC$iX_J&xc`A=MLCF166M2pv)Ey`axyugQ8wG^RHZAqS(djb0R_iug|j~ zKQdf^yHoB*SLiB~upygd=5qOh9+ zRtjYpFt`DxZEYr)+haEJSSebH*gg9`RKaFgHwq5qIwuSRym+H1fUC`2GiwvCZBt-i~Hf)sMy-T16Zq3gw@ z*4_g2T3V?cP0~m&&J_wOqbit>3dI9fJDY$Ts8gQeu8w^tLuWB@exBWs7#R~av&O`* z@AcYoOu24PM8P#2k`hvED@2-@+Dqj!(}!;`{iLEDJq8KgFB5u5Co6gn*lP!H;4;X& zh(^y@Uu88Ye~&DQ{;xyqk~486Xk2@%YHg@mrPEQS>Or7}=MxU<7SPH@0GLQ29p;jy z0IBqF4g1KdRc}V4{mnV5%QoQr@6u``UlA4+y`bSBLf!p<3xrMjd|aoj9)l&KN+pA1 z*#<>0>o^tS0D6`sr#EpG7Wp(8nC`fc&Rx_6a8oG=M{gfER_8Q8zF>_uIvawhTU2-@ zx`>(4A=o&85xocZP>4A2a5pQ|^rb_t1_XiI08T429SlX*&RbOe~Z^_21B!Dm(q*`VXol^3ussol(I_|$R0z(i}kP9kg2aW*8 z((T~7F{f1FV&lV2w)@Y;uZFRfy`R~5u2RQ-%Pt+{nrQmrG`GlWXH1#Mz}8?kBQ4lh zCKg@ZdF&1%2pUQOUhP36b2EkGSog?mPQe}W&ZuMDF1Fq6owtigzGdcP#n+7e<}Ad& zo-XM9UY-ByG5_=0s3D*S9b)jkh8GVmsg_tKEHJ+&?t{wePO~NepGf2rXmFgI-9|-m zm^Lf3Mdf*SchNF!$dF9ND%c+QCTR~rK^ob_HE-3rXO;~J$Z+p!L6vO_4%ND4ZHHfl z0m^^7FLx)-#Mr``SO1M$h4pKjp&sPAc&vSdc9Twwz+3@HjMlbc^Z)eye{VsXWS~U^ zDUjqO#$N4V9zk{uU$L+N(UTRU-hpTC5iMNq9UoKJJvA_~ zvERuzxT}T~NK>3K$<@O6mjrP>z%y(*cT?<#8~3{?*YJ9*Q*5cm{^sVx7VZ`nIyiFP zeWl=lc%Gh5t_g}vNbIfwa9}nx4d4be{Wdt)BWK&)u0gF`D1>XqER7^&biG21m}=Dp1vp56Ls`l)gW3~n zD(+s}plkn97?Cx#XAi}DA$-3d463gEQy96UL3#jisRcV5hE*jFFK zhemc~(#WTjWIe|G{gD3%>}uSR7^{x*VqNI9rWCi1)$}k8&7WWh8gF&&5`jJtfOmY4 zU8-XY39PiOxaI<{=8DUzAO$?|1o0#RE5)vgmYNH3H_VRG9FAV-MPu>4)ZFx$U2avW zlK30J_~@3S;0|mN6w~NUh{*NNSL^=>Xc3V`aX~f?b1#UOk`8>+R-fK%4RKI^!Sky_ zJjSuK-aV&T)BDE2f(AELLI{-5M z@ceo5C9V=F-YeP4@C`u+QIL4jJRX%DvLCBMD_rmOT+YP*4`1*XJO9TS{QMIVeNFZH zWdzrP6Fnwy`Z$nHK>9$lweixg_V5Ug>DkSfmn{_UKCGg19DiUIZIjP^6vu?RjDc>d zY~oxG!pI`32(wBBteepfKC?3ut48qk|H)r(`=w)kct|Q zaCq(Ft;6S>e}j0(INSNL=}S{*>f}RE zSWj?|_njj6a1bJkW%#|Vc7MWbW;vYXCt>QUGWw9IyALrZCXPS#S>KJPH#PJjxVik! z%Oip^&}lb-<`1*}jd3BAANHb2{$;V@9U0K)2j?2&Imu1RT(<-J!otEW_Qf(yS=q5) zFZ=8Y3Q^2u0uN%5S$U5QS*3#3rJJCOgwPG-ba{Xz8qS8W?l$&YJW&tH#LZw`G|p$; zG|q#nXYSeN54y2@>n7ADcvkxT8y4&a6WgZAWBRtBa$1us>G3y5He2=pGLXVzmKu&k zfodYxL$t()s0weFX6WW1kp%%RFj}tG!G^z*!oTsFC*X>A#jh8X@OlIWUWUNgWifSK z;0vX&|56i1pd%3MrdoSS%Cqvd371TaxJ_W4?zH~e$wRz0eA!nN61oQxLJ@?@tMX9= z4Xdm6!rl!guFl`MTsolH(G@>b1>SEx+W>_WiZ}pRYP;fbSxvPPL>jS+%*HJvbOzBq zC_=SB&vjx_C$@Tu+FwP&!C=Jymwv(t7QmH|Pvp2j^#Me1koN#sPf3>3L;$CPj10(z zMc4N9DB@1govJFbE90Ow;ivOLW(tSl4b zA}3f#94!U8YUmr;B{Z0rR90~JKpJSKGG(AiO>sjPFxgb@-G-^g7=I)1NnK-l{{}R< zfYQEvx5za+MPk|~q6DP_ITVd;*2Fp~r>7VQS*i__>RoW_jSh!f|Fl!s|GvYRd=USt z)f8)0%6_pgAqR9sIH$P-0?m(4XYfBE`}+ExD~)maj#K!bB!&kTrM8!4X!jyGX4fTa z0kL#+xBQb=m^Ad2Vh20P2YLb)RK9*W%78;jw@X8oS4+_MSP|f`tebN>Ahd*kh)}SA zCw+tBYc3#Nq8HtgZ9hn1{Ss+ebuY`~z%-)}<;3zlseQ>39l*%o{zsvMQQCwUSNzsf zW2SqYgXXEf^(Xo+rSh{%{-w8s>VIeSnB7f#yN;GZ<_5o;rNARDq_lpPpKKk5ER-2t zq8ph8b>B`#m7wnXDE}&j7mW;=46dKhLz+V1OZ`R<1Er^lYjXvF=`@4k-X!n%C`oI( zG3nd&ibc;H+zK4vQ5_oAF7z^K@!I&>vumAOnkBpEa;(oYwP-c7ySVcjle-i9$Q6O$ zbVA&F@Zw`4&$ry{FLdZ%StTS2;Ph(0Rrwf%1nMj|CKe?Eep1>{DR5$dB1Gi?G^E1f zP2ED?fy1g`P~WJMMLa*7yRT0loh6wDxLl)D6Q)L~Lfa8+Knoj!=q*tm-gGb5TWc}$ z8hDi1pr=^(O3Zf#Q;J*gEAIeH1Nj$yxifJr=n^dVqJfaEcfX#uJ)T)f1i5}uaj)mX zP70z}KW+Bk@-yyeLp=igSu=KJwa4Cj!`{ZE{X#GkD73u<@e`!WFHbGVbPqq}G%w@{ zeXW|9-*Gc8i4^!>D6XAgkb?k82bW0T%2N7q5N;>%sPsn%P0L1Rra|H}K|#b@g?VNx`;&I!&TRWD8rN{)&r*3_uiTG2XlgbInDZ9p z5sH&Lu(4Lv?I$nNldf85&rK zN>FWUZ^t?*+ncF;Bkp}O`F)y59o<&-Ii;%g2Ydx{qz7H*_6Fvm5^L_+?Nqn@PIX{E z{rFE$w|!;02U>NDpbjt#F|MXbyL}tJcg5{7&QC+&W@K*Q$mFH1-(lHIV^PY^dTGXc zC8qsLO7Fa`eRtWXCUaihW%Vom1fAF+=z+p!U%W`v2zZNV(ELo_r6%1v44IEwHMXGm0Bj;nEe>Go0RZs`UP4tLgvM zO_}Ks?19fn-^IEcs|i=!s39WaB_VyvpZ0))v8}I*(3GQUm|%Z%bD)XKcG59O($-e* zgU3W~+|8mNxji@qx)1pHE*%uG^r8U%JLw@ZVkl_nnM| zXI!4N?Amna+dyBXjfzsV;zF}c+~02ck?ym3fd74wZ<1b+bNsGXZo+y)cWu$sS)+FZ zg&&1CMstCx6}GYYne5~+1!aY3JqqoYZOeZ;@jHLKQZowLNMCAcUB<$VU^i8A9Yfc8 zA-L-)h(8uWEc(aWev=t|Rgb#Hc(|cv>PvX9(#Lc>2Vq?s)9TB8${MeV$Yg{YF;kVb zB8dqImYD7KgCA(XpT*Vtn+5th$-nlH?iEpaD6FnfLe0X;a$q-tdlfqNeRRHZAl4j_ zA?&x_e5p+9wG%4T?yXgx*b-86rZ09K{t_G(IkZrA%rN^R@YtMSeT!ygywS#|BKz4t zKVvS?L)~+uz$F3&ba>NgdCP4DihUdCi@#z!e6PS~BmfgAfM_dyxBRH(; zbKj;>LpLz#dCl-Zdg|8`>zps@_U!@8w|H})#l|z%Dx_(@pV}DaQ4md0jG4P?SWo|X zSu7)#P3ZLY|1SalcBCh_Zb;4`9ZQyg<~6#X-wdB<m_HI=Y!ai>xjVQ1>2nN|LUK15cp?uryjk%+IdW_({r)Tp zd#Vcy_6=|5?J8p(<}x#M-}gPN%)V}ho-!R?zcOSw^1{bFfP>=bpNjCPIvC@}6TS}| zql$_OYC5`wLO%sedFdPiR)ftnDQWiRpC)j$;CUool3ZKdk7*o*H%+z@TP-?n*5mE6ObZ|Zm3SMQZ^ooAI%D5E6n=SfoFeq#Og9gRlat2`OP`S ze|ifO5mNb1@5fHl3?9!o%{~Mb=q=zg5y5Jbaxb1iBY(nwEg1j7ee##+)Al|BVPQ)5 z*Cx3stTI_4>An$w5<9s52-Jce8;&aK7YFufbj79mhT98tE;tpWyb*o4-deaZrO0J$ zf7mm=BkZ)^YQzYcrN5ND@89#l0^GE@BUjs>G{}utK1fTAuiybWOX5BPij-jDpzgr} z`%KbjA6?yvzVmh>VGdy0966(;3Pe>E!Q5NVo zP3)v<@h=tjbJ5^bK#v_+Z*8=!FVg(1p?JKl*_(9m>@W|7xmr)AzoUO*J|ouw{IwF#l^kJwXzyau!)j84NF+rd*uW zI7at?-qX=s)0b6}!@|R(56F}_N#fF$xDFIO(=()xAu}R2rt(2lQb2?zT-|RukPhJD z;cSyEBc3b=9MSNm76OKy)#iQ4nYhRNIAY#JV8X#WIHhMS-CMTuUV(k*2L>WadWHs~ zn&vPGd|vJp3F<(Fqs9sNPNA3GpRMP=Pydrpy%IsH$_o>@LZP7s?v2Fc%uHNiW#;W_ z5yOwncTc_B-^9bS@X}gyS^A^W6qZ=`y)DpEM!fu>zt~|`DE=iCI5!1ZO;s#P6r0ZF zSoZtSKd(erFIbHUb`I*I+cFfHkCDm&rtRxmigc2&INkIRQCczHUpgv3oq}WH)6H1U zmn8y1qZwL9CP~9TO#$^lzpK5_A`}A(&wG{XQ{b zx6{@xYai~*m1|+${MId3%WWoz{)N-WaA6f=VRd!HO}V==KdtBQ?-5fzhxaEp z=;sgplUKAKK>o$WMDm~iKF5%2x)KbDkWzNW%S+K2iIs^ILu$)5?iaQt1B;68>lxTr z!ZVu}BUqPSQv-ED|JF(vNP1~2JN>VRZT+3y@fZW+Q`ryerSQ*t6crWieq@AUL5<=j z07FF+i2ap18nM$RXvVLG6<7*s*cRT*=Ox?HpbD!1*+&W?7#>XzeC`x}++_~=cF|)& zm58#fik{(=t;))?pG4r)HTrX%+93*}@_jzPjdvm2+fbV<--(7GQB%`}_!WST(|}Ha z@2aW!xgg_+77Kr^SDxNUs&d9RZcR-R}t66_Vq8ofQ7*VetUPaw%_S)ikfrS>s860mJ34 zN?3Y8S|_8g=K)n*8Hpq*3+tfH{3Zp6p$4cW&I<;#`bF@&J!aXQn|{QkD=g6*%@SoU zn)9>F{oPkqSHSH#q{Vm;pO{Ed1^OyAKp0masFQzc()UfFeg8@LXdWqVuF964OaX5{ z3rNbu7twNhwNMOcLP;{2Qr)P;+|*m3bpL86TF`x%CajrGlTbeyhq>BOWm0d--X-q* z|Koyff~k3g{H?TRJXt_ccvmV|N!$rV9!6m>rjYA)h-?8)Ya9lq)n%|&pC+){i z`#Vqe`+NK^17njo!JRA|4tQ}EnQFxh^v><4$0S<9(7M=(UYfb|QV9V*@40x*z@qIQ zvdZ>%<57KOPCc)B7RB?dCZz8_USI6mlTYbsKpei-o9*nD)s(|xg@=iXkS8+gXF~q` zzuQfZ4GDm$IBnl~zF^Psqz@A`7az3@Qw~4%aV2+9K$m+R3o}K49rn!I#l$@i(HT$j z`9$1#O){Xg&PgKQUi6v-w=?ao@%V@7h%gKnTDnpO+?EpZTS0LG%`7h zk7(p5X52ObU34Q03u#X&#OWpb+k>-C>9ZOQ1Edssm!1!K^<`M)$@j_MrIv8mE8$$) zi;RpkW6oNAkPnkI-0e)ds5?IXj#5^0KQ=-u|8~9J40y*skI{eMsG3<^V7toVMqb!z zX=o5bRV1R5x1awm5dZmM27c#gEvm1`!R^UV0Rsdms1B;N^arHei}ImGHfiolp}w8& z`|fpA1Wy)2isWqxAAzhF`LAC_@S8o>)6;ufaA5oEadLm$j?-WHovKNy1{Y`3U1X-B zPL#yy=D@%}ixVl;^-oPrFNOVZKLWNC9w^07mWbw9nhcHm?jJ&*S;Kt|_>uwa*Bu&~ z3KeDD;speD@{)o3rk36-Z1-i`rcb3e5iTW&4feUwedW_4PkTegp>G2OtUZ=PL7~!_ z?pg=J%M0)&99gD>)F8#%|7rx_e-?-ZtN`vN?dtAC&`#N{!W*ottl6OR{-dHND(mR4 zP-1$)pyM?>_cgqsWoYyRX=!PxVKd;ket<&mzB0Q70-H3CNLIu8R?t2c0$7SSPV0J) z;H#-wyd>2s)vQX{Y5wkRR)%aGg|R$`(Lu*+2yJ zx{U2)BQV^=Py{7oBZ%?v7=71YJ5YgeN=HIUGj9ITPd4fw*zjR$V6+3svs*DAmD?{7 z=vok!Z1t#3fUy?#phGqMvx2_J7-|o686qi0eXBW0M{-!dx*=BjW(8A(wb4+pZV5tp% z1e(n}4KghioD$8?gx51Jh00wjnu{HP46kdYaUh=^5b~_DEo%2piO}!2<*$FRBmuUQ z=L*TV6bXJ@R`?rAp*VA26jr4VU5Bsb7hM0rY5jC5g(yLWGWBL_Cl4sopId9O zwzeLxy+BITaV9n7LLjN2z;gA;u_I{l3$srB+**B5UhN0UOsODy%rL=l>f=0W(TRA6 zueDZ|WPn^;B%b|;cApd&9=|J-psFeh1#fxQh|6;qA#-!nYZ7TI%}>R=1?t-rAM-Uqe#L8dJn z{lmksqBQvlP<~>1`%&%oN zfPcU61#pSEa^2-ZTe$dAdFl%HqONn)Pp);o%S?I)QtU08;uNeDHYI`brw!LXS=L)J zun-LFb8vJ}FB}VVs-FQ3bXn`ANFUt9hHKgHeQj&Q1Z{*g>dVRr~ zTy~Ztm~}-Q)B;X}S^Q9EuMZ%y1u78IA7>7Aod28-DkU6US3ODje!eJ{DR*?`O*4mgos;gx#cHY%JYSG{TWW+VdU=uV#rF0K ze}A&u1Fdr0D!Mx6W8=W!u6IlYhw~=H;oSby>ENe?_gDT(9R;`@j_R$^>6PAm$=dZV47-c%83;)5L zm<@SqQN86m_g;ey4A&vCI0hxkvsN%L&muS`h8T2*G0b$Cjx-zxB@tY@hXLmrf=Jyb zWWpDKLT2dw>1My^wI>b)IKKUP!O9A=5pK%<;$rPSp?K`z7q6d{0-HNUi6hJA)WTh9 z`264B-7_9lK*`Sgb)Zxjfa=EJkdUiizI+*?tpHu*f}I&P$`FHlUD}`%*5YvGf&U)l zGC;`}yB$QIm9e~^<4Gce%Fm1zpny$8^!o#62B-lZr|c5DZF-=g_Auhw3ekdNC}C_a$$1O{d;k@RRbb0f_c zMD-*y-x6PcC`6A9#Ze<6dA{;*u#AOfAJ!!*)sA(g>xA|7?fZGmWG5fu527PDDW`E5 zA?)}_ceH3Y@y?IoC9wEMlQQ9P?)z|eLeM3CiDkRq4`ijQ ztv<1T|1BV|X@O$DlH(U~0rV59umj=Xml0bhKp$GjBmhNI~C` zy_ga1!sC~8*)O0$An`Q{ zm=g!BJbzT2qpp|_y!pUQ(t^^Xu~O#+6_>LK!ioz836R*UD=f%l0M=+FJNE8@w?J4p zL(oNCs(>b!9=0F1>7N)A%vni*6ItAwo>;PQcaeC@#XLVh@7xUIP}uQw|%+)S^EopyuzpQi=>Ses=PFeE%9ajTeLM}S=$2aQ_YU3bbyh-C^v zB`LMsT}jZbh^Xy&m_x&5sj!ls`BZ4vp&6f?ObhP>+h01Q$l-;+K z+Xn`C;>Dm~<}2tVd~RT{GX+fIEg64F=d7v|cxsm`@3^?QEN1r;jvS7KSRLLM9oQL+ zJ8xqXt?Mjnb`1x(GY^HD>8frwiPOGwtzZ8{s@4`K*Yl(0`R~*G_|d<9;xir_Qc&2v z^Wgc6&vZ-7bx{B80AzfGpzSQ_;pl}4bgkWcY;0^8MkoCo$<9rmK5-2qtbVjjuZoZY zzBK3_9Ea;H#M(h<+*>2_7c`yI4Qm>mokcTM92{gPj@NF1;Hy@eRh%U;b26Wr&ds5j zuIzSxf%U%X|FQPfaam?t+arRa($XoSqM)FJbSNq+Al)fQcS(a7AU#S-3(_Usp%T*F zDc#-qt=sdS5$2uud^1P?nIFHA=i$EhUVHDguXU~Ks^mQSh5u-yG8sUEWPxg_seeee zbi3Ue_|C&?ovI5~%?Xlmm9#+k-dy23SaDq7>2(1Y533ID-l9Lhfe{L8)p}LVG?BUx z)242fvq3t&&+PfGn~t%ryL037UTmxcq||$QKluCIs8cYzI_@6;V2_bVNnllZ(UrK4t$#G|fBxlAWeB8h!#WF^kM>*34d>!FiY9nW z!SB&XCe-0FBO{~Zc56f>Yo`*3hlR55eCDmQpA?{S9|YZlg)a|dw&o4p0y5&0Ar!qfbMx zljnlNg54fxr~1)Qz8;DqNQwoauksp@pbGEfWYH{=|< zJ-^JDavu(Z^11}gH$WP&kGT00tNIwqcSr)`z8XmUr$7GUf}Cs=*xZZ!QegR(-vI7Z zRV6%3yl{ZQL{*j#X3w6^#Gi5uge>y9mtueOB zvwRumU%jt9bm1>9`Zi3R)H!$?fwc^`-#`}L(o%3aM4nJH#@G!lrWPSG?OoI6`Rn-q zdyA#%1sv6sCcM+Jskiw=cDX(+ZnYE6Ig5 z=vWQ4Y|Jwq8HVXm?StLTA0bOh5)vFjA1TTL)KsBP{T;ADAammndf#5_wawzY-%9UTf|HC`{5 z6H~y|tlmJv@6FfA7=hNKdCQSl`2YS>nHW-TIDMD(#5FMq2|Q3u>5ZJZ{HMVdTu`{f0O0{j&!R6^$PWfkf4>;cH5k`S$bpx4{VaQQf;ZJ|) zN3QdyvH4eg0;~eVSoHq%p9Z|_GLDYEzE5DzJY>rBBW@G974v(7BgYp+PF$<-Xzm$! z`_xKIO6e>YoySgpXWkO+`Rv^5Tj6}#N>_u#amS#hq(A-q>({T7Ry<@!SHeGhzHes% zX|~Uy`}4(bLN4;#9dquD;Z$o*=%F&3TW9ZSVx85k7dol&Eh<%GB5zExD(-gZHX%wo zE}En_LP7CstGsBg4F^)HNQrE@->JKC(?540eTabBUA4QU+u00vjp+3B8*|$aFW|xL ziwFQ%-Odn~Bkw-ccWRc)f+bvDmOwPMn^OPgJXDvxGjC=ZFN{7xZ)?bGh^Tc-Xq`UK zoO!L=X`ICHLuDKhr5of3H|(?|l{mG<{rvlgSA+s8x(_#qB?c6$ch z_V$R~>WFfX$&QfW2^$cmLrnyk8@O%^j36b`O4c=-#{1Xm>SiY~YT?ho}emWo;~(-+eb`lY^3*t+<|Cb{QNE9f7^c^UdCSccR(Ri4RZ z`f0_4PMfYy#0M)*`;D7}2RlwxB5eIE8JO%|N50fWDKcl?ntSE5x3ymVy7hcTSaVw+ z?*;AI-jHyp_I7KG#M0Lw1d8!MB}$w85!X>ig8#mH22=AtbbvEq7%#w1Sh{HJnz0+- zmM@qm9%+PgPl{Wn)DcH~W$ zH$PvdSAlaJ!O|}xBsZh&2Ra--5edZ7C?yZr6Db3=PM!x>Udn?Jr(GX*`%M$>)g}QX z^O2~eJd#5ZA9L==S?^lZv*a$$p>y2Z4d}94-252P)BFY(p@6i!b9LEEW~2Q5^_{vP zPQIFlbWz+SyDSdFb!-f#B0qCHgjPX$zhT66rla6Aawl?w_Q~?VlGT{hv#Ce7j?mNO zhC1#ODt#oPDFc*^MPdYi$_N%kIJBeD5Tf`4ZIyvR;Vp)P0wuCP0Ie_$y0{~MPbWni;Dk33+o7AMFeD&B3$GURd zsb1%FRjeU}AaP!`7 zGu58>6V0u(GLy~32S4Jy>=nI9d=B;Z^DMcWFiEwC;#vwt^a&aQlx!Vc>C){m1SOpi8$VPS~(tmvYU;YkJ&NI}|Gdo)wQ}csc4OYP5vxU5S(8I@J zRztf#7my{SM%c(X80e-^Po7DbA8G_78A{@R@j=V{C` zs>;iY5^k`TvS^81^U1#}q+atf>T!NLv?Np@2Z?H04_=x~J4P)Djrgkdm!SAGDzdFD zT3hbaC(C`8=~G@qH9?%)=8=#7pJ3R-Pw;mJ$QBO=rQO#%p8*A%Z`Z_rl!j`G>FIVw znAHFY*lTWpK*q-Ew_^_blY)Gt#&*5E>I(zK+Ox(00$@09f5Ud_(li3O$|e6a`-_Nm zDX3-$2IkK*NX)$Cx1`nZ{4p8M7KA!_x}Mo-Jd`u^5%B5hlFl3*Lp`?<5kU=LO%xTa zu!Yq=Jk8)eMpSw+pe~;`gX!eHD9W`ILqJ=nt@YN{t<~2$*yUOsUK?)|$81dQPd|tB zC!9eBonbH2_P&-4s4c%1Yq_ORt)aX4OW6a>ip&& zzzI4Avs*%6MY)&mXv;~Rf5R^Q<>cW51p!Lz?2-r8oixVieahX&U5;hTD`)aJhTHS` zC!4Oc40=>#rcGIem2Q5LT$&t7D|f5Raa<7D049;(^1@q=FvCYcq$Sz3umPk6p?;r! zzI^^4uI?WbhC~uZc3IPfbqq_{>6oxGkjPrfbfJs6?N1>nKEW-~VIs`r-Xm%@5f5q%qh= z7kX8)^`v(}sG4`?4M;T#Tvf_jOkR2ME)PUG8%UvCV!PcESc1>D`3bv$eVFfH;(#2n zlK9TYAu*4`Ht5NEfZjHB4&);Ln3CV0qNL6B8xQR1X$cxR~$ z1-m=@UC$*qrjGy9&EUv}p>_eNuU;>gFY}EU?~fbX7QF~DDoCi>`XsCZCbKsjwpZCr zhWK+e9hZG0%8z4n_@;ndjoIoR~E19xnMJhFpvcr4E;qpEbTDedl=54x{==RDS* z($O%qKj?HKTqs`miAvwHW?I!XH0UkleZ<0MVK+YdJo=4qD_(tlcYj0Ngqommuss5X zpCzi-b)#n}zEC4%6j z{;;BNu^nfZ7A2h=mous&-0s|Y%p^Ru7WRrKm8;X2gan`|TV;Izw(_XuBDtH!NX7Mz z?x%`3_xoS|7o_qZ(Zzob6J{{xlfeB6LO7(Op=+p$(A><-Hk4}iiII_!&qbLOg?2-9 zscE5NdYR-&3^z_iXE^=B3~SgA4`#zVe&sLkH>XM+nR|MA8|1NhD3ttS;i`~dU;Qkx z=kgUEJbFU^%9zYS{4s#k)Cc^+jjG5Gf5VZu`v08N-;<1?P`KD|YOh9*3_!iQ2;6K_;X5JHXLWOKGVE; z(@Z%xM^em4IZkkPne05lLsspl=1SRl+=O*`ypRFoL7CoV>&e*9U?Rs*ULe$L>n|$> zx0RKZ(>8J)_qJ6!F)Tho-VmL84)HS23}^&G(TtMbd1mI!DjVO!c^I*-Gf`5wci2fb zpVjm|PR8$uG9TW_b%Is>L9wfkF7#;Bv~2@~DO78&vGu$*DtVOWhDOaUAZ`MtFG&x7 z)u9;pJEIMaLAVHBqTUw05(%|k6*8yQX2B5<5qF1{B$du}t4}mO+;~8d;4uRJ3fIgo zz!r_1fx*OJ7U8pU>F}sse~V%`XSg(48R0ql<~VOse4bYM?Z}KwMGf1>hT!||M&RKA z8#G+vEo_8I*$m6}dXK`n|MWfXpG2B<7V%s1TtTC1wMW^(!2y9&O~Hq7qacUxj5j$4 zUIeh)?D|_IeSi=)ZR!NFJOBH!(`IcMyjjS}t7%T!%ZQsA7G;&tucP>aai^VLnYCWs|=K@_#3c0~c`{<8`gaJV&0&A}$w}b#B+I9Ot zh2246ur~_0^HoSXR%o>_Z>SrZQUx<4SL*2Cvr}7JYZuCPD6t0W|*CAU(=USur0+$;A4%so@}1vZJz|G zO)0Ci_9WS=Z(m#sLpWJ;ET&o{BODG#a#=n;d$ax3#C2otz-y~Jvr=lUYPTa1Dm_pO zFEeo9)e?z#Xs$5$WVEGjN7Y<-_NcfNP5Dm5)0Q2Rl103Ky1vyZ5Nmz1AdXILiqzvftD#vWk&9*OuAH+q*TtG0tnSiClSLT5D@S_8v+9 zqd|AQc9V>8ztw45)R^4tFofg$m?%Dl54SOO5VureO^^GaWnM>bev;dK%OCTLSx>M{d^H&9B}R;iC5wbu<$xRUfo z5?FgjmZU>8R@Qs&YjuV%)zr^%d|mg%)MYYZLZ##SPHxp0uXD-U&frKNjmj-A8m?70 zXvkxkk|H}C+y8hSIV0{ss+aaF*-CH` z+XB0}rP96J141zBDNfBivYLr|fF^lCxg znxWlrPb~w`R|g0)W3Rog^Q^NlkQV*9=g&f(|EbRf9l8}iSxzbCHIeC#$*7=z!oT{7W>7( zkbbw7Z%^`j#d&*s`x{q$U`^?UoqSLb$-lJ#N{I$HE7=N^*AV%E)ROIK>w}#^Dj+OP zKCOR&%}Z=!W7FS2a+tiv?}WNe$xSMnmgHF#(TmSmVtc)(XD738rY9hvF7>$XQ~wa2 zoBwJ!8T;O9H0CXuiIx4&+sY5WJFx@ORyfGa6-S|JGv>bp-EC2dj0!-%b`J7s56$f?z#N<45}A9KM$|>itiSRJZpI zcQ%%ia;>whTI#3Gc(ZvvNKKfr?7z=I9Kyp8pk(2IG-l6EouOg=!bjuzp)~U!XUB&k z+<8T{ESIhEK%0BiD$japxa;F%0sC1EDi0*$1XM~3pTpL@6zH@SxYS@(l36&#TnXyd zBl|lGKFH?G3%AwE_X4h*e0%_MYbUcFK0BPlp(3KFQD51=w3%3s^vkkK4ED4xA8VvY zX(U@5_|{{+6T(ZWc`*=M*2ePZjv}HWIEv2f%c`mUhMYUVGZZuh9sP2ugC(jeFvUJ0 z6CbF2_LA^x*ul;&fpGLqiHMyY+bXUBT*i`3b%cE7b+A8pUal7xOTKi{m+-rswD8)_&!}- zpfhO|VXK&KPa1ir9qv}TomA?MP!7dqWjigKe@jg*TCUkFJIRaVw0P19_|h+S&Rc-6 zyZw@fDmgz<&Y{D~g!+_Ji6eWsTrm&S7sk63q=~+OUW#;XiIEaZ_*d^68j1yvLZg#N zBx|5?Vt|9QKmE{~ANr!h^ZFldX3l2>fGhKADb#h3Aq@zG_lrY9LPo$w+Q*+>wh~m} zi9>E0x`U##AY|FOJTTaZy%Rt^d((Ufk=lUQt+(o@Ge9+b~_##Iz$Odp%4>M;}=S7OUTlm3~e7Z?F{3+AQ?ZrS`?LXVD-1}pO0nsnHab#PA(=_tKflEdd%tS7{sZmT~#V5R<&i@47; zS59&zL){_#Vc1KxB^+OZ=q z;;Hbj9#2IAq-(N1jgRW&%*Gu7NxCwft@z?NsS)5fXw*QW7iMaGXJMr0RW5W**4|qc zcKx}DVR!i> zN~HU*OQD$I=7I`ERfzY~ZYCppN#U=?ucOyS3Os!RJVN`%p}o%QltxumUSH0W`1Uur zx(*QE!}UV*DzHOy5f%fc9gBH3&EtjJS?u7n336&ZV9bA|SA3 zR~25ROQn#e78{`@_kk8cIVE0()ItN(tKloLK}_%2gB2c?`#7zv#rKq^SM5;Dj<#@g zRu!gh>RiBc`xKDzlW`JBg@R|f?X0Q0f#Qlq;co0OqqqdzU_K}qweLmU^5g{xo2Bmw z5;WmikG>I%ii}jlX*am|W9L!JLZ9Wz$b^oa+VJf3I4!%<9)_#D zkhlMq3zZ)eE&eBgB1dG8q7525pl$+C6!H8To>4up1t#upRXL91c28#sA$azwB3(Vc zqP!%zeP%*PYr&1jpI=gQ-@S5Z6}Q}qj7i~mO4&uOnABM@-YHwvT-kFhmov&nccR$4BA&C?dM+mo$7QoP_`ybH8ma|kE~^TSOmka|{;`joF1Ggbuf z=xg*2tU-N-)LnKu?aZr+>~wv>Z(@FACSFBFl~1TxO&`cV>NDAjlH;DRiZW?QKGv>? zC^9MLT?ilC7ERZy<*NYR>G4G78^(H|Ork7qx)frE8A;l7?B>sHFX*fCki>awa01yTE<<5aH^n#p9&eCcj&X*pZTlGi4fUG>iu-TFSaiwBuj zw2R`MR{K?h@A@_z&NlGp-G25g^gLx-w0irrf}IgjPln-vesjlD#q6Apj*jj4?A*eX z!l!^NV!wZO@^YXO-Z`^ZCTL$zjh6Vl+iEoKQG1sEw(f%O=-Q0NQ^%D($C6HS6ZE7N zaM(aJFAoUwy2OmI9XMrFA~T8H?Q`U5>~+DHAlco+SmK^m@p2rkj;TtC=VEO9N4d06 zW)^l~q1gTuG>+V)_*rEN&O(8U6WB?`neAh{Z{FV^jR;8UyWzn~eXWsjAAd3<{L!DK z3I9@O6Q1Kg{+_clqPwaV>RL|+_(@X zQz=d2a9FQ)=p>4kP+)`d)tr-j6xaHr3&zhJ&qx~e<9sJfD1Y@*M5;E^JO5CHwDjoS zES+_oF;Rus+0}Qau`rgp3qIYV&f$-{yY5<{-u%i%LOGtXeJj|^GN_k8ZAbOKSWt^^ zh9ACfLiwZx8Y=3h0iUqv|CZ1^_n&1||LR|x*gzS3d;I2kK@jUcXnc7&WY*SdYN4c%CcyX5~13;rFl_gd7Cp_ScmDIA1Rx{gXVWN zuTx76?%UgQe-jOKTSWQK)AaZLb&(OV4w%_noau*NiiqZ|l?jKEDuhY^WtP?&e|qKA zH7D?&-G(byG?;h?T3Q@djV~)hZ@U@RnVwO+J{0$M#Z@y~;iT4MqNER+ZPwQiZ)ACr zcb35%tTIAiwbnC<=I*xWSmx(3()8a+l>d3|uGfLCD5;ile42NrRz0a<*T+w?t*WdG zK?puWgv`Fe%x%#tgyN^uX~WfI`wcg*Ay|5d#F7ys`W z4a@_1Leo?KGB|2%q(jok!X3AD_uv9AZEl{~8Up|`9}dcxlZiF#x_#}UhB zMJe5!a6;b8;9{>Ple`~C`omj}o3wQw7avtruqCA4+@=b#Tyc8wqR{WHOwz2W*0r#Z z=%OX6rpeK2CbXbF&ktmOFJOMe^k4tvJaA_7o2-SM6Z*YPnaGbH#h~+fNe9=r`UBF3 zUjWP$R>K|^FsGV0l14t$DP6C5V}OcxU}D!tUSO4&KHxU82*X87}p@DyzgqYaxz=;pVqNk%X>k&L6!JbDHpJ>$9 zYv(=V73OOBF;7M-bLWPvjFfa`tzNCiY4p4d?M2n9%ACt|J=R%zeL+|dd@1tXLae;3 zvX^{y7umjI9oj4ZI~jmH1|glxvzLBniUH@EOUxd21zL>J|!l2%cx6eluj&< zqaHggcJH$_&(hEx5egpF>KpF{Lc{Za&BNnN2ExMOR6k4F9YoFo5((5hBOf| z(<`kkSC~~L?#6 z!BsIuCUEcYxpw_6>cbknweeIircJtEB@}ERh)w))gpKuHe8LrBadE@r8CnW`#3D6y zvX92eH7^Mh|IR9p`r~DP>L=sVu+NLsKV}c4JthQ}k-S{_knVtTiBba4isq87boA^F>@_{`)cghs3lrD-LdI z>Q9<{VTb+_26BIYTAW#0PPi)-&vP=Vtw(Dughoc@g3fayfyk)O!z|Q>F4s17y)VT! zN|=bf9t`#OIYMNLURq5Buk0 zSUNAC!R*N@yl*_u9YsS+`XqgG!Yz@pGCrE-J~g$n!+@HEaO}0!E2hR^04v2Sw;e%7wn{>FD=t+GMDR=?5IUz+paU0-PfVdRSc5-IEU^Qzv{ zENDHS!!PxMxQl0ST;A6UK2;Lm?JIs}r2qBVGBL#d^Kz-(@sAsjF@s5>$j-)sRe^${ zVhAtDCe0W-?Su$l@@*{`upoXfiBOoI)XmKYvl)jM;%y(=tBCPHj83`7VKMop0b`r@ zaZNpg3=B61)%@z3Wz<}1F$70TOS*1s zdVrm$_6a4BTCDQ4erIYsB*=*Q^=(+DdkR@GF?Zp-&waWuF!waPtxd}AzJKrS(}(uW zMM>x2ZNlR0IJvY1_V~|H3YL~zQ}^uBQ&yx+GvrVVsM9-Y??IkIS4Ft#=ld@0#uZ7; z(~FN@&uV)7nsgs@>!~jQo{VFgc|t!4$en49`zsNJ%7}bLQ;EaQYMZ3!RDhz9sp3Ms z8ra}$_f7D|fm@9~=rgdSKFoFBkaQxHlmQQ({LEvrMk z3a$&}tgSA8Y-rD(4W5!qk&*83pOlk##tI*ZJJ>RJPnMx+ve-lgXQXU8bEy~O!lG@M z=Ddkl$9BB2~XX7!`yH(&L-%XNDTel93dlc5f z+{itW;|TZcn)-T6#ff;erQwQTF00gNhMvQLcE-6O70X|VUHyz>Pf}EiRePI^m9DB= z_2U9--uaf9_cj{Rx7A-%dh2#(Z)reE=~fvtPS=Fb&f@bW_0XF8^P~NLpn)Uj9KD)mE6CjSw1psHSd&6o14$bU`jaBUPNSh#G$8UVFx(NWHY6F05ZU zkS8vt69bgY%OV+KVmoxrp8jqW#CF&Gl7rlp^k@byGR8x1{4-C=U&aiN%56@%Bn_dn zJ;vyiz%`%uh4CscuwL~M!-{mGn}=*#%C@#8DCm3ju^lgrJrbMf)Bf#%*ryw+RA?x+r(rn%g@fre-x-ne=uq zq1HUfxO^q%k#D6?rz*3!IjxgeG+yS8!@Wzt=y88LHUSw8NACM7XMPitk194}1P+^D zs5-XSW+qs*QsCWce&p}Bz5^xovg6+?X>Yo;v*~PCthQl0U*!8xqGu;p@T6+ zxlm?se?4~%DiB}Q(>MD*qdVGyLpiI>@{@?PibEL+jcl30jijvYJonnu7Fia#dcleI z?zlvQAH)=KB<<{8i*`Lc7HYB2(c*t5ed@tl@b!-5nQ`yV&$_iNoHiMr z5!t`$`o^zjf+Pt_2h^q8pI@B;?^}kwB_F3YAE#Z7xxV~XeQkvtgLmQ9!>8&)*c$cC znGT!jxbPdc+EAo*83XvMYW_f^|rMw`}tPl3G z#iT-quc6+kR|%8cA2!h6%xOmAzldMjQk&c#>!G5VUa|0{%&P%qjww{Q&9QajUctU0 z@njPX&27!*_=M!mX&fZK2Bw8nuLNA}YH~|xjMm}4RAY^BDWpY2ys)Tv5zpr^UIztC zOiUmyN$Be*p&UO>fQ$P+D2UjLl1BjuSI99cAt@;sN^7m2qM+K4Zps09z!^0EV4xh`-UY&@Ya?#IVb()^5 zxlha&zw`L;i~r2djk8z`X-Ns27TTea#F>mK>vrc%87xsXp1aaf&1{&VQ1~LEAi|F0 z8(%FwJ>D*~bh2N+qahi4g_c>>cOf(>{$pMVg{SyGkF>x3#PtivhWysJx6Fu>gG1iS z%j+~2Rt_JuyD;=y<&rca4Nev-^=#O)<8qik7LXizP(2|5C4K8)?p!O^PQ%n~ub8U* zZKB;ajq}pdmAxo2AAVa9={C3ZnUDT5>>m~$_aJ8|4vJJv&m;4GY8Ve4nQBh}24ZCz zwDJ*!_9R{9BI{d_KGZsL9!kfb-l9P@kne@=CG%7v{NIq2j!BT! zB`v}~Uha?J`MaZa^!JE!5Cp9^w}b1Xhlj_C_4%asY13=3e!R{XSp?J3h1U#4Y$KnD9WCHbp_xg#^v(|!Q~<2l|ae+-24H(D{p!6>$;6m4W&*fT@v_W%XhT%(5N z?nj|KFvCMiEgJpy(iK__@7O2G(J$lMR2o11_Q%J6avb?16=6#^Iu%ZF`p(_Ex^Q#r zDRC(LvLEgRa#&5H)xLH&O@012)2@4>J@S{CS?E2Twbk!S$I?$fm=%Z6u&AC9s}|k}?$<{U?I=Kx3i=V(hkeJX{= zq76Qzk%=|dnqK*8jHkq&G2vb6`VNaGjdqDQxVF6bxfwq+kk${LWl=KT!kWK-CnpOdPAF?l0@xfW?~j9Rz4tTR*wwEzJ{VtJAeOtp8>XOulV_!l zdAnNFF=~6;Hqegs_Ip6Lo;dApI347wtE;bq?8_~rslfmD--pa*)vOd!WQw85*>>2C z)?)N4ZeVY^+*MGfPTqU`@-9h4HMBYxmVgqQE{MZ2RJFJvafBbA$d#1Fw^MN1C+n;F z9G2s>8^zXq_+~svyzWKziZ5(nf2Q*JgT)a8X;U){q zBs95(JFbKbP9oK23ah^6I2(JO_^d3cgYC2fy#tK-T7`R;+klmwWz^5fvpMv_cDb6C zfSkN9Fo76m`ioLzZC?U2k@qwJf$lW}(gH%+3@pM~X#6o4A-=A!a_RM@JjTajCdbjv zfGGtt0sdDiHAA_m}^v<*Um<{@C9 zapAVpR}d?@y|9L)5p3krH#no;B;8maRIi2Lbv?Pp5&O_4PRGevpv|@cF!*h>;9fF| zsHi5`RXW#;R4J6_0v`a`BYw573Cr2t%}`?9+9P1pr&Yw9=rYaYT}|y#v|kkx=dK4K zr)2Ctj{FX^;5EafNl8jZ_w*?2Wa1V)L3L*@@O(-mTw#q-Zn?i}0~7=7a*1T;ve$Tv zo$&qroIM@`PYIxlZ{VE*)CT*sAjWpG$;d zbBza65uB%(_|_G8)|1DaFr0zF7g)0RigD9(0~F__R8+!2&uTi8mHS;@cD6J!`<=%l z3pyR)tXV@4X6R5*vO3!cbL@#$N0`~5%&mp(NJnQUn=K+u8F7=nEBV8ytup(adzR0|`f?C?jQd+1AQBnfqe#RTCd@oWGq{V|s^ANO)7p`wPdfSOLzs)R1_*whrOA z0AbW*cg*RG^Yr9o6DW>8u(Y&X+rLif{Za1fc3T{83|#PExw_i1*X(He@T>$Um0eJU_@^iHi+{E+u`bYG-e z=a=Pa08+qZVU+FJ7;^n^!oPg&TDB`~eorrGu%&_7Z?CjlmBY$Jqm_JbcegIKmYMu& z<0G1`$!gjNDQj!U8XynY2E`uTh$?g!GfSSG*-Tq#ro)H&cw!x`LnHT!Cpg&jH5dl% zfhec+)2Hjw*-qfIkYzUZ5V9%N_Rh@rz)f<3QbaZgsfO40(L5`|!M%Pvye3gJKCg%$ z0TCx=8n>QI;Nnuiz9u8!@IiFkid9!kEU=|DghgK=)JTP0GBYzxTxW4CVj;D%^_Sri z=$nPGwD#qpA9FeFm$&zZ zPV$LCiU_7trmn30UchbG2Fi9C0lka`8|GK&cg<9>U%H#CUmtWs09szNA}Yj}pdC8_ zn)>ydqwWIZ&s@qZVGjB%uTYBxvgF+=J3#B<>a|Fm<{{|49pT;mIYOa_?BtVH-6_`N zgeU}*&9R`qopp{;Aopr$+$r7GCrs~7jD}u)4src+OXS~QfRP0war|0?Eqq@Y4$k5Q zR62Tk{jx{PAmGwGrvPiE)2d{JBG=M~cRSuwEMaeFqcC|77@F6IsKp=|5R;Vjt#u#? zYvem9YW*#K{u3;o4iErPce2RkupP+K^<_{givRH8f?$6Z^|NPG=xuDOZAFP;yS`-#IWqR z1NXySr7~Zfc$gdPuUCM1_2eZ@ej#XQ9YZ^DqJZAF`bi5kWvxN-*P=NG1#Bm@$i*i??zI3%G2Lu*6iJ${dVOVln5Th+3-%f*EBU@X=-Lq2a{YWu>3|Y z(oiEA>Bi@+^r>9vS4}eUV-))t+n2@P$cbPC7Viwrf~{UH9v&XDd1X{8h5=#Hl&c*s zFsw#>?75cxXzA(cQ0#4YK7O(wgq)X5A6p7t%F;6XQ5ff#2^9}nAPD2H#Pse|cn}q0 zIxm-Z&E3X7_l!Rcm9wy}M_eu$fm$2wh}xTUgyb`n&(cP;`swZy{MPR9vE(KH-Kwn1 zcAp0Ecesdpx&ui;djG1<2BExTI1lAIPEtV{lD0+7sSVZjm}31 zF5=2o%~`sw1;$~#&_I!tlq7WP)~?(Tz8e;5(YdCkwpdKA^{M(WL5ujj2NV>sP7T{W zzg-@=D{b@q8fuLtP6Sa-OWc*W64Us^Isx6}lNR<+8+uUf6&!vxRr5@&p6UGUyRRxZ zQ{v#_^@}2%cnakMkLRlDG0M<`N-|@=rJ>uPa6vP~fU37T*?io^bh!LfuD+(Wc410> zMwu^jD7VdW&0}7>xoQqc2?_kOXIqnaJP~e^gP@>mLpVFa;e4dd;-n3T=6n85Yh^zs%d>G}2+yr|c&y=%F4hosdarK@w!#n-r(*FHUq-xpI? zr&a$hJYJEgL1uol8yNQa$u2~{XyZn1 z{ro1OPKH!t%l;y=#SDy$rHS>jn4r07 z6gOYX;rsS&UxPkAvGzsO8X7ux9Fr@+3(ns7dL^a5K(=r&sjXsqzKK>tuF-CyRVsj1 z$3ADLk#~Z5GnMvQG)P|(;C9EFYH(#+cj)vNsDzvmG zZy}Jc$+aO+P6PWgCDGvzR$yjQ)7aQ)Rl1{$T*DtFG8o^y9b|6`%`4U{DY&;;aiFNe z8CN|jf6j}RT8h$H{AJm7LU!xn&<=y{Xo{U$DRIs0u&f>P@=T`&?wE81kpeNG!Hc(t zV=d_^11k~08P;mTT|a-82eH2`gf}LIXfpk(9gSb@HB^pS%Ln;$5J=Tp5y@8elXY)~ zthJ{(N^3WSHUlwUGXIPQSfNZen(a~Qr^{{gxYC~W^fR|rzlDjif4-?(w`2UdfI{is zv(s3ei3thX#1dA&`58Y|ln=kIMlBZmjNj`~gE(+(MH)hrHskRJ$=&6DRlx%0BNY!+ zM(FTJQ}H)J-!%&A@~Yk3lWAT!lK`wQ;g6>1x2$GGc&n16Lz|N<+oKOrAZi* zq;n#R_8Z4^Ts{un*NmIcmAF?jcSG?pHLiD!jsOxMKLH3F`s`vlq|Ll1D6v|N`wOJO zokP^6T>SR!+eoo%21@*`paMZZ!b$`H-=m?veDh+mCZB2GVfHE%SS0K<`{)vLVxV;- z%SS_3;4zMOxKH!!HjE;YEkXMh*=ac5NH)9K0W7+Bz zr%OWToGC7%Tbv2yvCG-%h@N=)67}MxOFaU3aM6zhDBq`&{3^O^VEw-;gqHvj)cR;pv$9_OE?$MUL*C)9$_4Mhl-E?zTIOC$@ z6r~57Rs@HWzNnqh@Tm+2(*GWY^Jy`AxdI6q+RN9>qM~l&%?VB=2R@t@n%&r?LcRBv z-`YBFPn&Q=o5<>MdS&Cq=H_Om&o=-OWViwH8dqp(t0;f7On5_*W z4gdM5ZDMl{AGBWR#qhjcCW_{HE(G!U`F@opQneQcV^_xl$(&cuoK6XcaZ1C50gh3N zf^@gDw9`VW+i#&9keZyFEM$2vz7=xjRWQqx0V7$)wvLXc360(L&|S2;T+29voVA&F zJ+PfPJD$nP4q2OD!Xvg-sqA?XAwdJJ zU&I4ROZYU``%pbU#?6;Kgp)nDGhRG;_E=!$;B7rr5=i7!PZLegDYan(aE}hhxVBSP zZf;UhkrH@JyvQVxx%(?tJ30F1Lhb;CLBJ){?emKripE#?_*CI$-?lo{y2FH~fkP~g zl2)|rUZ<70zA%`UqL`}(HzK4a2CH`#3&BGXM{Rl*mQ^|;8uG`m#^y1b~jol;66B|rc78FfdyyNwdUN}Qe_#iZs>$k)~p-qeDY&BfKEZC!q~52uJ9i`*NMR8k5BjRZ|5OV^S$)z3kV zo7`--WAa>FpDj6(@+HYgNhzJ?q@bi7!T+oA5rDR#=r8MmvT_WDY?m+JuY7Ul$LZtz z=A7E@MgOp{ut0CUT`ngMIC`N}4n7=rtwlApJTlH=3vvxM&43(ZEBV9OKxca;d zZl#vcM6VBpi*?4=c8?#%@~f4@uKDJ!kx}YibBF-hEnXk)<=2SQU_QCl?$Aeo-}^y< zmtkg-oCD?F^`JC{kxui*T6B7vb7FdihV38`Vb`Jq6tuwN^(rEYFlAfw&SS+nBRmxB z5^3WBWll!s*1JK9nrEDYgCt`k%hbGt-lanQc3sfII+U-3V)m3XHK?`?Ku50!@L-(U zNjwMHl=gU=+hpR_8*-8=F`o!TUb%1j=0RWpi(-M4HdmvXs`z75q@zHO%#h+5Q{9d^ z$%2RJ{7{+LE7v>IHG_lw^@eTQPo8v9&Ff7G{?*DZ>qUW!tCXSPT#T+)>D8*u16k8U z?Kju)pbbjTV!ACsNRSmJ(&e*LoQ3X}zW*5P0(XWWHudz$`WE=GaOq0B#uqoA7O9G5 zxV=ATGTU{7XR++~HX;CppHoYTOH7D@==Q92Y4}uc_8KTF@2tDx=E`E;(|Y8G<(W-!s99X7ZpYPZkY z;mJ}Uw3mxQUv_c0>oHh9N|BjKU)DJ?WEaA^W6u)2%K4am&S5W{C=A=AWu?;jpw;eT z$r(nAj9JZkAf#;XrtlTD$qY)QL?qh{wak1f*b2Ox#LB9_FI`I%V9@U3z^K$d(0O8H zB0j!}y+Saq<|DB`yZA_}8Nu0sy&VBnZ7uRchs<9E`rrD0Lw0aqvp23g?RaSsY|Qw! zhlxt#~zUE!E*mD7Vw8iH?VM@dOd^@%!+)VKZp zmd#XwLp8TZT~P8Ttk-1WrNQj&_X$7BT|+paCmRhq-p1|vx{0=ooCT&B0W*nYuiU>F}O8RIM(RDCqT581}#8$rru zP(UVMZ2@LM3C`ErT5~cKX&ISulPTaByUy{D_4e4FU|*dzbp(mbZ$+7cjr^}@wx&bp z=H_@FjqE;+Y?!h?xq8kdo8M9+1|j$-C-ZAFlKbNhn2q`W-gTLVJ0X*1vnG{#iIXW& zn&Yf+fziBmf6k)9#d~qk;FI|P)g_q!uA$oieRr&Y_CWecG|cS1%)ydVelk>nOI>E0 z1P*E)w5-?k9S1L|g=anHh?X^2*A@sHo46Rjo8Y3}Pdtsu3h-jr6^; zj~_qY-(eG7CRf`(mSj#kW4lb5-%GWJzA9UWu6H$sp(EYt1fQ~j2{~4$c^F`mtSK@v z+O7|+-%3Ata4q=8MG1h~Q}tH}=~%iPx|AE4&EfJDeyWXCwe1sg@7y=h|Hs}{22`1C zZABPSMgb8}(gH+Elx~$!Y3Wu)Enr(qZG}5>!VjP1Rz=(NKP6s@H*u&^ zqumC7f`xkV0xvTaVkhpn8=|5z($X)B3uelo9zLVu6&(0fV)@-EDxEJ^b==KzayK5- zLU?M`)VXq`|@~ zi_L*-e>O&i$bB-2brpa{y3bVK*${Vj)UxAoaIGrGgyTNKvo}MMv|~wB43-VIO5-MJ zwd?#SdV)Dji;UDHBs{xFObD$XNThbqpUu-RdUA+@X>zyDyGFmH=v`ER4Lup3wxGtZ zv>KH;y$3V1mD$gK4h{FVRf~4Dk{ekp$XX*3uiG68(Cc7zLnhD`S(TuqOfMyGb_bJLt zkolg4-bMA`tbzh{Nxt>)CR5;aP)^tPY1%2U#KgiHM<`+V7G8RByhC(1Y~h$gGq$p7 zqqw+u8PYL2c6J4*?)oh++uuo6)UGy#b|=FAfWoJ%@95Z_$C_SmgObE>m0xcBq|*H_ zA5lR}4tts`7Pp<4zalZW{WA z7Lhrx_%o8s6@N{^0QOB~ZMv96XTHu|q3ntxLU`C~Bo@gagP4(YyT zmJ9^lH`3E<3uO(Se0uWa$=c9;1P2C{SDhzKAT0zJVC>Zp_Z+0s1-zQJYyHmTM{=7j zx7JLrhA`f{f4LD0pk~KUp8No14Zq{U*wQ9fLnMNDJ^&oU&x|Axk~b^G)cgHcp{V&; zKb>oeog_CWXP2EN_+*f3*~#v}rZ?q^b|DD>SYl925-=%PkF26%H|~x`_%Rs`a=1N` za$#!NqR@OJPp{!jl!N)HdeZ%8zE;|6)(X!Z%1YA(KZipI>--I*zY{peQfblM_KkFA`z_4V+C zK-)IC>0UiOV}p9%Vkb*ZIZg8%>c~}ou_lgdGbdL0qtepY$Frtag9FQ3Kcpm-4EN@6 zi`G)gt1V)c3Q=3J&;NB@<3DW5_olb&!+TQlIt{0(<|^qDv$|*Fl(x|i5>v6<1^l7N z!8IvwYMQ(<(=(amRO(82>8&dI7|`cEDNbfNbMY3|NVO^D?c12efC{x1TY|D?W0>;Y zD}wX$^H6KH;w}g16I~{24;Hk-!r8lj_rb7dA=CT7<5Iz&gM!H*1Vi~l? zNz2Lkbaev5Y6mhG&p{9!#~l{$j1EQH4o=#Rw8$xM&MX37)y0tgQf0kGXSt3ObPX8> zZ0mb@8n>{f)@)1^dbI_h({B|zEyQj(tvb5zi+22Q3C5SY52ab8MSpoRd9ge?i9?(; z|6`A=`gEV!7Hekk{G#C}S>flS0%|bxQAwI97@WUucd2z{iA!lcC@P})52@3Cxu}1% z{@)gwkKlgbi@{W#x7N|v<`x$BMr#8ZGXuQ6hf)d)11l;j7^AEKVMw+@bMD-EcJ?+T zmILMZRR#b_-6l|kN~wx7g2e$$in+?&<78as*8xQJuB#Ji!y7okz@Q~}IgAW0uXLH@ z#&<#R=zfaIs!h$RXE zo5-leoL5&o^&8>KhX=;7imfSV`*TT1%s3X4p>P-Ilv~Gu60&?|L2It3+}1a){%4aI8h&kxV+SGp#-MH;{sQB6gnL zz>##T^qhAFGbOm^>yHnmlm!F=zOU14%fe6Q*)Ps?CQ}xS2KFuY4qqJiRSIs57971& z#M_pTf_3u2p6thChKu+6h{&g2r#J>fVU76g{gM`tD;bK(?Q@uEWxuNXP7QQfZozPO zs`0w*tORl7f&Ki1Yrv_UfY1{fTdl&JBOsx#0>gF^1_lOMDe z5KKnt1h19QEI}R&gk??lJfMcm?1lfC*!ZRz0;OR2r0>XqVln@Ebf`h zM~&2pwUM`}huu(my~%?67*F7%6D9y`S!vBOsPEQF-&-~{;$C1!cf1P-Jb|*ybz{$p zJA4Vtp!SIlqd{Iz_H46?{`CFr(ZY^F6Iqy4$n;yW<{P}*Wb@isZoGETxEw$(<|^0r z?@6YFOyJuy@RW=uMn~Vb!#I1s($6nKSW}ggivV@13T(CQB`xq(8VF7b%~7_zXV_%#U$37O|~ zhc)=Mdr&aq4It}vnve<3xv(!Lx89J=V0ohDri4UmIp0e1jBaLK*$FEkb~uO2yYvEJ z_)&F^E2#i7^>we~9^68x?sofGhd)pD3POJp5uDk6br+NI&Ye$*pW+bfLo3~r2836k0$yRg zw2Y`%g)c2F6&$_WeUU)#o>ALMap?sP^#}qgd6nlXt6nZWb!d+JFxNbxA9g&re#i>y zjB>!fSG(o3GIOB1a-o`X?($Q z&1dT{On4(+O|9AW^n{gM(kYi@ELlV8nCIAME;a@fU7W6aOOIXj?(#<%M%A6En^ZNc z-Q?@rrw5h(=0x=Nn;q>7y&K`8)J2(G$B)y* zSCnWQ6(=t&`0Ip{+x>TBwgM0TJ+JVq!(?kw8?Sx+4P7w+QFc_=8d%N-5H4_-zz7c< z8|6paRI zCAKWWCnyu;pysr_%=}>Jv$q{|8)$UILvKSwGY=Q02uz9?$OWI@Ks2~cT-Ft<{1~gu zWc|DdfL`);6q=P%1jiKfjj!>{4ZXA8i110a*BK6)%M`TMRKJOO#VVw962nI?&mgnr z;1hx25qUwcnFwZGJyS7GBlCCdcGu4p^{QfGv)H5NX~bM;!}y8Qk8IsLPoti)@w7e5 z!Kl&Fju6+AaoPZ#D%4o2&&g?3;qLA8_Go0xhxZGvL+{_)%^Ea?_Vi{J{6VbgM=SUj z6wMRjFs+~8!0zth=IW~7GPbL}axog1*ViFR+QU(OiHWf6_u#|J5jN7 zS9Rd!$>{F`c%m#+$ZepOdzQ!glK5HZ?*M<1u3$=`Gk;zSd*OPGHH>=1Xix@;mX8Xq zH8Es%--?X@))VR29Ki;7ih)|dc``Ob#Vuex^$w@HCCt)Fim#pPhz(?L!~YI(Vg_-kqGc(nNYq_Qv-8uP=AD zrKBecSNQteSe@$JS>s5szTN0ACZ5ojlOwNULAOx%IQ`Kr!5^6DUo7zNziR#flU7{A zMr&@X<4rVBEddR&OXGK-Sx5nM5c)jeQ5qnc3r|mfn+Lo&gOHLYir-tB?L)t?K;&Vt zaEQnr7|%l1*)i|zy}B|;hcY~twY}1H7N2MdXoI)e^Q`H13GjyGGI z7}2m9c97umu+jwC_tz+H=z>Pd=yqTH_+~>QoV}%)+>8u{mK$o)SK3b0YYhdpMUQoi zKbh#uoX|e&25#!qH4HPNsGSa7!@?Ucv5!|br|UK?SWos0_2(EVA1`|m6r~*#5x$wX zJX16=TYAH7MQmltz9>kbZ018{^&f;D{}(r>ddlI(K8xO%BNL*h@Xc=p;6)2Mxx9EY z3()nK*o>T$aj9qU9baL;;Rmu}w=6YljU#fOiZ6!l!52-q?55HV<_yW_0 z#sC&z1!^t_!E&Yx7hI7Epq9jKCnk}H9PGqEaY6~&r7y7 zAe1=D%|nepWe%xe8KOW$;#Cg_f;AYzS|u4ZxNHh68Rg?q!4!}q7;PccK{X5qTM$78 zDqw%+-|VOh&B|H}u{j_@1L89$ndk$^C94bBEq(H>1l^1roU|-XG7)+!2-EUP#W zDd^bSzx^g-ES_-wi%eaqnUfmnY+Oap;x@}D+O~sqpM_w2i?vLJf3>p!6$Y8tG|Q=9 z%+V@yujf2TD7_A0V3LG<%!!{ZX=G)hn$8n)jGd_e7%p@V>bh&6yy-k=tXA$g_1(-K zyF=uy;4SCO%(v5-b{&&3GN+cpQYq%x;_saFzkAOAI(|U8W0R@p2nZ{u+{id&zmgj4 zfrtY+x5%G4rwhmeSZxzA%tFBJMsf*`KsMIrH1WJw^6e$6W2fmT3ygE8GZa&_E1aEi z*X%!G5tGezfTXNG(e)=;+c1OXNxAjqplLa|m8^3H;Ix>I^|MzXKK9MUJ6F%`IOYzx z(w)KY^{$P@siJT_88R1mF75fsLC__<_Ur0tgD{ zA1GTzgPI9Jl^>JWpclU*`p)bjJdR&cu(9O^iTOls#7v$RldYu?qMXIwXa+%}F+iyy z&i~o3CvR=9U-adTX)NhmWy+hoO0SYZFQ2Hcw!JAZ(4~6Do)KTW1qR|y@YM zuW&ZkxZp*D=*!HZl6&$FD=0JU;IxWqdeU)<$)TMEAXo+UYC`1T_vYJ) zoZ+}YzF+pl$yZuSX=0c{HA9i0|X6d#R=FMNw|t-lg7IkYUoqxiNdlexlDo zduiel$Fcoq8R2#%^Y|!(gQf|3<7CoEZBXAUc%o@OfXXj^CR@HlQt-MOAe%CY(p~VKt-wzvw{XKTA zAonMa&U|Lazj^b=@z@Ir6a-9&64eTW@8{F!FI?zc?Fg7UHb*m&I%%VA#Ey*Lk|6TY zz#Xe2?&YWYpyJAYDWFvJfZ7-Py?wF+`$hw;g7!tn*}NjT7meYP^u9@wKYzoA;qLV( zyn&`=CtDK5R#|R;e0V+YlIbwG+oU_iF&b~%>nIN5ztZi#Jk_`J;tU4seX|4wG zIv$HUnnka!CQD+b;YcGN=NcW?O{y%7<}dS!V*r?!tXC)>^KHp5a%hUPVFsInM}^rz z9t0&!Ki55pOLOGmO)}p3O)csJA-!}xM2UIJf;X2<4^T(NRXlqn81m)qHAYW3NW_#! zpHx5FS^6LD$lv`fiyCS}5?3#0<;cX)HRq!qh_>XekQdzs-^d%S0TA^azLIYbyp3KZ zXf55C$;bfXn{t{#yZIa0t=lcnB1~uBImN=fQ$8v5R7z#sW?V)`kGd|;TDEh)W7TN$ zF>fchNtjIIHEP5hUsLu|Bq9Xm;F_Yt*alE)ttdS=T-t0*LGyiK?H zeN(6du@-%6T_FLw<6Le*aiWK7>E>vLMM`V^AWg~1YmBa;j~Fdd3WWZRb^X_O$bJlw zvw+FfTk>zuN9332mDA48V9?FK+BKI6jemB%A1N0E!3 zS-&|tpVHy00!ccNb)vb7#Xec3(i!DGv$Ur!47)3!z!QE$KlU5b6ypxi$)t;8D zW-0Ss#&f7Y^*Xd*WO) zR(^hEdF<*+W_TU(2*=U;8v5x480zNWSFBm0)+qX*=C(qPBlqEb#wD$WtJ^m+n;* zAdA7t$vF=(3?pzsYeM|_p>k5Yz-+X(WxrJMvm@L?8xU|N%*?+tt93C*$fgFeaa#d3 zsL>1}0n2`^ui81N&ci%QMNVZafdK|8K^ZwCHFTOK(d?SFO*D~4HRcSmDUs*zVav9y zlusXWBsqD&*4=f6+*V{@^w6$!^Okr>UMNqH%EtAZp2;VVs@UqR2WySakl)zU?~641 zb*%Sdkww*on_8c`%(VKu>dM|Y5)8h3%|~F9y)*hE9Jl@4|NClx!yu5Dmf+hz7Guuy>~lK5ON z7`>gJn#7IjzAXl^Hpq^pFa{is3B3^Hq$MBMm2-iSdz)AK#~ch;;&-rgNR zBH}H8IX#E#n7zz{RH3V@Wmj$mbpUgR2fcdGKEVt3z%Tjau?iNRz)7*uzVpJa0IeQ- zo@x3-n8b-=+qquu5S<9Q=`!Fd8N5Ke4<8Iz*@>&?={a>QcBDv%b8wj@+t3_*?ivobmKcb9J30h zpDNg=$QOM6ePQZ%hxwPbBxQn;F4_DclgQ{@JO<)UA=|$|VKek{4bT!KkTnE1b?Td% znu4CM#`DR9Tt+U733EXMnyX%nMFwLU)Nak7j+WR&kV+mv4Rc$-C*IDRMRIwV*J-XVcjkwrp zn1olkY3_;V(2Ta8d82QgcsW#Ov30z6BV<{4WnB+3qJti7EJ?RMz| zOza_!WFBibWuAM(yjiwug8#eeG&@5vWJAhC+YUOloW~L~du{0uWym+Yc9uLaOPvh9 zrI_y?OpFh@U#@*F>A3KEzQN0p<=AP7@hnV%h-(dq^paiP&vJm z{J;jA+hvhew?*oABu}xXQ_8OFF2!dIx;7>kE4G4{3)or*#Uzwo*FNGDy>(D#dD8Nh zxnUPWU{|C=eA7XV6D|U=vSpK*`2?|S^d4H=%aV4uM1PbW{rd-_Sw3W@>41KhOO7yA zJtWx^Qc;3QuibV(h(#O48qn+BGn|_;g3ZZ^PtI-PX4TdzeqVEQB{vj3zs?-vw3;=r zg>#m_&dW_{agO!sLxC{-uSLA%z+VI*f;!2)=r3M17;i-!-fe=^gKn{zKY#R&w~tj z}2EC+>@OINl3(bjk>0GNXV%v@Os3LH@VM#>_YTXyioq%$u<$ z3q7vP_|$aINuDHmgT$V^Png=MTkjO&H52G%B=f140 z{JEY`CH2J<9iwM&7&LOOtB+?`569EoV&qJz6A4ptEIx$Soopalxi7lp{o<;1?$24o zGRQE-@GX;m?(JKzwHiP`$EgWj@7}$Gfm91%P!WcyZq2d!_sp6l&);yEYA!3bvc{c8 zRLS&SVA$Byq$JE%R=&Ho;vKAkpYQ3$VIf!tE;p)#xl|2T>F(}re2FQ^V0Cl8F{NC_ zm}hFODHN6-*@`!r=JHVep;BXHr(rK!3V4%7-9_!^SDh#)ip79qAQ$!g@B5WMcxNdE zP(_@$nSmdl3x)#11Qimq<5Lno zEu`Q&J~8OYq5oyo1plrFIi8FJFROB9b%A5*Q7LGL;YA-r5&La3->kBV?6#SDSV3#h zD3_F*$1;*XuUI+!3pqDqXF4NPLKv`y35VzXgm9Z<0z)%;^!MN^QJ%wYZaLaY_2Ley zv9S^GlETZANVu)BJ)zJ5#ACl&-QnKd9eNBQEVY!G^FD+%1Mix;N`_=4T@MvYxWd-P zO-du*;?x{Uz(z=kCv)i~?l1b*NS%DFJ3-_~>Nut2Z!tA3F@@i!+cDlX;peX_qGwYJ zpC9RJ3y6#Jos@v%o8-r#z$RT<#t=lr@;AMJ%K!w-DrfBqG!8iX4w<1%|DuH1$x z2Gb55zf$088h|ohqVrM18@unX6moYI^)hudF^1Pkr^>>9KE&{z5GC^AFRq5f#9SC* zeET-^La0pFOap8HaxN6JCU)DX5RX*dzS!a%U}M@rsnyY0=xDw6ur9YsbTFTHzls5S z;`0>O#sed_98NSY4(SE;U6OligKYi({DlMp;rqnMsJq>D6Bxi%@4H`oL|RG;PeI*A zCnGHach{`;VL%r^(96?d5SlOALYd@;a|FvXdQw7(#gPZY{m`aRc!D$|@rDvd2D>O1Y<~l7a#j#2%g{v5X0M-FX}q^)!>3_Y8inhj~==)M`=; zA%Wf(45S2}2e9}fTgN+TkoZUp=T~0GtC5*;riFKunngXt{gFjNIbfLiR=2MfM6xpk zq`Tb7mq@(FH9ctJn@2>YA|I)0C06M25?fT?y$y;AodhxM>C6;0Z+~Ke_o05F26v0% zy*Nz~_^~#Sd(f3G;W^BHg{ge*L3XY_9P`*|DR`NL)yt#9BjwNeC_k+7vFuuCI)P%g z|d=X`gC7Pp9KCmZGF{7P{nRWI--D#7V!@U}RTeEWLcyy?V8NtTk4~yiGI9ev98kWQmWiGyOtL zaLx2?G0vIc;@x?7@71B9p^`U_uXm&Dn#ke#)?A!hjOH}36t?GU z`O#bF>FBtJD!44CB&qD;zi|PafaHL8NY&_|)l@saB>%R#$FNannhR2wv~}#tFiO5t z{LC79@-PDtT4u+`^D^2AX&yiZwKnHnx9cIkyuhegR4K91N^r>y-Wt7D+qV2==p`C2 zUL`Gl(kM<6F`IkG-lS^l#>ONy9;Xbk108S|7bXuIq&v>B_mqXYneNEggJ)*)v1X%Sv;CJl$x*KH_2A4h&mq4X^G$cER3~5tv%xB3=Jisy7jdV${GN!waFDmf8eo%_k zpXzF9!sA30K6Uecl7$eG){m6^Ho#sC8T8s>qPad^=6ZBQWP!w&6+V?ZOt5ng(T<{cganFS)!nHP&9mpIHusySx!7WMNi z09iH<@HZjI1`=C!p_d*Kg{YSUxO1GBDXoQ%Bm3XZ+iAG$T0wz zt!vnuXQOfX-}eV-0ZUOZ8x8I7xE&Mk)g#u|e(VHa5Pk0t#J52B^u0mwxh}!d{B=AA zLR)I^>%*H%TLFDr)babJTv(&T-W~(6`aS5;GtAG2OyX#)U3~`~I%(CdPn7-=TUUmW zesar|lYIaV#Lc;jMgfqW`xyF;PB5Rt3gybz0{J<@>-LH3Yj2nY)r}qV^0w-37n!6(KEPk zxsd$sk?rQ7hqU}L5et{Te#xI(XiSonhuWxTTT7RkH-Yx;m3h?u1HYULqC-7)LQOKh0;-)Z3E~EtUYftJ^NEx_qX#>e;;#!R3eru9v1~rBqkz8)i>soK!FP{ z-dO=W8R4P`TCSFNwfnI?091*z%G_N5*|HFloqqsus(qWS3cFXq{Su7q>}=+|wsB_2 z|1^g;N|~L8H3n*4wP{w_O}0UPd@vRBpk@3( zb6{>mCJ2^#p`S~Iy#EnWwsUF&Fm#`2S|H*@!CFg24T!Xt=DlAV1RP{2NR>FCG`vh_ z+P4y@qFJ2Ll03cbA+21Hx?!gesbZ*6>`?^ibl$LM{=TomZ$*`#I*?GLy@Ui^Q>e&I z4#2+M%pMvjb(3i&yZ33PCszwVi8o9%3LPCh76I{-Xv)Yet)%q!`zw>6GU7X-pL=sB ze@w5{ifM0*APMUMspV!)q5&!;CB<6>k&*@G`j;s)*AK%KYCp`KiZrSaT#;r~s~9=9 zng+>sq>9JS0d@X#*ic-Aco=d~c}JY)!Rox7A(?kUhRA6rk7l02|FQ(+lKjI)6}GDR zwib<>+`vQnZApH{IsVaGMbp8qSWtCcZv`H3gLnyq1hbStl1DnWv-3Da1Es+9hX*sI z`jrceYB|k{i@=iF!r}Mk#KqeJjW)0U_)$OoO25^_49_ZM-mMF20PYs=C|U*vj~lP4 zOA#zIctkO!;JDk+nl!DT@+$$kdEYq0zV45x1FQLc=o$p632d7kPGcdfj}PMV=~cr` zu!nE>`3c10V&~bv@B+GV&|600L(3s(j*+*inu-5P$;X4VQQPxu%;IOdRMPZYVzSmp z0>VuC3QQr0n<1B2UruY1;MT4Il5Eq@^ZAcm?#Dtkd}e!$Zg1Z*TEJ1*Dpl$6agNn) zvq1z>A8!}T@8q}(7=q>=0`^6a9={wVJ5gF%ie$n_lr4B0q0Q3V-dIk){ppeQ`mo<1 zs4crf5~n=-`ur4}26sySoe(=V!*;yFo#jMal0K&|&=*)Cy zOUuh!*01dx0Jq}CyTo`3^TOcJ5vyVuNnTbYg$Lq<*tTn6rF_8l zM*$uEgUhA>xhaRZigTdd=!L@1*|fpx3mrkCx^ASK$GUNPxZ z0eBWl`G+zR5)$2z>N5k&UDlUJI0FCx^#tt&G7Em9ych{jLkwOty^f_bP==;MmyJtO@&M9!b~ zTObE(p(#IP**{1C$8mDAKO9l5eL-~Oe>FYXdt^mil z2P_vfA%xBqg#o7%8DP9U;8M{YcS-vl?6w!aXhIm~I(o8{m(iFDi;I94rD$O~|EFdA zNtgHWAVxN2-m<4aF@ZH4X)Qo{97!}K-^=(<;59LX@k36BSa=@3di81zxTcR?d-kgf zpz6y3nMvAv+We>LoPR!p;_9agD{%Jd;UOf@%y3y`4SI5GLO}__-iA{ftv`hz`WH_c zeFOGL{8Cx!)i0&Q8gWM*f#p#dV2-qOwvCv3cP+!NUq{du=I52q#$zA=M+Exl3ZMdH zbf+;VqxAcgu@^rGWMD&Y#FwU_WD~J>7(6S3fDS)Zw+2k#m6~y2Z7Uzh zN9};nAqI4E0D~34Zc@8ooG!8#C+Yi|G}6F*>fkFM6Y&u;fkBu9_;A!o8Nj;?v$urz z^hqRsuw7-#JgF-zh#f+5J@F-~oWhR&!i5Vd844b|D5dPV~c}{nAG1jG~rV6<#Dai z;_oykY81N97+v1ei|&O4tnPTTVTpl{uW#97Ts}qeEf%*R%Ht0hcpAhXxVmjMedbN?S9TF0*IL}z3<+) z8#x@Lv`^w%y9fPp2k{%NEVPH zT1++1-afP$dFb*W?@~Y9vILMHT6Nou1}0n24_qUTurdGmaKCAT5OFEok7P&{0f7eI z>;3d#PnS954$^p1tecF^QA^jCv!5{wc0;t!V3|#24O&JUZ zHp`!S8x%Fb&0{KfCZ5B9NEw2NTPlIW;~_ZKgna8_XWYc5e0qt!yUz`lgtz_8cPFtIKd`W& zy2~t2jov8y2p!C0$Ta65GUo-IJQ2ueHrSK*_}pv4F8Q|~;BOp+77w!{XcU!|xDbRv z!DJp8eV+%O5+Nm}j+cz%e0yph_2Ee9HeT+e?cA4kPi71FG~6Kl7eb~6Vqz0qf&KE( z6Rt}VR(l!l2oOL)xkZrDKF2c<`$-PK_y@HpHxx2sDR)jIl2$0Gag2c57IWCYq^D+xf24> zbO~8!5TZ$x6m9UUJx!DKE?`*rVKmEPlD2cIWEzwEX*k;O0D*n7W~u91Kd#dWsISI0 z$ryH|L~vP8z~qDnOv@=I-M$hybJuil?GF!bbk-+PvLd_`c2mg+^;BtNgs4{SJ5JN( z76lO~-jPqQWa?xZ_mDtOE3t`puf6ai#HGx6U#cHmn;#qmwgho+Z|@QqPt_bKae3+I z=Z6Y3+Qq`{nZxvtghxiMcN_3nz|B7RN|ezBj#(*m5(a!#j9p-sv_(-vfdIWEi9U|Y zw{{!Ebz57ATK6_yUcLYqp89&K-J)?%PN@fb?+F}YMt4w(Dot-wCne{$tiIlakXH>5 z1g{AmCnhex3pH+Q5LeTLy2{CpF6up74kSYS7Gz^5TnI;h_NgoUJk ziqPKL>MGn;LAoesX#k*WfP_iRHVZ>zV6v(fEpkZh3;I9RpLf{y)^m&>hx(sH4Rr=^ zzM0ws>69yi2t!0U%YY-N^t!vw0I2+#b%Ty0GiV64XPIhIXQUn6(^a`13uc@4fUO5X zSrQ>5?V3QGgscn)dTTKZOh=r!q+)MtjE7+(XdjAE1_?mjQw(OrcSfCdXj&80NMTmR zz@373V~$8#{AX^3e+eV~AHvt9o_jGl`5ft>OsBV%K|-ksBGRS+>JkR)pNu8n{3*cM zpN|eI4lorya$aHcY?t-wL$Zc_RR(}<)Ki4gfrT-9=#^eaiq0T!((ApvgZH71L&$M! z-&=`90d;kCF@F|Xq-X<0Qd404F9l!Nlbd`iUFYjE;$lzX0+GOTi~~z zBu71?s)s8&YbvFRM7}9!Z*OBX%?r`Jt-c|b>LQrLB@^SH0)+nA&LG+qz(y^&{hR;) z?a2L4zmmd*mz~((e;H4K1~Z+}4@GM4t+>giQufS9S?Imcgm z?ckcFQ-{g+?^Esn_Jgm#{y6NXL1B`=+*;6L4F5z-b;V(YIu|LLwJp{k`R$ED)G;fo zs*tMEa!RK7k1uTT8@GW`@8Fi*+tog&LR#`ZrSE^%KkFj^qHx!2tS4LJ#Z^n@zyIc6 zzOu#H7i+5*H__|!C52I@I*9_tD;be{TMGq~97Hlo>)`mky(9NyDA7Z|!Hv?h^-s77O~;e{~X@*$kZhYcw5-4Cg$C&!F>JmLJ;q|Ktop1`nUdXfX|r z=l;9P@*f`efIb9U)|1k+evh}&lMX(&+#zGe{Xe|G-+b=p#8v81U{@gZjsSzk0BL4EXCh zfCn~LUMOufHK_~ymTpeqAc|Bw`|W<}%KH|49KhLv$)UqQ;ih`9pm@c>0Os7+LiZMd zh>4vhxnC%xbJUE0_3Lx%1yPXbJzXn=5UIe0H)~J)bs8l~J>NDx__o2t#spUDI^(xr zyI3MkXhNUb~ah5mLg#GIR(O09baEST~i zR#)1lfqu6LYCsB1a{$_H?aKU%*_(QM5S|u!2BU6z_e5v{&2Mq6554xA+5ElM8FCh< z#HxQyLNzlURF3**a%U5|mi>P0;6t8buJl;-%>lJrx|B)IhYYxDpl-?kbq8LEqJYDe z-OqHGHu?x#`zgu(=TCq5S@owefihoVmhYD%B*&|(n$ISG|0eY)ED}bswlERg*<$iF zn_Si(g&-l*m;*fS@!Qzu#{BPHy?^+uZxQ!@xH5jT-*3rHy(#8#)C5OeW8m*G```P7 zA3S&Zx&VH(>7`K0Mk zMh$P|iyk~mrfAal7jYYY`$hO8EXvv7$O|~U!j5UOnH#+byP(d*UMKlbpCJ$!fQ@WVg$@Q*$GV-Nq5ul;jh{1#UJ;~)O<5C8awfBeIbN~Zt#hkyLT z|Mw4L{;_(PzjpzsF+_do4_(%+#)b*uLxEfthe5ID(Dvr4$!kT|gWJ#*DFSw+;;r?? z;!9QZ1kg<&hG}@|?h=z9=IjEw1d4gq^NrbWVOst8i0%N-brlmBooA@7L5BX9CR$=I zYBU2yLdMam-{)tI&ey9ixN0WI05Y^o>cW#e<#bxtPd~!Lpj8` z`4NyRV6@jzlctf=BP+ZT%xWP2Sw>x2J>I`E5nEPfSoPcdZ1kg#eV}K0AEpMJfj@?G z3V2%NPV}R#P7!=10CGAzWT?6bI19}_Py97Fo|8V`WdvqXr^EVp60#P_aGryn_O@e_M!-UqmWk1HFaW3ZXY)mdMxB?4`ve&5*!uXtt82Hatco|zGWm}p=c?O5zjUPcSrp0ep zzhUn0ccwJc`U7Oa@Y1ye+^n zoz_%@h>>lqOsmYQnpq7TB;jALki05J7HjT?l{%2#Wm4!6;ivyYu9Gw^6d>(0FN)NY zWX{3h6A>SEqInz)20aZ%Z;kdN@`fY~pI^4N0ksWTEze^0{l*(KPlI^vB3Tp1w&F8; zNno%g507{(TH8(N{R(shAK8h0N%_`M#+FsM4NMSE^Fl%9iLIp1baOe;NzGc)aQvm# zm3mk_n6px7-UjhSjn8x&y{iRn12#ecLUVdPyEA|7@y$#%%Z`ZqMq8&ssWRcb4e?HV z<*xrjm*=-8QQBRUSuH0GqHJm8qw;*69eDNblxjK5eLC2Gmw!_!Yhxl7aT0Vnj)?7p z8ux1PTz~O3FAClZIT@;hijT%%CWE%!gPc1*d`%TGA4PhL<9g zCyX1)xn+cg1t8RPr6ggo;)Y!5wwb7vQX-V3vqdx zGGU*I=*~>{tHAN^hHy&hOKKe-4tt-ySv!ohUW-ez1sYaA`h7m#*P7}|Ft-li@+5U& zg=k`eMe`Q}lfcP@yoJihgG;48pzw&@Wtb|i=Yk={{9^5N=~3<}Jn$NwaapQAMDXLQCO55@z1v!leS z_pjWk)f=gPTjN?)`u*^qF3AfGFwXgq#Py(>j%(Ok=IRFwsg@+y*xQu$bDwG$_YrCZ z03Uq2!)i~loPt_QoK(3NC1pmh?IaQHKEeYoln1B4!{6ylyI12%2dthsr-^^S34%I$ z;l*fPD3I?QCo0l2-XFa-hB&IL>J7jc(GW*-lK8WmjZh84KElU{NZJ|@Nf9FaH|%s) z7e;OmH^eB_(-PBR_+SjN{1(oJaAGj;=PQ^ib4>;qx+cd|QgIQG6cX^kn8=8^N<%ua zy7iqk5E7AXV7=!th_eO<4u@_vBAvm4k6Lyb@#J1aHv>&3l7Z**!;9-H;L6z2aoWCn zk3S#tX~9#870P#mY;?2WI#Q7}g{htXxnf*gxGObYZ-D&hY7yuP5>bo$H-}HYX0!$H ze=)!$h!G9Z=92FdZ759C8-*pfM~UQRJ#B>M9cICC9f7$hv8>Qx+;Yq99?gOjGZ5mDvYaq2G-hu*L3Eph_D~1t5a@(=@{Xk5WIFwRtq$SZdUrrgE?Vwi7quf$D z^1~^Odc{*-HbRJu58^o+IogRhjxD;vsZDYnRU9ZXXHJA&<pxNI2qjVSKEdf5&tUwo<)`%5!nYFsU5Ix-LQpq~5$+-I;_aA(ML}H$PBC*XO ztUt=IS?OPt=Y=A{(c&X3#81PcTjUfNh6wW)Yq=pb%i1T>No3}LOe7brw%jv9E*QrA zrN%Vfr;E&#=SK*QQYxt7&w&Sg!qE!IvrB4}i>}CV$6I{en8BkQQQCcLl-q&%eO7n` zR}=2H+kprRNjca$^}>kqmywfJq8%1#oP5=2OYans+J($gzZvva9s1YgTLut?O|W{z zwwIn5sI0lCvgIr&Ha7v>yd9DA1q0Uix?oO}IH7Q7qo~sPNLQKDaWb=N<~uO9dqk48 z(<>PvU}-QtxSL4-F_-+!5Kt=&_ra?=BSO(SCYnQ^U)}+6U&b!#$qGQR@q*?zf3l4# z%Q?s6!M=tqkfzPKX>5fiWeK+>EVmiEQRWAmH{R$U2{4f$i<&8c983PXdrX()h8^7g zIyYN8HQgAB0P#lLs%F(AtkwfJME}~L7j9FOq=|V#KMI&2neKpWp@DTf-?ydF#e!5M z{p!maU(7WK2+5Jin%e3l@l&^KzI#|=TIz6VE``kjZ)D%;f^s4gU_m*|zxD~Y$VWk4 z$v$gnwnbhgKIxQN@oL{Q55%yE%k-`D`geXIHI%Zp0!dT7VwtCM;acj_zE2l5O=A#F zX^)Nbd@mGY2TpAN&|gj9Aal2OfHyMUDe05?JA;k~RScwLMxDm0*o=%>zG>n>9~hTh zM$!-4DXD*pgJM+6alK0883Ywi5JFaiO>pDH1%945y-*ca8EY+Kb4W-q(USc zKTHo4ks-1XigyDoNYXvBuOc*m&*vkI=Ch|jkW43FxFyE0p20fBkxdE5GLG)MQ^_F6 z8jlRPg(FZp<_J>5R75%xu)zJpyhP49AkJIaTOEGm^;LUmrpy8JI9}UQLIv+>njrVm zSk3oQSs0cdj36qef7b^AkId{mT*NY%G5^QjTZUDgcJ0H8Vt|ASsDz*nDlHu%V4#xH z9a7RNBB{WjBC-jQk_M6P+H@#V(jbk1f|4TA4gYm9!aOtgpz}QM{pmgYGRHm~#Jzv} zy4E_^xz;*Ql8CLeF~UReb(Pb`U+o~;2c@fe!uw%l%Gtb*5nn7I|8}-$5SD(#Q*ltS zz}2DJG^JotbRHs>%Is=i#4(hQb8q4ea=pL(<{KOZXC7`jBUcf#r~H#wQH_q5fbVA{ zYsFYs3qAXoj~mI3CJcxB{q2iJ+hY&gFlvzFpY!cRbzS8S4E2e~Bx0)GZXK=0dHc{z zCXE6f+4FHq)-zymzHi5FLdK33$+Kf}P^?gML+ScbHz@DnL^o&B9=W4QBYkrl1xcN$BI*G&pQUYh9=-LHAL zy%l!SKUJ`p$S&`#OJaS(x_Mny6lsboV@xN?mS8zbOC{q%OS?>qGKGf!ENcl&e7pgD z3<`}_gS9?a7D>i!@@+f{1riM6q+LbukA6hL()l%-8|UQj*)9Cx9$k@E4E2-jRzJO) zHy|5|!LOhk1Ssu0FMZD}KQlJ~nLyM8^JdI@c@_6ti{OTjFt?jz)RsJTSnkj3E-gm3 zwf-O$*JpET^F4QJ&{XFe-(M4>-zI;5H^TNxaG_m^e3wIJzb-CM)jPvNfA}YC*=`ot z>ut9BjBQn6Py?2v?78nUl{rTHrgSdTQ-#lc79Rf)vDQDuo)_l^NXNj{+66Zp6E=*D z28i)5M41|zvlH=u6GGm7D%{6Bp37~E|Jnz@fFI>(i+mRc9oD$vcQL0zpPovusm;ui z@AzGHKJvH})4uq1%Oc1VLcER=+}#{tVc4yhniFrXmc(mH0fnL#?Qo_g?7QQGX`+!z z#y7x&eo2?^fp%ErY}!De@#7tYJTzE!liN^TKK4~mp~IT`naQ4Q+;R%0(pN|&T>-qU zO6U_3o9J09gwLSB*PK?z)9=>Im3F?j$nK2`SRi-K0&fK&xtj8AtLiPPKGFNiSs&jx zJL~x=N1MC@$37=e+)6Q_IJL>QiQ+@$BpJfzE6{P_iVoYF_ceilPSqNQs`Rq;A}5l| zo%#TyTF#j?bw8^%B@-hE!D3%t&R>7n)a(0q-VnMo&vK01vd3nSSn-7`OBjkHPb%Vb z-H)EjGhvP9v1Gi21#U>6upeTPxG9C}HDr96)(r)3kahCcOKx<=&l1uU=jQ-A ziSiF_Rf*6v2;eq(VHGrNJ`>}g$Cmr)5k_7M6VQqYv8PdexziHr9d~b+@_-QHcZ+7RDJ|+=i`>=k#h0TCgD?8!(7$C+f%n7y`Lb_N(AxvD zs!CA;o@6F4Cx)**g8BZqiE;ZzYMd$WT+M+^%o11#FF}H{lILccQ_IPXGw~YmSqeP) z>VSwlte~Xj)NE-%&?jins$!gk!hnbli*kCTiDPUYA;ZEI(SiYFY}Os+;^3_rTyd?Sr{gIaTq}s%5 z(f+CfvdfiPlb_y}y_w%q_aW2V5rx?=REb%gMtb$xpWY&8Q*6HIpiti3;RESv-W=TC zw9$v}`mtheF~ezL6u^R-cwhDM1_Wx(6k_z^07o&Wl~y8$44d~(8LkD2hN)}Hi& zQwob7hsTdq-_vf&9|eB91SBG4=;+-U#Kb2~IKz&=ALZGn3bZ>c)-Cup)5HIGp+ujm z`_}7>VN+z(Qk4BCrW6Cv>SiIm(U+!lW&}9hJADgz2Sv=;;cRPD`_|Q>a@6ADsZ~2D zeoDX+QY^IofTFRg@4igT?u_R+3EISY8*d&s&-nusu9RTUL3bz{d4@Fe6ir95L}eMa z(^O*~isT zsVyHfok;iaQk(`z(w0B>_^gOIl}Lqa3N?p;)DTREPP}ODRfcX(MgS0IO2$c8@|owf z$pukFsD_;x(Z8x+z4l|f3xN=*X<9ln#GVpRJueH z#-_~u9w_DKS&eJlKRshr@D!ZHoXcDfJvR*^QS5LH-qM=&4xQ)c6N``Es-WT7{)(qW zAevw{k2+P*p^}|%IGg2~(9fw~%|?{IJXx*CUy8?&=xHm_M0r)UoaCX086A%A;ylOq zdd^5Fys03<$XnOuS`FZNNSF-vx9TSas0xFoB)dBmT2{v|u5FRP!VB?r-!hN`i(A;m z=&P97B+B3f<*p*Xpma(&1%}09M_H`S*bjwf(bhWt28T0+j#V#!1*4;Tq?m`S zc;k)lRJlS^eNRR9jx`*}lkOpR3zQ=1Sj(I~+s=%TqM-9$9!_^ZI^E%PW}FrA)+T@c z-T!f!(Rbolj-w1YDZ)%PT0HK#B8mC$8T41*OU>H&qXtyU=r<#*4wSM1Q15ie>Cz6)A%oEg6l+APjg^3lO zEYf}68BZ!pnZRr2i)mrqGRO4dD0^X%R1!*LkI=dokfbMBUl_f9fctS_FpOw0*jy6T zTqV%eI+{MMdCEtR@V6|}s`uh;$-~C^*xC2uel`Kt#Uf^9=6}*pSqCd8S37;I+8D=O zM8==AVCd)vS(67!n=RKqAO5}r_x+gu`H+AfXS5MSN@K@ThJd@@Q!lV&a$0^Hea{$b z(n5iGs18~yz&r21Lb+c6NXHst6#Vq5AfSjP%NF#?m|wQRVkWW%qW8!d8&Pa9{SA0H z6FN>tCeQ;G9phSkcw?fk%)L{dA(}_ZFDJcdnH%r@mhpqe3Oym@qf)vvDUt$XPRZSU zSGN87LxK{3&cV%zJrg|KGD%71gVULzk-VNpWxll0L(fPQv5C{0EApPqsh}oKyr$^R zB(_=yKt|!tkWpcN`Rnvj`H^l;^rcd585I2R;3@*y<&1|L>ZjrJ@k;Pn#LMB^2qTK_ zF~h3E{y1@hpoY_8gxBJDJ$dA?12@u3Nh(-K6|vfwem$22Iq`|k)}E$}GL^H(`!_jq z4&`ta7Zu-~a`AhbAzd&GN0fE>VdBbmH*Q7s(KOJbhY@(N@=Hg?8!cxlGNe7!e6!Q` z=1D`s()Z#cFW+qhUMX`N)0MAzkim%8+HM<<=7<)`W?P7hC=iT&*$DT4e)Zo0%M32G zLOG0E6K_`@bAD9KoS(S*igtD~u-LmIXY#PD%N3QGix?sKkoIjG49y~rvVwntgVVXK zPB>gUK1ZWd#x3qsh2wTC$mHH4e{3XWA=YvxXXoZO^dA-OpQp0{2Per_kpuj7Zv*?2 zB~+S6^X(SwTdkUY{5JwULUp(*49W~_l%RiuXzTjiC*S_#ga2a~HmEI#xMer0ss;V= zhP|2ouq?@RIqt;PpLFvOI~w6JLa7guO>q&&kUt7yn9cyOYXH$rOfncp+~k>QmOYrF z49;SGxmo z8q8!mfLbnyOt`sk=fB&d6S8wu@}U#cXoN=X5yS>wkZf#@lfC$+AmW4v&vA2&MtD4= zP~G@0I<}vHMPQQ-6NxYH_5RJ{*r_5K4&FFU$0gCE$u)rmU1auGlAQmQ98!e*VdQA? zu>_xGA3S-&D@Sc~5A)oo-{%{n$n?I9=0NJbcT-_3~%w5psbwWcQN<(yXDW}Pv zph$879L?Cy(|#t^%TqdmBaEZ&YQMB41HMsAphF8a$+>&8`#``|=c5B_J9Mx43!tB? z1=#|^fn_Dh8nm$~k+e)Is4YE4n5S?6^j`NUs+kE7$gfse4*W}Te);{U;nLlx@lU<| zv81HWOnH~PYg0F{idrd!vv@GsLwD~jD4-o>E?%Cz-k6mxE_s7^DENi@zf&E&!+^Tl zsZdJjGjEf(I*~t?rOGqluQOtKa7{8y;9LPT{apq_{>Ss(h-?24C}c=H3ZlnYk=(s>(8F2Nq`Hi9F@5-J(B4uq*B&KMjl<8H~gO0A;sUu zEw1*cr{gB{_*;~4w8c4b*)29ZEOQDBm}x_dGWu;ocDyxKGo20Jx7l-+BChPmYqznp zs(tCZ=C;M}5=z<|Y&#Mca5GmV>2~nx(lg&=Brjnnk$X8YL4tLOH+VkeK~Go z%1GX%k9#xD{q2R6og+A_I_W)E=duWgVliJjNXDGlom_rGNM=>?Xn&QCxcq$%7j@`G zU+v!@JHqYo7&B_v;vDOPxYz4S=%E@|$2Qv6hn}OLa@`Fm>PtSaS=p%RR^dR%6u;E> z6IX|>x7?$*-lw>Yya6IVE|MAZi%b^0UO@Sd!Tw&=pvW| zt$nkb_Yx>Ini1Iz?=3|F!w{!gvE{qzJW!d= zrvbM{ECR7Av`-IF*YL$i4|5tPrd^((6?4#c83xp_5AK(et%M)7^7sk5WJVo>&52E^ zbB>I>{K~xxn3pwoB%01@`}z0X8wKwtjI;Pk2yJw`h^?NG2vJR;{*0yE_cJw-#mWsA zy5v`dFjUyA-nY@6`H#%eAc+f80WZ(Mv}yCu9L)PPfC*BwWf=kD4+>N#tspy#9(`)g z4!yTt(a?mm^MH6-R%({v-SkQTHGEyvlr=ij6snkGCpDAuzvdZAJO{EzR_*nfYWl}w zXJjkg~~Y^~fEE<+uvtp>W~cRJc#ufdyM&DR1T!dzrB zJkGDeL{PWxCI3&^59kyiuv1{W_L#ku+FLUG<58xcCovkf!%^wavr#cv0B$v8PD;1w z439=N6X*p59RZd_2`E-E^`qKqHW~Cs3`mLSapZ;7)HmvY&I{MQCAdVWxa^;NgPPMB z^oFCd+zGUOJ3*r)I;$IcDxz7#EeVSAP0x!+ngsS)f6W~%0a&4w{#0gn7z{k3uf^3x zTZF;9&Dg+sE4WmNeT8^<(BKPeJ`CV%41-5e#_P>EMFU=Dwr|riYHCY)Jw>x}ZeJFc z6zlQ=N(f`anri~1S9x&RSucoxsaH{& zfvH&nzqC=ixyik(0aQJ%wMrY#pWy>=4z{?Mid#(~NFvNCyYK4U$(5rw>j{0+~KlsTiWe3e$K`T>&9~d#DHaoE>a|%s6peN37NVf2?<=>*x#xLCFsD_^i z`hOa>(iavTbqp?Ytb1MDW$EA{9nC#mOta$s5u>JN9QwvP? zINOyO7}62Ij-lQ_)yfG&)JPmVK^9~SEUipe*)CMbAd^bjuCLG@jx*h0@BZ_tPO=aW z9#c>;jA|AFBh!&)t}+5d-1K#q_KHwHsdy9zqQJ}(d3gtYYig0Us>O5K)owuL7sT5| zJ1x5{+9GAeo}yE)1wQ0rbJZ(3&>(sP17y{lbirK^BV@Y_HqmVZ7mk_UAnH9@e1iv? z0Z+y^sJgIP-^R)vSETpecwuw9&O;c}J615;X&h$?4||}6ekOX$x6gT+xH#!3)IK92 za8B=nihlI$5sKfK9;cO&sXl^^Sj=o=lQ}!8ES})k(3(ODM`gq}XjK1ZA&nw-FBO`~ z9#@JBxns_5gQd5iZs7x-2dg`?QNP7QU=Dty7}?MQIP8rrcc)OiANy%s z$shV(8uT{F^Vd5eXf8y+JiZe&=kjy9pqGq!jE1q%Uq3APr9NHe`UK}DN4_UB`4x=Q z2lIV^POO|#I8)L`G^BqNhCVZI#MuZ#yYp5D(f+<|X1K$CH4LzlpmEBU z=x>M1k(?CrNn76a=8v~z9v7EKlCD|abD&zo|kfWOqG1-ZS$SbQ%_)9wD8cg}(^_)d|fBStR1(lRW~WjvK44w(T~|ABmbgS4=4VJ>m9+{jv1nBb)up zRvc*hn+(Bq?>M8KzIx1=^1v#1HlE752TqrWT2eTj=hHsqdJwn@LNu40d-4UKyklb8 z&8Su{T3Z=EbRb*Q;_$&d$hL{!8ZHv~@~Mw2xfNQC_sn>7Y$>kZnewXf3UvtGJYrVR zYz}kREEmw3jacp{Qvl8BC{fQ7%)+R>ir)yt7JM}4hd$!FNla5wvBwIGF0fO=Fc_ML zvgqboJPX>@NDy01*1V|pb#BWzbhRl>!f2Q zNjJc!M<-4?g=rdyoiy;Zo|5Nj9Cmfdo@Boze^~DMci`-(LIxM^$`cms=Pa21DV7?F zRSU{A2aBL0 zK+>NutwGv63!$)&j=PUu9Xn1Yf*F4TBRFJ7++mUdV{;+Rf9QR#IetaeN?6WSbw5(9 zj)qijOPFiLlQC$y*RaG_GmwLi!uA2KHlubr0mzdfD zvvGmpnD(SHCBy2Q)76$henl%();3*J2?7%rHbgfX)&_5d4|aA22+{7n%4BAY2GS41 zpo@{B8SX2HK%7{0bBfZ)H2?Ql7(Kl)Vze}tO@#Ut(OI>IB>4MFAME_I8xoD5sjxp* zX&W8cnp%Oi*}I3exAh9b=F>`HLL%se#h%?%ccrvG=83xpH4gdfc zw{k}la^L$NHn5x)9=GO=xv~`T1Y33*#=SYW{OyevIOtt9ZF|29?5UVYUSnJhbUR?6 zD*qdd=;;}DCUQYHM*$f!u+KbGOb~y~ogqDVLu35TCYlY#GXbsYk1*_-VtaR@?5r`& zm5DD)9aqRo8X#l%ssU~nEtD5}x2%gP_I?5K+Et^X>-s+D-8=U zUVd#GWwNcp3 z$4EI{-~BD!*M=7t4b`P#qWaEbVe`zYyx;L$orL7~=pWwrn6!H$r&|C=o@iClaW9Dp z>N)eR-eHx!Gs`rn`)uKPG_8O7{%Zw1Khays?Er@}26eY87-<^r?3DS|fDi{*Lm2yr zprHdZm~t(HL>SiM0NlDeKw<8CYP_dJxChZ-4R4fSJTDydK7b1IjjNBpUUfJnx?Kt_ zm0eUlvbm|%P>KWQnhZ${Be2N!QD5@b;j51E;I#erYe-M{Z5Zp)dzx@dNM2rkm|%zR zGId0*N{dyk$40^Y*ND8$5m%Z9mk7-dr$JoiuKFAkq!MW%JiN=C7a@v1nEiGkp^=}? zf+|KhZ~tZ5u3Pp~LCAKU;fTth|Mby0v}DHxO-;Vh0n5}}m0Hte;jagAB%(jPhjq$l8`hiyVbG;+(~($`;o^p-Eh&EA#F?A z(EsU9KFcpkw(l0hR;ObyH&vu5%1}l`EtE3u4khTu^h0jIz?^1rmaOi0FThH%k1~eO zRH9Vxh;k2K#ZSbv`zdL=pyt00R8!0mAxAf3(v;A30N5TC)DHYv?b@y-bw#yYU8OJq zMefR!rBrtADc?veiszBN3$E>OC1IaTfDl^Bm@ZDm1Pn{1&=$Ao4E4Iy?cImDFqs?` z5v0(3IRol0zgAL^;7`c%;+q5fS;%1znJg)TdGG^T1;i^9dB_WBCMH?71YDo58CB!*U(q z&XZ#oAD4nb=waA4PRhAP?*DaquJquwQnVln_&YggeU=xR52w5LygY`FjX&l2_5BWMAXz?qi~nswWss-twbE@HkIwPJ&L)WuXpB+jy1~U(QYLJ(j2WP&=Va5| zpR7KNxQgCVx;lzMGOmRQ5%p-?UCa)x{9?-6D4W%ijOrTOHowrny?C{D2>z-hn3y(^ zKZnoaX(5a@6x7VhNTZ757$K$*9G4t*Y$6zGBIqVl)_sP#7loBXiFLS~9By7h&b66W zcfCE3MRhsFEww_1C37~Q*$Tx5gx^1k%vI5QJ zrhYhGjzRYIXxYv7+B33{sIy`*XQ!hU{;U!HcE7jdKnZ(uc$@&OSsJoR+a0G{1|NozJ0_+)TR1ZjMNxc)godmwt4rh5!|vn4j7@=Ek<>89 z(A1X+Xn+Y;-8iznI%cD;CrpcnCcOLg}r%}fk)k=)5(JX7BqE9Pn6h&kdgt3>bt)vp*p)G8Katwka>Y9>5%SY-aN5q}TNN>e7%zZX_|Nz$DkGQxI%gyrzCJ5fsU zsx8`LpMdk>yJ~(0rsl4$DT_`apK}>oo%68FO74@_rF-bprN5oh&Ajr8fB!u2a@sIs z4?UY(rSLhVZM`k=DJAc`*QQDc1D$vB<@Mt~%lz3!utsAvq(R6me-HRUZ z-^|1Di?`U_X9gB>)eUTQ92WCyfU_jmsqk>0o_h(SLcVHbu*uuhdjsvNqe)6wBG@#7 zbX<_5VH&ztoL>a_;hBB5BzLHo)h)^5gy`OwcjL}<*W;j5BXQERl@ub2an!4}fz0Gs zUKP#`@4l@*QaCPNGdB5j_i3giy1+xDXrG>nF45@bCksz7`j<}&3GI>F{zPRX2mVJG-0FJ$fX9Nj5v-U94xmEV8*9pHhtn*6p1vj5{l-0D+0s=<}W)gIZZFZh3bzqsIcPrpqX ze*;bYUw_ZvKCVzQTnRPSnZNzs6?Rb~ z`xSQ4Q}~5;*AwP1w2Q#pFSLv5i(hDW{VDuHyQqHnMKYkL@QY+vX9|9i3@BOtA{h{~ z@rz_YPvIBIuug*gA{l;>3@A_jA{jQH!Y`5morhl}19}R-NQPe|!ym!%7s>EFK>i{b zevu3aEg;8__#;jHA{h{D_(d}OA{o|8>t7_pW_kTX(k80xoZ<7ocx*i*BXaQ_;2>+`Wzor`2OYLN#p!eJVcN52$gP;$4 z`93IZu)_X^L06V3BP@nBM!VC7mhBzAv~lw`%Gk|%_WO&#l;25lYUgrU**DVGdY$+8 zAN%vmWc-E$T1+n7J=QN7osm2MNw};++HssIsShAq=K&f*oJYYa0d0}%fYBW*sKh9< z!-5tDgcgKhqeL4v$lSq!kr`|b`A2hD{`}3G@Zqm792c&v?<-B!f6h7ZFW)4wf7nhW z4+)B-mdejL%=$`)nxe#%v6wwI_>LeNScW!#Kq&iq>u?1)vY-vm*28~VkZo2te|sTR zO>FRvE!xoUZ-4#=L-5bf%+ZhT8wjeFN?)NOMPh-%hZ>?KrqsVK!2Vwah?KmHkEid< zzD4cIn0_@Ao>9ao$CDrZKJ@i4KEbKAl7IDJ-X!P7zj`9IS3kgVmpgI{NIZhp)zUzN z)JHx@772a(v+0Q{!=BV9PnkU;%4Hb%Mj9^RfclWl^k^GPD0nDf>f0$Vgg05d2aBf( z*tRYMxdUp@YB3TI=3#}+E9Z`623(=2u)RomDG|g~(!h-k>lXkJxNy2_d8n<0qf%V%&@m5*V>LY0Lj$vHl`bmBAMpOr=j_s z<&>ZS@}SV@&DKkoe>{8KJ(+_1;=kAFd7eo8UZeUVyURHFwxEN#F zrH8h1Y}TMiV^+)AR$BS&L_dg|xJ~CYnS#}jJFK*F%Usuloi2J)c4M`=`dH6}#L8=_l}(NTgxYft=)b?pvh6RM@9GS-#-6qch}Of*N^K>Efs z(w5>U_m;y$NWMR_#rJ1tYyI*$;)>98yCyrVtrFnvQT8lQW7v*^|M(S1sa5RSEkql~ zW!lL>*A5E(+TdW<2;$Rd53mo`$V|V@&A~F?S!9-lG~6gH+qEV@MTK*iQ{FZmLVDp1HcEBz|ZTZJdfxRDp7GFGggD07D zK0FY0rT}d~#|6<3;1=ZK%A6PLX>3$Q91F4zmtoUW8>CR4TJ_k(crLV=Ou|Je1Kq!) zNIDC7x(&P_o0t(}DI`H6ry#L&0(m7~byVJ58sB_>e|ynD48#2djqH9uew*K834FL+ zL`Nw>Ln6pDY&ZO&5;5*7NMe)|2`)=jMsdsUX$H5k!$?>jEaMJ3)#N9iJa|gd4b)^= zKy0Jo_(LCLipm2X1IQl>1D*tJ7IA+{OKtT``EWa%ZJN*jt1z;YOvw@C|88h5!5%zm zF=*BF!`5W|QDgK82M#%cG!WO#oR5zatK%TxYcBU4n#c%SVMNQ!GH04#JNF0@56d*| z;!04YErXp-ceI{KztE%kb|Jn3nZZET&@6qj|HSblKKYz8{Ki3T;PN16TL+gS8TrW= zGh}SJc)bySo9SVFNr8Mzh4sIDO^oe0aPQ;y(BOulDASmIs~1yF0cq3Gj~V$2mxD}S zRgoJkSu-LPl!U}|(GDF9qxhfztbqsdZ;M|fPllMKbR$5o4GA+IU~PCUe-Z4HZNM~j`Q@UP>(Vk@%ra2WpzuC*?WUPI zx7JS%fqynE*9;^TXZ)+T=g)!*Ss-a1x%6sk1i2oEsZBea;d|F}y9{GO$dF1vc0ofP z$TXHh;GhDf1kk$-oFg|*+>d;iL8<3dE4V2kxfkKWrf`TG{6%06fT^e-bwul-pqp%@ zaqZ2*PxiAw9U{|EYl>6-Cm-(n=?T1g9aQlM_}(4S58jU?r;v_ap8!F~#j;ae$kKv< z7%jY3k#`r_Ex`glQ&qgFTLkRi^C8{##8%i@X0p45JZ-5cOf#h;D>$={u-_;J9Rf0> zudW87I?~C)BNHG8cxACeTJp*Dx_ab{Qx4JsD&~P$`*Q$R4s+jB)&@mFPOF8$pMBV7dMWwQ{r~Q=1F#Wt1b1e3WZ$*Rk`=AdGKpP>B&{cr z17}LFh4XSFYX#*&cKtfW{290!s(2UU8MYbK-2gdEj2Lt7Zw$ZF;TjN7aGBI8NUZWW zDVr883`CuY0S^iN5J9KQ=fKZWS%>5&7v$9gSSK@jZTD53_b&F(g=+uuc7>>L;Hf3{8!R}Zn}B54 z`@yZ7-l1WQ-Zk*ztjnPH_FjiIJ7m>50eZ=0;2on45=e~zu9WV)rPXogY<+U{ob4qz zjP!Y(koDG$vCvhx4?@#X0tg@{f=tQ8yRg;FyB-*F?vijwa#!#QMN-O0uKGPNL8uZ1 zXoNr;L7eF_kMRnVAn0)&X1^gGS?_&Pg8rN!)mck7NHY9LX=w;FN`@EWi)G)*7c2|} zO@I)Z&%paUi&02ewAvsXdb7bej`fBh=$AAZ14!s)>5v4!zoz@3n)Fc#4p9Wyp+r3u z)m)pY&CPWfCkLBByXR*{t%JBs5i)wF5k$~|?7(xt;Yc)PYH^Vtq=Y9xexe)<_f(Ay zWhr99bmrchUjw&8c5GeTfFshbsS{jV4$Qy5qUkt&i+$zG)!wITptn#8xA6G6=YafJ z?8e2`1dyKXjT!dRgB!BP8~N`qUkjLXkP+jww3jUY_Su8$ zNMJEN`gX(l-DR+VFRx%~(Fj&?q)4&bUTJU;kb5#oID2*kc;ub~9C9Z6V#b)*c}erf zZjm2ikQ_uZXgXR$jUcw2B7-1)A{PAB3S=U8pSEk0FIrq#-a$pE9SEo05PW+aRs8C1?iD z0W>^lM zP3HWftdi-KlXh!NgYW4`hTwC!c3XBOWm!rT6+=~sa9%qE<7z%DyehHncdHpQ#Fb~J z%p2iA$=dde10<4pqF}ei<&9K(sG(j^{*w7lG7S(R_sHD9LPrnwhlsj?4d67^ZSXDQ zPg3X~973jnBZ+u3s3^&tZY(J9u93JOp+Yj?w-v%f+d*!06r^v`zC`7M*Wc+M` zH0~mQ56-CLV?Z9DJRmt#bwuH+1YSER+q~=pDanLvP1~bL7P6^g@2Oy^z*~4J^moKk z@${GI$y-r2E4A}5ma@YlnhqFbIXIf+m~IuVv6t~N@H#0bzIp!ECt^# zD6c6Da&e6!+U_lRd|k~c1TzA1}Tp3KqmaU z%b@ZkTCZxW);kNvh*v-sd4wV8*3+2lStP+0Ah`B?CF3Re3A=HT7not4x3tgQkfd9a zu^GspBr{KJWi#{r-cPv;wcIzSWq$Ot%(?#ajyIUTRi1e2bK%jQlN}(O6FiBVJPW#O zr`65z*!ZBT({59@VmiAm{$wjEMDR|7#cK$P+nhwc5AspkL6TUOPp%ko*i3$s+13uO zc_c_ZCCRFhsL28X0*%ZQS>EkZ=AmnFm)(F(I3*)h8%${4>bEoxUXHGyOL~^@+;l_Y zwpiY!Q~4y5SP;JS!FfvlE}f5N2-JI?NN{yT_H2vgXtUXm<6H6;*-H$8NR#EZ=HQ}X z3tJFay99Q*eN_x8!BgMv&&a_@wioci#`6DR%63$I=xK$|F!lx+1AC zq(n8F!;8N*n_XP?%5lfbJ;ftEp!2033|L^IS@v2PDk=fo#0$u^50d=O3Fs5W9Gx%i zX?ioJ@1hN#pAdkOlGNF*#p3*M9{6m&LJF!U3U72_jTIGbT={3gZvOeX@#&SkN-ztm zGaCrd_tVgvnE`-r975|p!5&|CaCQ8;-Bf%>Kg93Eh1-tvd~mu*#9u!Zz7=M_XgF;3 zr2na)02sDr`YWb5Mx=-d{ur!tjK$;k{JV#pc^Fl!e&^Z%nO`pSTd$^?TM5g-g&fodTT8C23zv;@(}iWJbaMoRc!;Ln|Q9S#8{SX3V(XEm*< zq>k7p+hRxT=}zmz^K^$ymm3lO4BW3dp^R7SPX1O85{!y;%cEf9=>c-HBt2_TPI>@x zc=4qORSEf7FPxCk5z@C}1%&Z9E8b!m3DQy9O-s}5F+rCdn;b=TwU-?^qv~$Vtv9ytATq?9> z9TFDaFPs|GE+w?{Swi{aJ(%Od^J_gBs~4~PrxVreyGIyS*qX?DZ(P54$wqKo;9k0d zlI9N*$%tsUmU?%k-M=eJPOCd2TXevQh2jSbAY`)bzQkeBK=aO@!CYEmmyj=Sw5J~; z50F)vWo)^9m|6Le0Gu~QUmjPpHh`2~|42d#v+W}UPZV~9SCJ!8P~?%Vm(RI)*2$4H zqeXs?9Nr)wKviSTZSf{OZ|8pBK^W^he!k<}$l@I6_ptwZmIzZr$rJc z<#nPwiG<`vM=X7NVwf+43jhM;`I7oZdv$ricUJn{sml;9kYc5=Z96Gf4=~c~y=QTk zIE`Bw%~z^q=0RplKh$Vxc8tBPb*=gyYU3-ADrB6Tn<7XJ`)Vpg>OgTSLH{`V4Ap1=C=`r#VZd+Hjv zdARhhtO8tjDVuG$Dtw>iW;-h{Z~ri_Nk}( zbKla?S_RUyefOHjY6*lnVk_2 z_=r=(`LhNwXusGL6McAiW+1O!4)m&qvPcqj${2AhLT#Wj>>79l$nd_{mmU~s8PM}t zjF5oEI|kWrJ|?6hM^QPlAvv)Nsp_J_mcz93u8Y`F>kp3pXRW>-sAvLrBRgdJtTtl3 zu!HW~N!*WeSxM=$q$V3-5v7GwFG77-i$=mMajrHDGOD)M+c*~#UwiH zAUyjeH3y0xBv>xi5cw)`I(rn*v@5tM^U6x;lG8ZhMX!xL>k0b_@7t^htnZA)=jjcp z(|<0@zTX5IXB?!-wv51l;OfNF7Rx!*EJC#rBFeY#0Tk$ z^1yj`3B3xj=~l4cY6k_${$TSu7sQdR!j-ggnNqf#g=+hP!44)PUI?Q+2ukajB~#Vrse09ZW*|_0s3f&2WezoN0EKFz1oCWA;=Gu0R%SBt zWkBZUzJB%K44%5nm(1sE8t878!9Pk#w=;Gb;TERFo59w7F0A@>!IQLNV==3!l)XBY zQpV1<&2=Cm0(gC2UnaiL&|#9u z)_kjR;fnh|{N8}dJQ!QJ=B8rj-LRz}%3ph~oX0&Mgdk8VuG0-guG6>2;$<`82EtA@ zk>8@AthpDrEw=2f%;C4t8#+YAcUNioHZ}~os8|EZ8xb$$PonGn;5l+*GOA!;u!iEe zN}#oUoa87QcudU0M=F$?ukYhlhVoYD+lSX?ueR@Iw9l!kr%R2C_e_~)bb$LV?;&l; z##{KU1fLXgXiH=-s2|o-f*j{L-Ad0gmXx&m;idb~pDF{aslLZf^q2+;na6e-fnYpx z)4gy;&b>SJy$}dAs_EKk$XsoKuakPmZoS2&NP!q|h^$_YTn83(C0)QhB+Vqdvln4BEN2KrYj~(itYzEH`Kf>)BrR%Sn9_f|w!m8R)gVT*I`w0)zClNgWDDWdUVY7p+V0qcHgFpIfx&QvLO-Fi^D`3!x!03~&+~g6Sjy`sOE>{JZWL+ET=1<#ybX!z zgOYl#+!(sk;BF*Fl`{0oVY$Mg^kk5UOkl7S-qkiZKC0HJy zdvwa_vM{sKWDOZ(@!tL6&DR`odTy5yVw1lXa;UENP$d?i_*t zuiDp{9ugB~pZ7%vTl^HNOC#f|LttUV)d-BKQLz}%ziuuEPhEyQUPUB5y$Uo}PRWjN znRf_`)&z1g$gY9dw5uPJ>Wz5yf+3_c3!V}*1GPcf^B?g9pB`q7DCz%9WNYPnNI7i@ zu$wE%PWeM!2#N8^B{DC@x!z069S97#In`FY%88`U+rdwRtz~Yki(5A2_>WIvvz4RC z+HlCCfY{29AMVD$jMN|w9SI_=s_yoZOtJx=6J>OWSuLlTb+_L&8mGN2t*t8G=IU+ik~=vkCULteh`~x?`}OFM zjdQwBi)}5GrEI-A0D0@x@~uA_B_xpnIjFHL6&3c5-IKPxEY}BVpa1c*NOXhqqfjo-t^ebn`+jpRe!D#q z7XvluIks-oO3!1gK-JM2QE(WfmKKG4K6KUb0u~!JW7K}8OHiy@_Nrv7{@tcX5z;1i z3hrd|g#t8Y_nQeoxxAW4@#8A+*!27wnnaTRXO78^4(9@H7-MTukOQ=JR{pMaoX&7A zkn~?Qq4*1ZmI{ESA+Gi#=`K46d1$8c9pLwy0bz3mNVN#y zT~RSs3J0A6Vqk2wU-yPt@?W#@W@O7fU=W;elVj7YZHpiKnQl_iP%3MF0abYKv62Zeoh++ZD0wM%JglE8^+Z8a+?M8cfYVbm% zM3V&aw7#L$#2c4e(`P@SQ zQFZV8AV{!7pU3I#4BW9NfETQyA`!ChgMSgxvVv2~@1{5b7L;7ei-n7v)}o-g-snY9 z^$S2i>6)p)g>t+fl zSKuV;=IP@(s)U+C>!<#{jU8d9==rnC*=YEHNj+{f3p_V2%u9oHh@z1*YL6my>Z{|G zdL2bj1|c2w&(q`=cOGt>w~*u4s~QS4pki0z7X!mSrcQhY-{@(*+?L&2w764rSBGsqWuB{ z#WHxDxRNocG*@Ek08Sy~3(?v3m~{%2z!MO4+A7<3ZMcVQWZ}P-okBf`k}wsX{_!LE zUaW-g#FY^e!?UVNuZ5r|d2XheF^X0-1AJ_~LR62QYK;`}t}%@oG@oEHzS`K=);utn zYd)yJcOO1x`eK$zM|zWuK5rvLO-4PZz<4)yqVxxNaMAr2>SSXiYt@4AWWUM#g+=VD zRW^Hh*i@l^F!f`ktM>4pKk){pq+!8X4I3;-d?fJKC$jPXH`2iOivG?Uf@P`HmkgXo94$ni-M0(_ez2@Dd)vG z+Kub`BLpFIJzys}gnRKec}r>XafrX}FpkET{^%>}^VIqbH5SLQ2qT?TE_i#cCNHU5o$qlr3P`LNT4VU6pfmwOg6S&=% z@g9r-doW-;lAe0FA5F>KEMA#4>M{mp_DKC?sLh}0g(Ohjrezv}HrugEG)M6i2D(n8 zF-3A;M0-w;v@jsi?(RH`gvK<2%YXYO|9UJONsl$$7JlD+?#H2U<0(}ZzXC^jSED9pzaV*rh-&iUVR5Zb!Uxvy$;$StVaf8uqlTM1G5aRx4I=M>uA$ z7sG(Me2Z-)O*y(#^65ircwO{sJ*s!p1Bt+t!Wbsi*uY&Q1h`*z?Idh#`~c=lOpHVj z84yM85PXg;3Df`hGkVH8mh+B@scvS*AHGPu?ntsMMbU*o2TE=CAp>Js3t0D(Frjf* z-J)plTo7wc23}h1iy}4y!g2+;lq<`>y%W}Tf2#KRJvV*=&kOft+QlroCX24g45_s; z;o{0?x3n&0Q ziDj|si+=PUNCW8(JxWtPoQw>&CD8wO@jntQOyt*LYxiS6kZ0cWC9a|q%17@}8jyUkjEN7rsD1HtI$~>_aM6v^DDrla-5TXHE zVq2L;-U@m$G(K8(s)}j_80eS&G%550s)x zn>t$l*1az%e)MQU?O21ACoKXOcli*FzV|SSmR)Ku@ilVr_B#n~cSP%j0#$X@@je*A znUWmj=Y+9#lvX6N^ca7f`kQls&gMI1b@9hPwh?K(Ucp(Ugs2}U>jp_$uc#LSB6}Ns zVlfWgryI~{3pj}kuN9TDq0ga!ss=jQ=+ao}msQkRnodGsJeh-w=r=W%0u94?=zy@bF)&Kf7&j=Bb2?GA zlxA=Obgz-P34U_{#?P};;fk_CVFMNv7rwxE6X1LgE(Tj0{cjub?jN&)&`~5iZzeze z)8zt`TCdX*qaIo?_`q54y@O&;zo-`qe;r6J)H&Rsi^C@g9Q~8>w>fI3j^h)`7jXZoI0rFmy*M?~fw8 z5=>k@e`?8tS8$PnmHSZp7>uZno7)LHfg6NBaV(W(`-=!IGO8|zwG|zpIpSsq;iIeG zm9U^j)DO|QX(Q9l&kK?{Ls(W0bNb9M50BODC8 zdS5}*n96~nrD|$sBL`TtNCR%G99%$En=NM2tkE!)X;#Nc*-jLA@jiD35bakO`WhLM3GYJ^D%nY-I0240ii zIDlAk9T-(W%n{x`a&Se8UYt(+3T7;%t$TtB|{o-}H^ z;}wfef$g7IA9UB$9`Oa=I!op&BElaK~I5A$5)wR*?FnfF-So4k8*+mjIn|}v-oo$H%`c`K!HyNd=7&e`>me! z_7yaQdMY_C+cAl5=Y?K)ZSyKz_x&(&(7PakQ7qv|p)eA9A3arwf$Q`$un1^=KlHPgZ}I@RLY*Gzu zrHzK-PlerQ^OmJ4oo*V#8nP|cmqG0>lzXh1_tlt%WNt@ zhbiA}>wmxfmrTbznxbb{z*2S=@Rqg3-6m+mQrwytOKl~_!GTKCt;^kiAvplzUu+x2 zcr=Io#T;(>QTlB=v(0yoNI!xtDiW%oyk?fqqbegB&%CbxkoY_8WD0Lv=@33t1-`I6 zG9_z>S(5NJ^rV)J!@I|ab`QYOChvQp`$Tx)%8e^qPpsxbX1vXf-gY2imCrUHDm`Jn zBO0{fk@t_2dCjfp7EF0DLTt1*K6C)}F!&M9oxjBt%< zTUt}UoA$T_I9h6rQs);WL^SvcBE#`#ie`@S3&JwVXj@J_Nm3CeKq)~Sy^D1HTwJQE4S_al8)$vsDb(g!tn{&tpphx>sT(fLMH$De#+p*Q}<^NrF{i3j%rCwrtH z+r>HCB(EKAZQOnol^}w+VHX|E{0i=F?Tq?;0jsQ=hXNAzF}^0*BjS8L^NS9#YHkST z+y2T=RJIUTMm1;dE#?cH`@mHDz|pu-t2$BdI*fi5xO`)^9;WsRUi`o(^BiI7Ny!|O z73fGNx*!l_H)PE{nQ`vkc?DgF$F|L433^y3=B^C8*yygd zj&?{v@1TDst~;Y)C+@OUPz62-Yiz7+LY)is!o;-h_Wde@a|!1SvxOQl%*t|10KD2k z>_vg{WODR2RdBC(o@IuFl9HUw67Vm5Q|;ya6p=VLIl9NWUtj%P6R#Lf>N*5H z$3D%=&zxJ}Bk=+5`rP|Swr?LtGbvawVp?8vjcw91$Us@uLH+JO#+hc9Q46ll@1#8= zU?D+q-Dc*3Q4%>KW5(e)F0}kL(cmUYZQa)K3MjYl8X0Nf1=;uL0`~FMF)N&8g|oOd z|7}q81b&jo3F?{s_atj88(k>CZ%0Tgih!Q0Hj+8hAex92)9Mj|B}F$^MEUP6SgMwemq z&YT0C7sGf@uKg|FZoPmvQ%p>4=DN?n)F7>RQYTMb!hAXz2OS-{ z*7Iwk8qpNBAy_|O&h;eE84&1}o-v_`Bx0yAUDm8^1JT!kb%jwzf^KsgmY(4g6Z-(u?~t9L=pNZ_f)pAIOa*!yLkWD%a|*^dN0f(c`a)jS9ED>$G!bxAY1x zOWNH1nJXA75~ky?-WWPD7|Be}+EJG@@T1`evJ%Oj+6OaGK-J;GC4;P`cpZtI0YGjf zYekp11BZqEM}`V-m??PuLyvQ3Ej1>`&FE*p3%7v+QYCebwm=D=vE4s4+iFJKK64;9 zI;tLbmsVQ0buny4ktY8L)cCTbE7c$HSx8_#D^F+X1@bG_Ej~{NUB{bIFnd?d+EMIt z=XM7p9lNjqG_wox0&^AhyUHD#3qUQ&7BD(;HHJW9Ff-RJ&O6eZzw77awk)Edwj31h%(tWPvFyP+6~!#2~_dxu3WeC`&hl`5l#aM1?YQ2V*TqXREdI5K)O zdbgL5G1w#ydsVFpN5f@5^9r|M#95YYDlM1xWJ#7EO`w3t!L1?0(oTR#b}uo<4N6lq zG5OecV~T!JB8Lokw5aoLy|ar96rPDN&p{|jVO*tCoj|w85Eha*iW4Lu*2+3YU_0r4 zI@uZa1X&qHSeN@72_lra=H5IZL65vc6;v82Uj85mNY~oYb7_Kl?p=Cz!srLc+TYf> zyyBH5k1eHIrbb4Dza;KK|EIU!bHgl6b-3s56}9ZSBQ3?d0erwdz&dl*#&qJsny4v% zpl3pyWIS@7j(ZoVWh;tidjIq9R;rQ&(~!BOWq_gsXyoY`{W`;QLW#nQs zE3Pm;iM47#XQXtvsSIlzCKd-RYnjbV8``oS5Qy+k%`e1pGQGv{y%C7!V*ud43LAz_ zdJDx&NYTvC_;`*dcFmMJQ)=rPMpAEfG z&lv_y_Oz6ldv3GSys*baH6dlowC*YVptFhD0K$o$jhZbjN@X7d7wr78lqahKdAl?h ztNHam@CR??1~ER2Wrsn|m6!ZXuUU$RcxC1qryMGyr^dGWs_=PDumL>+^QnAr zUWeEmEW-?jv+d+sIg!V2;`8zoa(d7RFZqR-^p?!hHnu?!2#ic_a}mpvw< zE`yqlcf9Qf5`9ecr%PdwXj`SGrf7QgZFku`7BJbf(iPfMd??Huunf~xv(u+|myM7S z!`y=VvwnX9|MwkdbsNiI=I^uq4K~!beW+CF6`oqE^a@WcReFV|mMXo%GD4MJ;h~^P zukiY!P_OV%P^edk;6fRQfwh)-b2o^xz`C*s8^Q!4z9G zA|=o1e;k!D#rC2fW%o$1Ta1@atxe^j!_Dc91I5(BQVUD%*#DnWF!NNjw7Yh%$FHc5 OV(W}7KhHMU68bOUFw=qn literal 0 HcmV?d00001 diff --git a/images/weak_scaling.png b/images/weak_scaling.png new file mode 100644 index 0000000000000000000000000000000000000000..59c3cec6c6afb1326587783bd68a393dc42506a1 GIT binary patch literal 433007 zcmeEuWl$9C+b7i?D>266+HZ=TooAB3n4D3ggWWMzsEx4aK z*Moao<2&(dG&G{(f(~fu`c*iXSzkPV-iuL|(4&Yau|Y#;aG!;$*BHZ;0+Sl|r6cCo zVfqb{=e&eR)TOEK=q%~ZuDt6@!nWWJ+L%{-U5X=Nm)}T7x03uJGrkaGuQB{nPEHtO zgg+gwf(Dy{M1_mLq`A6<_mha=dyld;$VB!(9J-Rez42D*_HBu|o9ZY*!n+x7O)Uvc zaRUj{?tN?d)f^ubv~)|VH#;flMbOPFS1z?elT+!hganFR`Q9Yhog-_mW_CN}?Va1W zyO{Gl3Ig$y7ffK~1k9hwTPQ1Iu!6_97?`iEF|ffSOz%;g zah1_e3Q4+tz`&5eke89v^2FSjzTRLo+<0w=OkV4LE~Rv@G(pai`homtFZYn`rU_({=k9E61(DnS-lqWEtv0eIs;LgvnYcU|cuvM?~|)o`-bKm5yYOo#o0 z>tEXZ|LheZvxEe$7UoLb{eSt5af6(1{A)+`e}7s$LP9KUj5?0%|MDAy<9g-qas7K- z|IelPcOLvbu79iR-|C761AnXQUoh}5_iXIH)%9<6{aaoCR@c8&>o3*%s|@^A2L37o ze}$~SLe^iZ^;dWOtGoWS3I1g@|F2B|ten3>)?ZiaudDUfg#By6{xxC$W(NLd2L8HQ ze_gG=uGZf~*1w#~`~Q{5a+UrckwUePEguD4VEY5&nHk<|1w7sdD?Co~Z9L{b;(6>R zDn=@8;T3JfirF)bK6$)wLvH)hZ&$7J(%ltIE>Gc6RHYe{%FoA&^{iV%uh*U$c(3MA zQ`BZ_F-Sa@B)-k>V3^A9Xgsd0Z}+twhR0T{R+WMG#{^ao19qM7?zPyxfL&(Om#6=e zq}sz4tW5t?>$~hdOJM)s7_u0;}0^ z8qNMF=TUpZ)oG7e-nn+%vCNJY zJI!B;>}!|+yI_&1FO&5+=xnQ=6JN&ba8$RsY~FhvlB6QC+g7m6nBQr3^4*4^ zCsa9(0cVp1KFBR_RV3p)4gRA@9}VvxYC*{0mHKP^^W{*n$4c(Yq$q4$~UG z7Y7^T-0Y19umZVQno&Dr18w4(U%~h=ZC#_$wJ}q1HtY$8!RCNV*QEqqzR9*^F0-=0 zINV^Nxloc+h#5UtVe*$j!do@>(%mPlt+%FX_wARn<>O!}mKiDGw`emsz*|=gO5ykE zc4GbcjOAb%E&Hjn8p-?p`8AqB)E-k@ji+lx$`nZ`KlI5)&ISaGdynS7`cA-~T|OXI zudPgQSKyI-c^@A{AFcqdX}(9$fcT{d!JWtVr@@WV%U5(EnfhTbjV*28csOR=j5<>E zJ3HdVEV-Ku7p{;v*NkdDU*cTtpmJ0i5ZPBMJjgV*tC^}b41|@b*PFyUa z$CSutUO(iS(&JAL*LADb7GlLSZVGRdmK}bUGTe8l9M!2^E0h7LV+QPb#~bLoK6$uR zznyXQ#%;B#nYIX$VfRV*Nz`$tIER5M&YzcA5c6_VMvxv#@~31WFPSGe z3$UMG&1@&+q!*A=E~?*So6s+Bp}^v{>{lXsM=__Im*H&_8PNi{9*EalpqXbOsUdzk z6bl2N{N4P=$<)Bx!r>B)_~)qY0M+`fI$NQmpJv0us^Zgy3+L?=CQ6sPw3qLzrrjqG zR=&iJoiSQ2MA=V+nynF<0BQvZWMa$8>Ud{v6EKJ4k9^AXmUD5NU5rcj$n z%6n?rEeX2DDPfjx?{Md`et)wQjjJLI7TtilMJFMsVz3L{`n(Y7NjHU7vUnV(YRGML zj$mnyO(`)VdtDE2*mk_VlK`2DsT5uibCfeSBBE1!Ct-MI1S-tx+3cmi5b9ufIG#ll zk5V47&L_6tFR<{o(9RQHVpIf+c_Nh8qR818@AF*_6rB2!NN}Z|5n{6n%5}_Aqq_*s zi*oxbe=4(Tr4rS_XK>L!Jiqb^tK4Eh$sD%Y!wxrs;YtqUb{u}QAKbmek4aRDb>wu zKomJrY)?Rf8WbDP@5FNy1Q1^aTKR3NC~hN z82r4tnhZ{kHfjDr1Ha$t0qao-%b#v>Ryp3WE=IcBz@)?PUy}dyc}MZ8b+5s9SpN0c zJpLKl^yOZAdY*;M%b5V!>xJfxFN$j)bk0vxt-p^}DNvQ{K?@KJQ2{5Zv>JSvd$ouR z#WaUH>pp={iP5(@-!Dnu$PcTEoxIeGO@1Wgx-$6BLgNSF21csINrqe<{d{6E`H|0F ze+>>b@dishOeA-A$c`HS`bd2X4uzpwIYxfCee6K^gCxVp=>Zof1W|aG{%Zw!V>}pz zFRp;ul5y9?_j0*ZjtyLH+cS+XoIcHh%g#w^guuonWqjneHnQPu4{p>d=&4X^36~dC zdQ8-_pzx8!W`n+F9YVm@75ti9-|ExBxMgMz|BUAs2QhB8nJgj07aKpnt`17y>f

m zUNSHMplKZa`!XsaN0BUqS~wVCL*@`I&^+a~9I!fE?p54}W#8&FH@Wq;fy@3t_V$R* z`es6+o^QKOb!`^J6J1FjV%r?73|UXrIN?*KY=TJ}30W*zYzbbPU+5Z8=eC}BQ~Z3V zBMPOIwS`0D`76kER9m;QN5HY^WVxc7F)bUcWuDQgZ9&{^MW1l14(!Pb)S0EE8+&w- zsPQ^zU7~h}l7SL__%}c!?MvllOLyrPJQN+gDO86$6_EvLFpqeU*d;~*D$qa$D5(e| zWl^kJZ1)*21dq)~zM-4Pgms?bAwbW8Ex`dcUQ`=|FSs$_2s#i0NP^J1 z7&I}{9lY-`7C2O9b@P#ILKop9bL29=RItmJS<1z6 zd;O+C4HjU}r0&_&&0oYd;|A4<%HGz3{Al3e6y?A8bZ|z`lsiQ7dbBX@=J1vrt)Euh z^LNS5y|x>X?zgX9c>sex8M7pP0o&eUX7giDD2@N1geu}XfZRkP;d(OroV~4@^4EM5;6XnN9U`NZYOafS?U-m7Ue@0;1by>7i#=q-7^T? z$_rb}lZ+DG0bKl$R{evu=p~{HVB5-<)SSOtwEyCrtVAn1^5$-T7~W|K=GZw956`}4 za_o{S?5I+2zlPuzQ?3?0nLJ3Z^iUDD_qAH9Q|t8grHwn9)ZYTtvH~1|s>&2-vAFN? z4iA7P(1-i@U{{6WS}LrEVY{8QnajNe8|C)n=1IyvTeX&_TOQ46XcS+u_(9RaF4K2U z9t{#;QJhW5Xl`_I3UAKUE8ly~*iqMf(&q>RRdg`LJ;Z)95X-5!BOBRZ9?d)U6p2r`-)?{qBM@ zSzyxaZw;>c@egmOi`t_J+w$jm4T!AQwEhB(`~AY!VPrbT3I?0Iik8!e%@;+tu&@j7 zl>(}>BqZJ;=Y^kg< ztVSQCO7*P2kN4?nUitH*aD*jWNsTG_${l1d9(sw2B(Q(U5{__n-i@R2UiowYu=&e@ z=0pRB+J_i|7coN+gU)jU70c%6V$`hn^hu>6TS`qnuI5Jx{YYUeKcRTOeoHObIf9#o-0 zUpGZ|&~k)f>T=WxV}>q$8rlz;)M_N|4trhcXNbSImQ)vhvdrI{g2P7bv+;F&ambm9 zVlEYK65uDa-^<;j4LM!ECl}K@1NWj5+iuv0eY-|GWp_v;>iu#~jp#nbWJX?FRVVKp z04k|@-rDP~V}8ckGSL79Y6fLq9OPMpE$^9@`>5IZ_D-Bb?NZ_f+crYqm2e&C1%7B$ zG%%eR$&TN)AJbqC2>XD6wkVc8`A_E_c1>Rvbm4i_+;Wx5?%llNpQk=Y^q+sy_sIGa zw^Q53BEoQk4>xmyZvV~dTsS~Az$0(f z`}kC?%P{T5&T9*k@^=!AxKp5>*M<{+Q`I$elI;dCG`#S!ANnxP! z&|%w1&VH{#vy|Vw{0H6*KZobut9^Mt#%!T@j>=xV)2D5+)<+-5H%HBl4qd>8>e`FM zieGr10x6+rEF&DPw+vyI01}3VMoX0LXjx!n7Eo%JQAd62SwEz5=Uo@2;r0gsv4Lp- z(LA5s&r(CTB0V%#sZBfU zR`vf-zcnOBn_lnmz^rT9DkU=AD>LcS?qg7--?U8FLesYz_WRMw31a4R!1)60D&Rfc zMkcbq`$hj-#|s*}?rJ-2GX_w{Zx;qXyk+?E_J=2LeM-0ZmMc9>l%zB2`8H3|&*PS& zX5Ij%p>2*!G=)}Da-jjJb$bVBYLJFKjZ(_hnSe_a>5E%vnJ1OY?8(~D7dd^?hzA;( zVtz1&sD~OBn-fjvCx}b{qCM|XO@b{g|JvR*N2Y7Y#a^N2hswN1#LR6C%O*9;001~r zEv?khecZgfyB|&e*>$L^X*e94QSOm`x9RT}6_+U-HBLy!EY!?Bi}Bm3g!-``u8p$c zKaKETuWUQp0=htN@ItO?W=6CRptk)pAzulozxHN`3Qq%oQTv((Qzgx=On&RMPg?}A zvvM)<<6j}m#uq=zUipJN0n6~&I!Ra>J$?|gFEF$45e=M4Gt~(Y!DKqm=!N;UzGT`| z5t;dIR>I2$OJpgGc-X6_oNGhESntT)T?Z251&pyjYRIIft+q@ti6emrPQTW15de6P zrtetCfD?y4kT~?ueWTakH!KBSx!GmFTT~5LZ4SVYOzl2dh7nG_$og4!T}UCc(}zhl zV&LbtFGAR>(85i}*lVd*G-luV!S?MRm73k_;Be^9XLA7VWI!jje(lEXcdREHlmOG| zgT`fBHb=c%cfZ7Cp^1LWjdIH+QO_bJ^?0Oi2sg?P@U*h^*5g{Bw~h0)H(qEcvTLtz zxm403kI(`tUq_9*O--j`jXR{;lDL+ykSw9_uopsRL{X+7I+N)Uy085)6D`nfq&#ad zp`01sy}{Y9dFTm+r}|VLi7`|4C+s!|#e}?iYgy@(abpr(-YVp~rVY*xl7X{%amhCW zIkydl2E-^k4gxNb0prQ0w6AqV)?-~f-l|%sNr-Gm4Rm;3OzfPai&byNi;hZppTw90 zkB}qRA&TGLYFt7qNFZVU*bluwTzjF076VOVCx&gWiC7JXGLlt;QUWlS8-vl^k!+eu=!=_fU1$?YV7s%#IQPjZYxhh|i#tQ->O5%kH z03*tv9dVwfp9i-6k@aQM{+DO7sRT-U>uc@hrbVN&>`1#x|Yk>Gy8<8`%&(4>S{;1QOI2EDQo>_x^X(ge^h&fe} zT^UMl3nd$tb0(>Te>7Hr#dCm!$&z+Hnkq&$7V)YKjWY?F6VIDKXW{oziBL;M%bcR8 zD_>Y*a+Vp#o=B)DdB-AW(MUEdGhrjY-}nvkiQ?Ip-B)I1KZoS1MLi!e9#Q)r&#i)f z;xOggok>Prasquj2hcK=Cl56GJR%`k6( zs$tv%BEyCrCVEweV+MK0Hq2aGxNCJl5iWCi_I)pW+ZIx(Q!-E%e%j}E>$QO8G^kdq zVHCzs7doO9cl9O>BleH0n+_pp4LR4yW11IIyPVpmZ3{Q&3OWeJ=ja+m0HyXM!5hd; zs=6kAZjlQNQsh)g0ypOc*vX{$fc)Mus(5i1kpvb9vbf&YNZ7xtchG!!*4)>4o?yOv zGB)#oQ4qmGq@`o6W`0-GON?_tsH9QRat5(!4-v=0jwE}7(Oh#4V3$s3g8L=3h1!u# z94GskmmlX(vkb=p${0RfhhCaj;*z0;GcV%NO~69D6V%8g*LE^pUp?^*SrMR7nOw~5 zNIyu-*}C%ug8Nns09VS?K9p!M!@NIViP6G^7jSPLz@x?HXCRKVP2-|A*d7&f5zj}O zFAkyOyC;CIntvpvCi(N(rN9gCK_?Sb)=Q&?r~g-ygOLK4B%&b_^y0p)cq1LBnh z=Hjm1{fzFm{Uz^*M8#y^QoIy05R*=^K0~q;3HBT_MZ0>iiZ<>U&+t2E;w4P;UK%bB zu%HXu*MR0|a&)($R>R+!SVh4n?vj z;P&YGY*r3ys)~+kR2DXo$Gu<}WWsXUO9ZMLcRL)M*93AC-`$7Lmw;s3>)${5WX7AT zcJ+LJV1SW0!GMVo+SWFAjHa68j0fLoFCH1NGtwJ@ezcVORFS<>tDRmA2-3Y$nhtfV zxpp56##Ju#=*OZWN_O4)c;VT}t_S6rsE28!!w`cD@if_|G8y)4yd`MIYVnk-V_7Q8 z0my;mOwD`;mB6zKxPwLZ?XhLLo6eDVD}TexfzM1sdw$qJEPzDDWsriuJEzs*h@3GnF7Vz=jOs2RVkP+`u%r;WB%@~FYXl6qRWyQX zK5pmKSQspKwkJLG{;0U0c;%)*mJP}FyhtWLHml=`Lc!)6c`vXiM&wt=coy?Ugrbqw zp>Nu;`+h>v+PZ$D$!lP^tUoif`V`i5Bj~hK{Ng>s8+NQBg#_HYR`wBYj=;&_dG)J> z6p=3SAQ+ROm$iwbcJYH<*$SF~DwFn?`?IgLEv}1f@2?0;Z2}oHVYxp&4Ts1W+{W>N zzG$|Xki-&_`Emx;@t1MRds=BwmEt5OY*wr_Q=j>t*3b_~?ytOYZe~)p8N~EPn?^$W zVvdfsC_pwET7a06KHa%=ryH#&=jlhetu58vmtx%uHIcP?B8{sxg>2(}63Q9Rs=Qsa zXH1o7!dKTuwcAE64=W<+YFpInLVivZZBF}iF(gDB|OPKxH6U`A`cdIE#K z2`(JWYsTz|9o7;);52X4s)|(i`_M$Hp#8WFHZik#JzZ?}j`9-R3y99i&i7lxm2l&< zc|EHcOIjl0k)69dW6`g&1msY^u8HUWK$(ZWYu_KmeGutlJ3~CMIM4QRA>(m|94|rT zz@!fQdYH?M%S&*(chBineTXWi1h83;{Lb0w#lA0y>Xyi?IQzDjmmSAa&5!Rq?s=%` zP-zJD2yLcXoFGj2Z#els7hQrHzhbtTxY!I^`rdCLb5K9iPFu1M?6u}B#C!NPF$Fs{l3AhT8-B$K2UahJ};AWFvhIsqLC%c}23eK}xzuiQJBf(2;pmY@Z zD;-E590b%t6{|gv3sJ`RGR*yAXB%;B7=_kyw7xr%r=Sg{*qen)Yd|KJrjv61OjHi&>yzRJLQ}=feR@JJwN;kat2maO8!g zBu)AT%YA7{48`n>S)DS|$s<}t#FCN(4_d#XJBlHu8o@e`Z!A(Cd#&wEAm+rIW6nUn z#i-#{p|+{>tC{N-T}##ryDGtJf6t#5;2R-;zlR3`F7~G+HTo)t zjeA=888b7`9uH8^9$d+y?9lwdUqZmBnlYsYY2PMd98WC+zVo6!dj~OM`sktJ!>=+W zTlyp#V?6i2XvukXs@b3vl01(5j0=)ZfhFx5q!7$9um3scSDP~>fW+Bz7jxjI^#x1y z7(o;3a+xhP*Wve`J`T29sCyL5q+d_FFd@9veotv0>>6avCBCLJ5+e_dIq)kB?tA7G zm>8PLmD+|{Bu!c>!w-GVuJbfBf9~|I7l$JDwHq=Qykw@X$gMaVWY$Mh^0MMTiEY`y z@q28W69-auUzzxz* z%b@b>Qi6Dei}XOH9cbRG@1$>bx27>HHhuuQE_ECC;<0ZeVv5MQf(|GEb&C@X$HflE zKC;;A9kUt^4Z9s+GnIvFwwum0&R=5^NzEj}(ETjYz5h-LEGPb4+N*IolPm1h%x7#7A3ts)tWKBrFVB;}U3-3s(aHJU4^m_05cbzQvU_9vQM@iAQ==LxjOPWL(awkGt!w#=twr8{xVX2kXx z8mljtaCM_Fa&W^7EF!u`cWLg3`FirEp~@iP^0{27~A-{b3!|&m2Q` z+7z8m)1}i(lrb&2hj5c-!^%iw6Tibz7dWlBzwcI^Iv3V=Q)eW)S?yi@vAFsR$Uc4j zMIaobpT^{uxW>uY^rY*BlF|UTS2--JDp_J2YSQcqlV@6*xyjX-Z2z=86~m-(hM0T%SGr4-d9~b5)34(#t}tcy`~OI+eI7x8SaFul1qNg z(8@9W+72~{Lb_k3ZjZV!IMjE809xg6+nnG<2HtyH1B1PXD$b`eY$?yDvZ~g|<{-5b z^s93HeO2Tk?B<8PG0VUClKR?fx?l> zu$XJz?c+G@+W0wOJAc8UA2KIanXxsnj`HONVWEdw%u+x1rsVh##x{DPy)rG6UGMKH zY8U+2vzac^Hh&aNd3^v75)oltoqZ8fBbgm9k#!sUKyMLz6cSK9(gEgHsQwbL6??IE zX@T_;nbrW5y)t|6y>2EpUa$Q{>9+E(@aLDZMXGR6U(+j<5U#~;pkN{c3RDhVJn?eZhrp&c{MS?wqbd~0q- z9ob`zT4&@m!~jNkIFef_srJ=4 zRKERww@m^NlEsWb1s%2F!1s#iW+jpUb(KUvM*QM<>KlC}rJTWJbcCU0hL^>b1#3$2 z-A;XMCv9%myj_V|z)0*gU_kSn^>otqkLi91d|5B;*dE+}JOb?w-KZxSUdt&{eQ1zY zyS@xL-gFEQhLrtat!<)Y#6(!10V1b(?d~fHZ@?k;v>dh5K_y{N!`6>76P3a<(=hWL z&}8+B@JU%jr&C4&oH+9mZs?-CHJ7j#y{|q)Om72WNhqf1^z%vdyj*4> ziU+E+>QX`KXoT4cXwN;`0SnRm{n}9d1ferv>wH1mIs#Zkc7x@X15;OG&W;rn8jX_{ z30&X&n!}yxqLLXpNM&#oPe|6$;FprSmtUYmU+*M(8@_0N-702a#_CJu9P^UMXH>{d zA?8M4m6;Tz@~`Z(v5kJ?q;{L?8ycMicq0Q!xiL@a4UCr(KqgKjvX(rzbb9FW!HMIo z(OU4?I75q-EWxEV+O2%8dkf;Y`PoXDVxp);CUJRtFR>YofL)REWV0Cg1QGE!mW_EI zofSrdanh(}fgO_~>QnsM#rVD18-&L02D2)mxl4?4lI%OJ3SH|}+yMT3*R&C)-}Zk@ ztV19*7S15w?f%~4l+ebp=uORd^0w$aptfXWxh7#0JW#K z@JT&LXE~JA*%$D*EcZTqTa^0gCBZ^1Q0poEizkrcs8!Y4E6Rmq*8Slfv9y`x7C*3d zopt;5c9WXV&H1{y7=;qgCV-LZQc3wj{cO8A0NtQUPVEaw>GkDkyk3qn@jrPwQ~Kh& z^DT&VmP7&BLZb^PF7R{1lWwNWmn7K(cuaSY z`2aQV7)~DGuuYf`xTX+$Sr-F)Ou<~cEwr&}uCr_w3Ut^qH%itGP#6frt9p?AO#_oS zdQ8~3BMA~yXG-ZpyvXD|e-4?DLtmp0{DbrvEHXO^Gq8V7qSque+ZbC^rUup)QRx@{ zk5du^hj`@@zbGOSh~vLo_F>b}kpDD-Frhn+iQjkwAw-y;bhhs;X438u`R=no_^j`K zU+Q)x?wRmf{udi^Qxk|!rHy2p*C|5$(#!i~DXD1mWw-mhfG|)&G)30?H7SEVSg%j@ z4Lb}A4XI+&bht!1Kah&hKijA^oK!ZUef;t%`QnVLm*uHwagiVUE+E!A=lp@*m-UjC zLX387zi-s8G(isOmYVh~zkizW=|91R-f?cB`|YzP0S`BKHbaCLe^T8NVw__y?|r=L zGka$24fIaRW}evru3cvkr$}@3Qp7iPp52-IZczB4&UN(zrBs&b2Tk@LK*NTC&HHF7 zrR)M^R$k94w=%QsfybA%EjxZ}n>@gR|uG8KzZK>;$rLaEuF~ z7cidVolBNvAfh_z4&nAb_pu(^5Zs)w^l_~kV^z${5Z%iRm^4$8JtRW*CHOjUG$48q2|r_$ef{YB_x4!R>u zZ0rK0L#ofje>i|E&%uPH(9fIPhQDvZR%tCD zR#OcL!h+Krb}U$*IZ|sqPR?!lq&;#7{NA!t@#Al|euSU3wIvU$2tz*G(};M!8sD`8 zzdqVn?ONk~o427rru8O5naGqIi1c`4Ha?>!s5M;{#!Vofez7y-65Q(L9w^I{UzyBr&wDfc`r$-Jh@i1txt5QTfhsB zF5718m^Ld)r(P}zgfBdcS;!GpY~YLqLMIQ%@7ATL8QGj|13G6n?~)+_(Xbye^?E?B z2GU0b?SzFB%fQ>8bPKll{%adpm?PDWvkwXLUg-JwZ-cP!_||+wz_2oZ0z|hD4`eWX z#}F@GQ|t6uOg2I^nz$_%Z{4upY#Y?@yeSzYb9ghGk?cwhyz|GW%1|Kc}WVWrB#4s#~? z5UHWA0E$^!akby2-d5U6`P%n>*Q?VQ>f_&~Kg_`;W8GB9mq|=)MxF=c&93?mIQQux zK5nn?t?Z&QQfy0Vmx0^Y+Y3Jw8)!HRYdl^~x}dmmOf2W>Xn!ei5!iCLXX4RlL*;+L z{tBf~DQ(6fGBvTO$3JR;XL^F1@fq+ECY`^_cAKH>{k*pkal8*^n8*M~i#b=nJg5wo zJoJ?t&@+t8*jGy9>&vm3Sn~z#!?-v zJ88Mk(V3lL(m@KyivPe;)1!)FF%q(kA`B)YabW3v1OB&*=Bq5*_q!mGu{mbo=w@_( z^AP4YqwL|+ZC4tjmrk)Z79Aj3jA+60x*?SrH?EQ>iGhhtDDk^L=rVC|gMJ_yCn(a1~w}~Rpi!m<2Qz-9|`4d z>to{$Ui%G$_X*bIE!C}L8W!&P8sRl*;En;K(;iTY>zPYoDpt=6rJ#n_*p)vDa9QAv z0Mh6p}Cj$Y{TNHVq!05`e9 zgliMQ|4$1e>acIsRi$itfAuvIXDb_>H9U};dj)`ht&vL~uS%L1NVradB(p=hu*XsV zrS~nH=LZ08IoggDJK0DlTmU)M=S56Ees+N=$J+%rQ7cpgTz8p1P~zhH3gzj0IU1*} znk{Xk$bO^t!@B!>O=|#1mV()UY}`qJIydjy75c!Q$qw9Lh$=~OyIjs&5z7LRps`$K zsv;doD#go2&s|fW{XYIi9|3q32*{dwcf8AAsoMuGw6d0c-U%N4`gDDDrQJ9T-*2YI zX(8r}PaIi@r@cDc+>C6XOWkY@!{0Os2-B&H$LDN#w2aE!3fj-P?;}oad-lPea~g>N z`c(mi1W8e5YQ`-@(Q6uS<{8Ef9w<0-Gu$kqw$V2VM$fi@9qZJTH|DQ(9KVI*tY`Q;5d%y4<6_+xpE(0+mH6fPawpb@22-u{L-lwx@Y;Ct%8{v)xE0bJb?nv3q%z z;QT216qha#1M9bcEJk6_tDThPtuX!mdAdM}A6PwO?4^*&s@y3R4ytEm%lpCCgERu) zU4=0}Vlf`#&#n)bVBxX&7L2o-x{sAuVVuyH#@`7^>s9hgI|l~%MngW9fJ4pPRmWfE z6=9^o*;&!(fs%<0hGEjyER3)&$^7?ZWF-rE6;?y708x-TzOt%V;^}&7&zdj$f^C(V zv+>}A3pc3MP>N$>`yk>63O|90(SZ4da&S?DH2m|zJ^dQ_@8JZ>mkd$s&sD=RH=8dn z-bW!8E3X=Ku%22RU!(PXGzNYg?~anBLCCdR9FJGj5_q$8LOvX-&N6Rn@I3Wi8T=%B z^;1}4^xdi3sWjV`AWq%*Z52--LgzpkiMP7P{Y&GMlq6md%&Tkk+0~Lx2Og-M(y-vG z;auhZBAD2i*c32PBOkhe3+ofo!TU;h2qA0r0w+Z|`O!CETr~`47o(J1IdoV0VL6KF zXWY+QFov^WR^Ln}#KI)d30jp9T(ZR6(k6h5hP+UcMYntS$1yijCYlTn5!rDnWheUx zpR;lEN!Px;wIXPpR2LhToEI;Kh z4I{DirI7YRb_>%~a#c{oy`hOW#ky0**Oo4P!MN6S>8xP?CK#$=CWJ$lzB!Ijxhz*X z?Fpm=0ToJG6XA8=_?hpKizSZ6&K2!sy? zrKRH{&qpx&!tg1`Q)q0ZUf-lvy0Xag>Y>h%2m#$_V8;7vNb9k4X-iMg2)@! zm8{mYEI()+wE4pOcrm}(R^kUpfiB;b$bItR&dzPOCF_qrYh5h2pA9~z$C+myQ3z?~ zd8ACQdV2Nbnd}u;BjlGEO}vnyU)j%clu{lW*SuY`dIasEd+b+^0%JE8=>Au1;ioJP zcpcjZ_D*>2~8#7ZImD_ir8)#YQWx7lD{3-_~a?|>+9 zYY+rtP+bK+(HPa#icpKjr{@4Acu(zT+0uSrXHiqRLP^Gui!@u=m;a;-dQ`-j*pSZ( z2n*eGii2R+`5SI)tEuHYWK#K3Pea^yze-3D^AdG)@*qX(ba)TPHs7dOsm!ytZN6)$j(bn1}_2*0IwU1S^OGx#JNTm%WTY zfhbw-uT!OX!`UL1m9d-~RD8Yl6bf1(Nme&=qJYUC5g8*cx;?UA*ENs0Fn*!@MJ_hR z&a~536sa*7D^~8x;dUK2Rm{)(Js~sB)ml=~eV6M8+H{pzg{~LwEO>5gCG=`|b6Rw@ z!84|0n+T$NPdpJ(P759Fg;m1`G6Rpe4?V7gl>rHD#`>iGn+jDM{vW_5dQFw+jxMJS zpHux<{$mrP+xb3Fv`6z|y5oE{e1|X}tzF{hMpT>eB&*McZfO`wW$^@ZX|QC0W z*?zW}IS0^fupomK%&ZO8Zq2txrLPx$zUqvVgYI1Tk2C^5T=lWeD!S@#U7Bz;Zxd|& zSfh`}BtY?B-1Q{YLB)#{;+&A7?%Bo4YE2q?J`aMqoW2@UvnXtgcH4XqFf)Fq-++<7 zv&0Ez`gW3>0!*5Gy{1maGggsP%fw|#oA$U6)w5aNQq>?gyc1^@ye3$!^7i$l&yNa0 z5pK^Z)TD-(eDeph1Gz@7L$bm%v6O8k0p5;l+>d9xE&XLcn0XT`W3pIFaC)zVer=>+Vcc~U){ZY76A>472QTmx zY-Rdb_OEv`m};NGT}=wqY?40PE_u?o0xL2|vnzirXPsxdvyK$!7ogeSa|+S(LtN`P ztNvmZOv57gk|sa)obCGxm_YQGF3#bHDaj@4WQ7QQQtX5X;!q!6oRxag^lc+W(C&?j z^ea$Bh%6B9tJ0(b5T+V%%Z`FA$!pXv@j}6|_Koxn=}l@g@!JI;J7!h8z!&20Ys7oBa)&CF8;pm3=#pU-9kGJOB8 zT7Ky@g25XJJ58avIeVH{%)p!yNgm@(!Nqs^v!^tUb7;J2(f{!dG?HKS8{KZ?)rQ(c zkooH@)P-*8PZ#B3ru|dKlwK~`j25N@hZ!n##!wp-dbv$hST7jrUMW3~2kt_pxc_ET3mo4e??quWJZMpB-)5gW1+0oyK7v zzvUGaas213+TDfDGa>R|7?KXT0Q3)T0y-e$ltT2%8QF~>&%S;}dgWk#C;g`s@dq(w z`oc(aG(uK+X=Nn|$=5+~VrVc$fgJgX}?`93QySqqSy#pf2Y zFz)(z(W9T=@jcxOicnN~U|@~_@$N|&@u^o+j^!)7>CsBbhP@un=ASKAMU@73;$A2k`LiLB)GjP^X#xC01<0 z0VD!LLnbQUVNvi{JwA7Nnb&dGWaZgYUJQjIXH>6xa-8ouIZi`usDyRtB^WqH@nyb0 z1rzsGAitK|5c?QbE8=(n4B%>DH1OGDs?879?gI(-E7;8`e4IWPhJVV%OWBsgeR1ma zRX~0zFjioRw#|F1bu#cb1e83l#?L>F4e{=yVcW=S0CM#~k^Q(9&bm6Y38KEGV8}O| zvHkoGC-#rS+h|SdsEK$Ngk*N z{la3?18u>s&|J>zKnE+Abm=Di>0o(U)+_+FN=i__11@?W`=tB4!Fw!Y0hbR^ky zK22BAcP@iQ)4vs)vIueZJVKw+ZxK{o|I=&ezF-BMlVxe)E)B3ZBMvQO!X^Ic%!S%T zHIoNuH3YDCv$63yw<0k5uE>RWZGj%8J19bOPhXJ-``Va35ql^V^7_Md$Dv<69nn;s zHtu<=Uz?Qu0Ob`PjY3LY@iX;f^K4iO7J0`R0~lz4G2ezZDO{m-*Fru4}-EUooG`6u&$# z_XAYLX}zZRE|1ka#mX8ta!zAA@O1?-224EVKp^>K;)k*u`W&W-dX~wrf}d9lL(5LP zSoek->P&||1~XlPQHpV3me_$Gu6)J^HL4%$2huJer27qO`4!N>tN-++55)Nf)g>Ul z=cdv_KJO+E0#F@OMtsoOm(r*H!n*X+MvqgYV?1=$6f8IoBVUKTiZu zWZIfY6xsE}NtE0u^Sb$$0Z9`%Akn4Q-(DBAC?43gBA}R`CHQR5H2nLmN1utVDv1I+ z=(|HS>i9j!ddC*DIa!Uc8MQWYRI8Q_*)mjki5s6N2;4F|02rJo0P{vPdJnO(__7S~ z*YMj9GIwrtQWMz0Wa zE*2gU8lNdiKIO*KdC6i#&HJAEtIuRPJb+s86QshZ1Z%r&;S!Au_)nAu_n+Y? ztd184huw?H)r7JALY&wYBLYhXLPqy}QT3AEqt*m}y4Dy$2l&Tl%>l#B{`!9|5OlU! z<_byNY=OHt<_f4vYA?gjIGl&uv$H^t3TFt3YvtFJR9zo~Lw0|A6_()(p(lp3RFv3B3PXFKxv}Y%RInRVCC|S4qIH@$07$06m@d*h ztsnDklMYQ@Sw=hwV9KxpT;p*w;;S0fe^t+?$`apH@fP<<9?v0C zqt`!-9I0017@>ONSPp(-p>7lLU z>GAxM@FL#GCivfX?swsz1JLAe40MZK$AFaWZk zbY+*6a2E|U(hv;`UL!%zFinT}=*PcVeyU34u_iv{-BR83psokzM6^L{%B6|Kl?D`P zx?ITZ8qgkf0Mp39_zc>PuuJVxckQ?PY^T@1)PWS;2g9f{)6-sh>>tx?v^Fy!wsr)R zK_L(A7dF!2H@>Q*3+Vx8q2ak;m`8sqZx6`q+(FPiCNOOVadf-hWUl`8Tn2R6l<>v| zrhs@ou&I@i2>-}d8bf^egA$FcLqjP3VZ%I_+;nmjh)Bglz0olZla?*22dc-DG{cR& zS}_;4Bem-odnn%=7FrZ`Cs#&K0z7K!~AxM<3^S{`8>!>Q*ZEajZ5K$DR6htwQ27^$LR4h8A zOQc(1(IFrrpn!phba(e6Et&;LENKO00gG-}#5bRJpR>R3>?7~l``w=L8{;?h4+jHQ zJkN8_d&V`ddEIHnSsM=&WV@rXLshoT8N4;Lsu>B;jdNZWZ3ab1wk*ErIPd6hnl!`D!u>-W~MOm{q3BJ z`8z@&DaC0U0eOrO3~Try)24%ZOEA&*h2eZu>NjAF^g-jr6goDt%6yg!X9&^7k3Z1Q z7X(?%EaY0va}beHU)${~GJagxq%HO|&Xz974b44g$L-Lw%+$H>g=eO@gs6LG0ofe| zFNv!TfmVj2DXUj#FAXsisLh0%;1Rip3Z@b$jBZA>@Wv1cyDk?NU<8|B9z-@q>PAkL z-aDd>YIvG;<$FJGJ{M74$~>j;z+EN7!PY%UtH>;L07e#dR+ZwX8dugn%~jxCI7#Pb zNY^|n+C6tj_aLH45HU+s&r8WL$qj%;FvcrJ!3mQZUxe`&iesInS}-&H_P!~VT+`x}R94745;d3HVB zm1+~-yfJMoZ;B|Rw za9e9`7@d28L4|^wFDrvYLm+6yTrRO-v5J3pq&W0nq7L53;YM?JwKx*!F0hTttvfqI^ME`d8G zTMxzb0r5VX@Iz#BAdAHJ_(QM>_hL^BM4>D<7OxnRJ{lx zc?4Cc%ll*tx}m}RP5LB?l39^@@iDuF_PuQ!n*`wgO3}$z?~c>AfqIF&EATa=)@719 zNs6roCiIn34wSoX?M~bs-`Faz-pF{Owkyob zPXX~SPuT|tRocg;GRfJMvF)YvCR^=K6J!Y0Twm4%L|2{&@u+{v**HpDeBDfwu5d*v zK1;pKHVLHQYWCx$l*85<_C>a%hv~2q zq}cbSy*h^Gd69-Qu0Gr0K^gYIz3T9*YKA@SAtHa0G^dMAR~G8AL;rL=MiB*FD3jBA$|!B32=s-}3DyG|}hMBdSo9 zcS6|}@QIi9urr=Is~{9U>Yq*QMpc$vrl}Cop%;Nn&l$BwGonPOE}i+xV)1c_Vfg-1 zXX}Gem&sZURqm7$=<&+Jek*i{gFNCvUmvs+CP87%A4(#-@y|(dy2MKzf4JAq8|B4^ zUyYpC6|o~iiLH%EBjts89YZri&S6m>TqCp+_k+!_*+I^!xgPB=k$+3#SvAbZXET)cY=9BJWVl48mma3sGFg}m7!CK zHe|ni8eq+fh9M@N2DVk^Z%oe-LQ*ZUc$cCAXl-0?p=nz4b($R2zd72mc4uY(2fsrl zG#o15z0=jQ7Jy5ickGhU7r;BMsm@8~#&~XB$?D5#lk~ODm@>hAFzL`o)hy|7bsWIl`PIt26DKixvkAhuG>N};X5G| zcohZ0EBzf4-C5@FFbbhNI%9Glvd3=7WEpeTl2RYW-0zWla=V=`+|;x5we63_G8GD~ zNwr>}=^sOUqVR2r$~5h)?=UlvlGI-P^2Gs z37<`o*oJ=H5BD3q)`R`7Rq4k1-`=UBt4_d_$@}7f`y}cBLOJu>e!n%dpB!W`*>J}6 z+&YjzR%xTU?(!Qozd7Bd9n)ndj@^Yn&2iiIJ*3l9G20s{IZ2u&R+cbGQw;f&Iy|Z@ z#iTpQtzNV8%)Z2)(7dYZO|l&&(F&Ht8k>aATrmGBbn*wL5<9zx%!|Fky^(fDZ}9Q3 zNnfJW;A62OQ~QUZoC+5)-9K76eR(7qoRYk5g929x(p{gfW!<=aipZZdP&q@T1@gNS z1zAHL1g_Dm)t7iZ5H<~3bz%k|(J<(OlKr%o`LOYl`%@5vpyng<9y0(;+i6?t3&A5?B6{m}qkO8M zoG)9IkC+Qr+;DEm+u7Z&F;;e&aME*>mW9~dx>WNU>p6 zk@JvBbcL+J%F6!;a{#ACrOVRk4t<#~x%aMRs!QKwGTe<_k;*qxC}?UdZDnbdeI9gMkZ}d7m?d=Cla2SLy@K4&wYBohhvSF= z4S2loR2nxo>46TMbF1IAoRQ7W0vBH`wpdYxeTP_(kkUhCBTQ?z>F6`*pL=fS4Yz&TJ}YIil3i)Yu^&5<)T^%b@CgE|7dLMIDI4GNi_nD z{z6~-Q<(Nv2fuy4TcJ%JeFywORK#b!7*K5x!pa#-V&!Q_;!WDrA(VbDbHNqsxXnr8 zykwwNR4e@{sj3T~Y_{RpZmX5|)mxP;>8{{DDu4)o;B zy1x4;Q-2;;^w{2XRMIo%C(WS4xliZwc+00)1`bGXv-TVw>nd}0Q2D4qE`fBzjtF0? zuVaiiHx2uGU6>*K@&kQY{`Mub&MC)-W&f!K5GBR0cZr3e_RfMVlP*TpB?ldy1Ign( zm;*2BsAB!dbTx%SF8{^!VF@pJGl>-&P{_)w1V^oNS^1(@C!l*_l8?pK_9ZS+os0Z# zG@~ZnCjeYkMlcBQAT&sRLiKo|&fLejjIKTm<`H!QPVC%s4Y}eKuUr1A4Ap*dCv$+J zi!N0yJ78)cQ3^x&RzkdF&RcM!NMl`_Gk0mk_eG)AK<`_DV9_x(VjYCuZ$qGFrO9Dc zW71Bt=*h=?V}d~rS*)3RNk`&sDx4vN zrfE3T*xHT7od{{i10m!(J1ITzw1!;-J=eG6IIZZW)<_HIL|0*Q8v6n4emZVxvn6s3 zdo~(&wPQ6(i1 zuROI|URLp4_IKRz@pWl|FPhRbJJ-)-(PC*|QT``(i~~#|>R#MgL4|T*6_$P5>pZ%R z;+DgH$9^<9JX$YudxG2gHTA9$ zW)H(Efzi3Ip)jU9OVgM>ceKPLw+)oFB5C^&h<-xB)YuC)W)D(3EPOnggTM$t!!ytz z>O$uIkD9XyUKw=OsEadipX-I*K6XN=3*NZm;KDc~DIX*Ww87U%A8&ned_*J4Wt)TP zewa<3!L?%ERcun~$)1e^@~MUuxfuBv)hrE~^{$mxfM77r$Cg+$lS-!-EY|W*0Nkpz zJ6p=O+EEd>>Y~Rp07Kl+yM)aZM%8=!J|+%rP{P?cEg3L_hm}X8hO|Kk8~HAtIos_#lgTSrzU_V z9o=rwY)!`TeM`EVp|T7VR$sY*^>X~e=XzgVmvLshvc1qBywA1BS=)+6)J5y>*1EUk z9SvhKzO?XThV%-t{-+)B5;6mE&@ka-Dj?q}Ab9ER>&#Sa739a?E1;{S%?$c9CX)ba zpN5<6`psO;4fL07fdi?@eoBV?#T{iuQ&EjRu=|`!Pz-7}q6v3^h}8#_f5iYc36{hJ1pu!yVxcBYYmaHteFmSwTJXp?YNDi- zw$3yqOOTeW5*UZ614o@3n{lgj^0Tcl&|?Z&s_T1S)l9W}fO3{19m0D9PBhe$OEADt zn~~p90bCEK6zn%1Ub1}1EMK!O$~7T0u|5@1+%>KguQ5Sv8MW{J?lh&kB(qvPlE|%T ziGcf2TKx(@QMhgOhR0}P#rxy@ZD~p@5xk~~?@s{kSBprFF%lb6anGvyf)j4nB0gwu z`Mf3m;bR~xvuGs+u~m9156QLT9+TcKr>>0QP{|xq5 ziG*Wflnxa!=Dz+^EH3YIy3+4TrQsm~pwt&b3g%bEQ_{S zue)iAhIO{w)!#qly*cD=pSP^Ua&pV+tFde9pI`+dML%6373nwsi$4tFyS;=*dcp~Vx8xpWL>m*t``x_#w$d#B6)LIG2eYl z=}WlzT={^RDpmxV7j_zwz<#Z4-Ida4nu@6E>X)^4h~1)s&8Wz2sTK*4i*siV&&c~kKVp@vmSMEWpQr3 zwPJGL_YZ4kkl|pMvR63bNDDSC*a`<@H#&vC03#}7c4(?(XqvF4j{64X;AP8njNNmj zMtS~HzsduknfsVP{%fsQ5x$9~%5zglvIZ}-EU7KJ9ycTTw5itbbv##h{@V=c?lTEO zJG-MqZE#10FF!Nwlx>KKkiFtjYFMATba7UDsu-&WGY1M^3E}<+*Y|PzYgJbX56C?c z|32SP)h!T%49Q>u@tQx7c!OI9`7dx10GrPdY03&ODH8xme{Wp3S?M@4_yFhEs$BB{ z-oPNjHa$bW7ZIKw>AuaYe18ujQGZ;q@+(D1v)yDRQ2WBx5bCTM}LZ!MR=tla|m-KAd&dBW+XAsQIWX zY}_NNTSN2+sye&-aXalI)mTa!8p&=oE-9oy&C|!QaSUbj%0JDGThf&K$8Eq|?5*wt z=>=iRoO4H1EL|&V9wM!y?9yR!+|sDsCn#}yLH=l_a37EXiy%?oLUF zVw=%A)iA+zgzp$?3tC~5urpVP+!2w94pHpq6uFZ)!VAcp!lz}l0*N%W?f`b11&N_T z270j7{GP9@!n`We7>+}z&5<>2yJ?c(KJfHP(ZG`Du+$1 zK{|NPd=$&HZ_nlosd#=o&(1b#AW^fq9ozb`*Yr;i>f_{vR;4j0vq2U=;V0D-$*A65 z@vH&j6T>C1J?;3!a)B><8`a0FB5xWCPt{@9T^O zyq;4+G|C@!9THj9Q=$x#>dC$51F^XIHK+QE+;xyOo1F37F5iUcy$_!*0d6V_lIjci z(P6dm{<4B1vo3a{nA64;0LbyXt=e>mIB=04@5H_J_LcYnt##1m<(EVJ zNs^t9c|(}{G+Max8lWoq%r-T<_Zs{_8^9E(5J?+t3jT!?K`}-bT0KFkhAGbI*B!%QTF$rHVP;U+DymIdn`i9GBdKlb?6i@Zq>ts8 z2bD2c7Z4Mi&PUf^8$Jn3Q_2IQUQzzJxic)ajCh8Bz;cm91gcUiQVPOQrK>9)v2W9w zbMTmlF#s3bGLV2T1$1)UO@HMW#C7pZOYE7r+CpWP}y{SiLaEk!VQv22bfJ&&;0QL&u(zrM?gD} z^c5TB7&!D|=-Z*|^)*t6UN6dgBR_V#$mAry>F1AW$4IOr;WUio*cWP&{B+v(&vB#1 zyTXQc-$r1jF5lA~h>=)-HCfdY_I1kCC`!LIpe1t@Fqy6_O|0`0JyXiNI_rwL8_)Hk z0Q71}mW@It`7Ti?+H(OXEEibSWh!#6!eO(ag*m*$6zon-t+7VFDAR{yki;6($0nT* zMJ5qJ2gjIP4vrQnsaUDN>~k5yyUhc=YJH+C$k$=&i076W({}`Y?k~oW_%S-Xp;M%u zs)(Jf0E$?*y<2%C&ovH}8g0rZATo?)X!ZuvyXELHQOG}YknP01Sk7DCJmq&>A31O$ zKM3-H7Ux{|=e`Rj-a$$vw58OH*61yHN^yg+5D__Q+O}#>Puxm?$bJ|Vn8-J?_LO~d z{*;lndpE1w9Ri=i8j^Mbo$QXXWu2RLO#SV9N7_X`EQ%V0$T`p@Xmfs6TQD6LUx@?2G8p|x~vX~3HlUeob0Zpje z&nXT?$j3>L@Pu5nP<*e8Iy3}@56=dlO6&-X)>Se!0N&q2sMv;+Ujhm<@CspVCS0j{ zEZBHFsql8&B~lVLKRWFX7?__!VtM)KTl4qEpEk8ucI5@c5S$075GYNj)NpmF&|BT$ zHBXZKwByUUgY#>5T0hwHd{v|-N<81|v{LL6lv2Y~9oOO!<6pKYK!&o{9af}v$XY2Q z3OYhFU_Q3?B-*{~m0GI=b

9Uow~CIODeMwN}r`x|5!q`5Gie(s@{(6e|6MD{X^p|5D_#P_krqHN@NSwU; z3H@SHZpx3QFuHy20o`FBh1kfjCvCTwKPiQ2E~o{FsGrRQ)uOk_UeACKle&@vGz6W? zA>Daa1A+vg8Ms@TXQCd-&3sM@1m3-yM&?k_0OUA}q)F_B$$-BO3N!+5x*2lCabMZe z1HAzjqE}_z2%=Y;vf&c9K4a9s8RDL%mCxVuB2~?FBn5_3)HIu}}Aaf)H zocX&wp(qamwt6cxwtKrpWc1Db$#2!(DBdUSNhUUN?{_WA)x}A;*&kG5iriY`B0t@+ zYt4ySl{l5=aja*%_gnoXRi81y?>%O8p-RgF8dY)(y;hp|M_~Qf3#OO@R}CWISG9~r z!ua}e5zrWpY?X9r9itaJr3rk^S&?8~CQY1JbqpJHh8)Rb70Hw!HnhpMxqd}pf4a`F zF{itw@yj{p3SY%nU9*wVM80M?@*rf9&zmS$A|^O*JM$Sj!o?PF&W8{%c2*WJDgaH=MZGvt`wFQt>k9q#tOYG>GJN)|4Sb7?YaEq`VWl7k@xHUxZ0r9QeLZ@!|6=!~6p$ zu6`l#yJAtSi$BIwVwShcxL>A5H$BXq<>zf<3qNw=Ks;rt3A2LU%|gS^7i7dOc&b+8 z>F9@3ge_mbFH>X#UVJ`k^2-G)sxn*uzO;v!Nhr42$RzLGDdVKygNzgQW|yNqF{W)R zkW^R*@~rN|b~u>LCTuZl@=|U*X0W>DFqg^c*dZg(17y`vlFMt_aiq`I8O2U&)B&_t zxqBM;uPdZZkCW+oTOr5lJ^|-E5TG=O5|{#6f7EVLBrM}=-b6Lj zPpETPBa49?kxXfTjh|9r$~0%SFeoQCLxaHhX_lEqjhPSN_SVQ%?-HY@JF49{$3<2W z%PVhn*q5CAz7RU?u6-@`d!k;a{ScnuevY=R+It-hDY*mGY`q{B5rrVcd1$nPs2E|63{z-HhY(2fdZ(dO-TFO;L1&4tN#ck@4{i=tZEP&-A) zmTXO0;?b6jp+T>76eM0}9srBQF9?ZtY)Tmu#J9`6bX_OT?97>!BWmB~nRn|YS`XMc zve55oe9diK=UCY+$SZuAB8wS&y+hE_@_Mj2mo9Pa=PPYYXH(y?K!5O1rSz&2vGYNZ z%Z`Ne6+YxVW1u&_nwYtf?G;1MIjYE;n+i+8eYZd-1)+5eg+Z%lzGwjE`<7`eMl$A( ztDJx#C8TGQAuJ!;4JXdL=t9x@g=#|_?FQgD(kh))2< zB8;{yNWRi|U(OlQys-mmbah{Hnu}H0^)&YQZZ15|uU^Dpy@)ho5Cn`>iY*wI)Nqe5>7-(}TgGsnxU1Qc5jMNglrTm}gG7R~_*JUmA((@UQ#)F*(&Q1S(8 z)`|XYWezi#DJMsKT3T9LO+3`?f?cf+Nr)?hCP(B{*R)zpObr=VqIWX<+8QriLUg|_Xn!cBCFzd@W?XicQhJI_I?P92E{YSg^bJ%3dY62wD*31Cxmt#T zb&;TcJYXJyk;>YSCczx6!J?Nb372OJK6n3L%f$26^m>k7__l1-zT)C@27ELPpE*6Tf}$k!{E;w z(^ZxNEoSZUfEQ`jh z9GkJBSBZ^nKs3q{h#%0{JHWOhOjyDWcI#4Yi#8wcNC`$LBc_!%o)lS;u$j%yrKLpE zXEe%o0G-GPI#zwM9mAEg_{3swbnA#E%K~)Vykf}=7SPvm<^%ib01LQISOcA>rR+5h z<{4KTAB!CIxD{h^oq~CXz$59w5Tya*CXsv;I1gl*z=13@ly>w`~*O+XK^8Z(o0n z0@hm(sQW&b7KrCgs1ly7Ay$F(GIe27y0c3Dp{5%Pp;&zj zv|>h(bU)U^OJc%Sb53dIIflDm8iDR@hSYA-IE*xHR5eh(4WXC1`MLP56nEK2 zp_GZu*VCtik)c3@bUB?|@Ocn%ynu-pfY7M1fjjw;HfQ^d)wz9p1B?=huUOgFhnxaB z=c7V5oyXPzq{~~Npk9cmmAU12%pleK-7Jl+G%H9zkxD@pbuHAM6NpfD-SLuvZHa~3 zov|j`eov2Hc(2+=UTVYTG4Z(M*hlvhgQ6bDq`5a|NEZIcfM@ch&dc@59_z^u$AHN4 z*4yEdg8fX+ETwh?izayf{>zF?OklJ&2>oj$46$Wy(c4^8nhsjiBwgoQ64b7pCaQEnIh0vux#a@7^h zpv%K#Ggs8lMfZ&j;z7kIhi!vfq1{f9+(!)puahI5@Z)IN#>oYrf=)#?EiWJ6H6e8xuR z`$Bj(#djAtm$O1F6=Zu7QJ(;)%@u|woC6XPK>+G)cX8JnOd5h&vBfeo66*t~VhtVY zWJH`1n2GBjPIermOnC=h&~>IAfoowlmntQhzGUrN&K&G*f>}Jy3sM>q*{S2O15JL3+dd zHis{U2IAc^C_TIpIitiy9vOMz{$uzf>$jz+P3Xl9u~UUmMGa}E)1@Wg zg-D`DA)|gyq3>OQghPN+D(K!M7(Z-y4O}jlciAwlj*K^45qQ~r=>o2nSW%&s2&9b+ zp{cRFRB>KmWvO`*um*vFuoT-vNlc=`CGC}Rg3&Da0=3-;06N6K{0|;4!aY6xUIn6y*PPfhBW-Z8%%R|2m4N8au-n# z0(q7XX6`t)#3V;zrfO;8SA5cpZn3%#GiFz?G;R}LWa$>$l~q+9%B5XZuM*0w-d!k- zCnfZpvJ5VI=^B(DxK~8)=A>&nUwT*mE%$RD>Du%@U4JLcEv_(!eX&+XZdMB2JDDN9 zEz#`-(PXcy9#SxinF);YW$1Lf++(IcAD*oU1ZdZ>IKPTwKAX>c{Sx0O5iKduxN6^@ z0H*WOE{t6{^{cX~kLps^0y)z>6={-9CY;CAT!4_BD^aFjBk?6z1l~u+y9JjnRJ`cs zLrp02EJ_iYi@r;dDVoGi!(g8!fO)fgHqx~8m#>1%jVnamO0Or}r%q%)?p$DrU5O>7 zGLs(wHO&c%@3PC6`ys)r*!>NhZ83=DoJzBU!(2GDK*Gd&k!YFvYbW3fqJgd`%_1|9 z1Gn`@_)LR(*C}>!y^3F0g>z6V>mL`5ZIU zUaW(CL!Pdf+sT#^sSo?;nF`lvA&hLh_N{_BI(u=XJ^Eq7JYTjp8Zia)R7;F!!<&~B znknwzklo|$1o(d*ks4_6p6d|KG&dcrD0AfGh`q8{C9hY2ulxqSdJLUjky8SsC#O?4 z$uXc6I3gXV=Mfmq!kW7d-N2RiQ-~w4mbsplgX4#@64M-c9Nv-8eE|y1;Ub+(e?cB3 zXf8v$$+eYiV~>|_Ng9DixK+hz8P0gSpWP}~80o!4ZIomz5Ql47m)ILN4}P&1rfRj* z6=C z=gY_QT*DQAyx6)iDQ0cpjj;sDJ1FH|0EQQ%ou$FYp`M?pG$5Y@A^j7;?ImqZQ66Tq zPlk4p^5s4lxzQnZBrIOsH~eUGiuJ`9A;)ZYnBXtg|3r=uY&N01)2Zg2mESD^^oIpf zb2hyspdKpt9Mmd)XB`D5Hprj{tdOgk&|W;qLs26i!#g`!LwtzWSIn2rw;>XmW#3-v zZ)NWaJ9X{x$Hz| z*a+ypVNj`-QUBw8h;&;iXO*)dF(LFtEDVAMq4J3d(mAv~d zz@wD8cfW_b_XcAf%pZq4W@bWiQTHJ8oeue`{OTuz2o-b2M7nLyraG$mD84NX}iKreaKEe0!k@Xv!3qNq@l?1^$XqNo^e zy<8AwBR~wDgckNPh5|b2FIRfvTK7@NE8Dz!A8$YSnMtmixcUCnk*JbuB6$;!lJN`d zyki%5xfn#3j#VNUM1leCD`PAF6JgZQ|9?{rbi;x>L?CmDsutXDqvY65`hv<9q+a7z2Sb&G&glUHkZTF^_)lwocXht=hZOO-3!?- z9Y88}Mv#dLlYgL5r1hgmk_>bzsveSl*oGMfSq|CJx7}&qI8__FQ2zK!52dw-$_I5# z9LLanN|G^M=vFM>c8TXeLbgUxWBK4A?eZ~~JG-Btxs{(hozA*6w+4c9_Kb1(HORmE zkU?r~tY5Y!$}(wdT<5wtE4fp*do>ZqrMsPe&~y7zj`_^zm+ffJla4=dWn4!Udxcy8 z(}@bpUa+?wQM8{LKZ1#02r@tF*K{dQqEfWAYmI%ds=KI!=q_p?hafn0v(j2(g%N#o z1`g3{u3^R6-HXR%t=Z(zwq6*#Y8f#-XJ38BTG;hf!Sv?c#Vzb5N@1VQ&4OtEu^5a( z59>s5G@l2YAvkEq5Os1<`n0`uj@?Vla|D^ed~1*9OYW6(H)1_1f5W9V7FQqFdbnro zSS+R$qVrs|!1Zhx^}L4Lyxtd!Bki?@&SX?}%8^j}7{|>DUHgMrLXX_@%IH=!_0&m5 zj{6fP9?2vI~#ozdIHjm8mG_LA{l(I>Qcb>u>%{>;zc_cxgeY(&w{q)hmHFX~mmaV-EBEQSuvt91d_WeWmC{m9-_7)k-C zsx^%}bjRmKDaPbV1jw~(cjk@?3cCOvxJ>#d9{~$8`9g|24?1`)j}IKv2bUpz#{jB+ z2k@mrJbeYmayT?hQS5Z1ySyfmhcO9CEIY3gnb+{qohKRKvjfU%&IDf_CK2<$2V5El zP^RbuHo*P95^KlT3O0~Od_cHan5IQMwzZ6#X2<#h9*Ge0NQ5w%Bs*VmNDa5=W-DQO zd&R?q?GMc7=kW}&0P06I8*2+w$T9ZFC|>UnvZNCdde2 zA4r^b3bvnvF!mW~CYMjA8y89cz_u?2T_e1$$j-p>==v+GYDqSr8c zuJeMC(D0Kph-AToeRp1Snvz?ZfjS6MuEKPk8SvXnpeE@BeUpgNrrNj25yTn*w&}MR z3I566Fi?M3&0ugWvTJ8so~;IRf55sk;~XC4C*`lekS<_}b~=sM94fKaEk_P)sM1l{ zxd*z7D31h=Xs}@T6aBD|+hF1s)1>-#UPG9Z79&x~(Xm8E4`7Dhpa}9n{z4!fs=*j= zK3YfR@lGiXT=ze538r30!gY2}?dXI4crJV7VgbEFqR7X$^zQf_0Xgt2J~<8* z{Mh+!z%rG+fCtutcsuBw32aVlQ2)WV#{+@I9Q zvXB&T^ib~PVi51dv`J)+%`O5JGN*g;fLN3Om8%C(MHo}ERRKx|K&fx}fOCWJyb)6W z-E?KP&8S$(Klx0uN4q=)$Ua)d0-+W0ra$?Q8JXyJvFgtWRjajD8TA}$s38?+eb-sSB>y8 zYv*GC{+%bZ%j@c4fJx;Xs0aR}Cir)}k=Oepa2QhVGK;3qzvq^BG+gJ?;IE@6SIN-91)YYnA9^az@K$`X^}5$o8L1T&9&?wH6X*^BQso3e(F;#U(i$tCEbBq zQt7>Mk9T>|r>k%25*PVKC)0hAC>vnkgKmCwzg+ZDNQi}us?g#zDW$!x-K|RFh z^SM8{k|PW){r{;2@Fy?Z=}#ftaGdJ6pj`Q>olNJiEy0n~U~s4QCdvH#aIJs1(SO=1 znY7{1uFt4whikvq;LctLgOfOTC*q&Jl7H&^#n7-nlJHlj9{*y4djbac==dxCeZSP= zyzKYE{=ho-QtV_df6YijD8b0~U17C6@>kdW=TCe52<(sSg+hj3Y-Fdv$d1^`_n!T0 zx4YB3emh}g`Tzgug#W^J`R#cBaEZSi?;m^Ox8p_945r_X7x5&&v)?}==s%PF{&u{7 zHDUejcz-+Izh~9I9q%8>$ZyB{+wuP2J@FfAK+5V{zo7=iBz{8;2-5T$@BJg;{QnZ~ z{hjgt&UpVCB>%T}{l<9zu%F)<@9&KF|8@<2Lk)kxZ~to-=cceDR&x}Ikxidp^0 z;?lB^Cg;u$`c7T=@CZ*@IBD)CPXLU-+?0`#$ua(W>-RRLiK(eS=-NkF@vY48mU)&u zIE<9;{}>4MSohd&3twCrVwOV&Yy&{=J_fz?pI;inx_etF|2kus(N9`tiFaZP3N9_f znLy@pASr-H%^>|cQ>6R)`eosvzu8IbSuaHCNL8Dg`VN^g&x;NLV^ZetCopd=TqEl= zWz_u-_j-0;VNuZ%xmS8l4joZwB5^0jKVB%(0@=sQ&wu0EwGZRjcJCG?VrA`CCH{6t zXQgYlgE;T)P1gRu*7^5<$980FVq#)xdvAlK44HDk!7nOglw-{DJ$TG7ZrF)EtjUE| zIX$gW;4f`3!lQ86Pv$i-^)2YXt>1rN(SPMP=Q*Kg_t>}R^xq%Wuld1i5I)z%we-wS zEbd>qzJGlCfk5cs-n&;!_TR1J-}+5#34HG0iRphQMflkTe1Lwwnc5q!|1i^EW))*x zK*}re6Z(m@|2qr#+kF4S9sM@nKX%@4^ZlCxa@nuEQiZzt?|82Ny7j&0DO1za1N!SE zA*>aW1APSF&`b9-5As*Gv)3N-&hGBoe8Z;rC`r;x`*j2j zy(c8;>FK9eklaF#m+T3efg;5s_^({^&i5|X1N0j9@-yv}JHO_ygvImpz%qLFSgH8t zul1s_lY7#evd>WbG~N1ZCZzX>L=fGZkitUMb(c@!E?ctUqu>wodlWgo#&D+zYcTw> zFdL%#)udzV3)<6t%ll-omc!Qkn~NSd+R^V=FZa4JEelu#y7#ocyI($h-SvJUD)LD& zhN``5;24&>;*yoRORF9y$|Y!8+Hvyo;th+Qa_;4meJA{aW&&O>C)XPg$I8NST=(^d zzocDoA11kgtB@Lbp+56OU1>CietM-fIyzddXld`J$IK+1?KkDFP(B4+A@15Pxu*>y zMjuz#ZySe|iQe?;pQ+h;JjLQEdQf26|HCuak?VbPV`FB4nCYO@tn4O_iTm&0q+PHa zo_r%>YSkQ5#Y$N4w$o}iPf!pe>omEFay|EtNyUHNn*X$s9ql42qm1{|aMqm^tVm1! z&{|DcdCa?Kq9!jY?;du4-aXv@2Z^S-?>ubZ3HhM-N*Dvf_(feVK20l1DEnGCrJePF zm4WCLp`yywaKmO3n;$1?H~hBAL-4h(o4K-T7q#2=vCzn@hl+xO}Yi(kpzV zu;aOx{-K&D{7+V{x4<%qx|V(?eCYWaGm}*&s`F)0z8|$z?H_2w&9*iW$Jy*l`-8~e zHio63>e9KEXXbSM$7U+VMpM%>=^K-%gdJ5KRVyl{8l0w|S>OEN??c?%)bKEmE4Wsw zM~?duzV?%2UHtw*8u&*H$N7y@Lm`5(E8_TN+uNO-w}}7aeSCE{?u*S;3lou}hqAV` zD$2G)zo}(o=~;RiMj~3bOurMQlHvtnzf{eC_YCaISyj_2nKdRg?-h^0eVx;ui@NK{ z*lPSCU1T;}H@K{W_CRmHX}HPA^xe7oUY<{LuRr)yCAm}d4bQ|73f}NJdu%k{-F~|r zyPw5V0;V}duL=FEj{8fFMLxdihaVEnNGzzoZOLNLdH>g?VzCrJMP=$?QLS4d|m6F}K5hS->FW z%)P+iuuwZ4N*tY1O=WN~+~NyHw|0F>Ze^uQJk8qFIw(hE$?w{cT__`;;Hr(#%bpXT zE?BSsFc#sCm%J2T{`kC^B)4?Cknl1Bq`gm|966y;L-`VDV*1AMtn%Gcw0rb+jI|w>%gbK2%E)wr#~>>`r&+`0@6v5_{fHVV;Y7 z^G4cg4^MpK07XJ8Xo$dL4Z^1AQI)}yU{2S@7%-uEZ-;MPND-rI(U3KrGb>TPce_A^KIcB2#Y{XD!&%g;wMwVfljw)C!sX|oo zqIO?h2Y3VhJ}>O>uzJ(_Q5jqUmn-&Di3(WgH&sUMskx<;?aR~dt;YJJ-}iU;Td7Kt zz~CRV;!khtpAK*TkCS04BN_;wUY(ZmWEAcZ{orgluwr|4t(TaY$NAf0)5v<$OaMuN z>UBd&SXLVL{!-Vw#~Vn`(v+VI3a78$h?cACoV!He%*sd$j?tNuQqa^sDObIv7sF>P z((KEDLODvJx@v~C&jktd3|5X28_PzmxH=|t@EZ}AoY35MY;SX8vK|O0&Y5U<=| zc(qDsYpYB&SnIsn745RG**U^XH|Ox8n=BWKtt74#C5PQRax~JjmqMxF^{->+$9fX+ zSh%qsJU-M@9Y(4)&@*rte81c6cS=QFsJX2S4`z|A=+=)v?6T<#ccu2z>h#TN-4rABoas@DfapB{TTy4-{9IaG4e zZFBNwTE)zmS@7Qc-SSZbQ-)>LwcQDGay(m-i;D-W6Pb#KUm+iBv9|20W$)MX>Hjozv4OWEkfN=b8qW=Mm9ep6LgU=_b-?Tb>U z?)-REnRjDjqeV_hm~vm6TWliw9XGwEfCxkSQUIx6Dk}&i*z8Dx+5|_+{+&qYu3NR6Iq{Xd%C%;L+ldxn}$jJKBhQ*F=~#Ve<|D z1BdorJGRUHg)9siK|uRUsu!Kf#-={*ZACedn7qjIc$8K zJIyLJz(_V)^>e5803&l3bJ5sdd=))}34fF7tcB}Wi@u$tYR@>gR}OOFuj+68e^CT@ zD5g$USSau*0biw<8z!XEuW7TPMS3Y*I@5WX|b4j-u}5i7Fi{h_@XwO}0psc4IkM>etMa#r97*G=*&9i_W; z0z8{#-h4h6F5Asllp2l;lcc1lratMNKzs8UAL1|KkK9acCEVUT5H?}=V)s=G{AGp3 zsr1H)qpAXVCfbVoqdiB2{LL#lI*##JdhlAg*4?DSmOpNlfkh}Qa}NAf_adfFj^v_h z)tloLmCv%GC}%Xc0-fvjUd(l#c^jX@6XAHR6Z=$H8{0KNBXp?paCxwWe?ki(A*u4p zMd1MjRqc~6Yh?n}U*cj8WWGIb)Hv>VXYC>^TfI9ep>avQF%<85Hb?koF|D=;_C7Zm z`!iMZn|2oL-&XRDuT*&ulpf&ogRgTZHivq6*>6!uZf^Nx z(d}ZKu}G==TAjYdz$25X%MV6J#Mi1&L+kMikWFy0m{_s0#bVj61Fi!ragXJGGW zUBoj!7VY25FH>>vEHsmNQzM;(P~?{}oh zx^4FEaTL!Dh(ph(a9dWB^`r?cF?pAtZ()b#Y|Qw$HL=01(N?u1 zs#^ATgL7U7j$8FJ)M*~1i;Pdt7c`QqysXo|X!A#^RZw23x0&`XddxqIfQ{MszFvL* zQ(Z!K@>b_;?&j6bQWpLhZB6@1NxK|2jZ4ZTP=g2gskmp4Ekd;F8S>Q(=w>eZ+A=vK z^&&@WTO#&_n|IO*_nvfnj?iD`&Kgzpw(}V0Hfm=^r+DaU7Lgb^vnsZC`GkAH7&TJf zd1|cDAyu-Z`2r1;b!lIQa?fmaacwi#V&xiD`Omg`uFhlgA1Dix;4oKss>iy$`L12~ z(&@t4UA5$Nm9SnSMUF+A&iVNbShFqwC57+w+x~(Kt`}eoHVV?!g1esD9;}}lNEh7}Nq2U29#7N8Gn0fHw&J%xxL4+n^r#8WJXSmJ`YD8KpuLNB$Zc6Jp|YT8j9!3i%?7HKB$I(# z)BwbM?MhEcF+Qs?v-0-l+vO!fl�lLe1c@;{u;`}B9TZM}33 ziQr@W=O6a}l%rRYoX~Oi{O%nvuieChW(MyNY374fY*dOI+bbMNxshHh?rKW-K>@O(2CV8GqoC_rmkw;&nI4|adiQVWPn)qeT0i*tP zNt>rOxzc>TN)*!eij{!iAB-oPy@D`XzG5wJcm=G~7QNIi6B<)<7+uk!Pymf2wFG}e zTgpXysEjqm)ehtt4xwT7B;)e!=m}fBJax6g839$AP+U0b68uv$GdVScj*#4pSgF(X z*j~|}>o*@MHaz)k;asHX5z7NTZ>OTT@}_3Z^ETTu=DOc4wL?vrU-Z~+SmK-2haSJ8 z$0`t=+t%Oysv`MeO$b>bs_^3(^D1SV=@qW+q@$Yuhqd<%YchNHMrRPj5fPc8H3HI`^iDu2(xrDo?*u{#5CWVPXXc#$wfF4v&budH zy!>K(o>lJq*VcN3>Wn&#D6#Bc7Mh#b!t`VOO1gi1-p=TVpfcEIE;Q;EF2l zJ(q^;y{&#oe@dF%G^#0?Ka?5jqMO5U?-`{5NoM%V_UqviZ`X0p`4*F=o8?g@)M9?DbkpZeRN4Ewm{+A{7iC-6 z-XY_)^}L_tu^XTEp?Nn3MF)tHzM8EFt*e>ntXR&ms8g~b5SfcF>v3cS0nxcIUetg$n3~16A ze(fXqNz6N@j}p7@D?FFGqiXP)F?B+N_Wq)7ERzPCiP-$XVIX*Qw01#$SRY}{XQl;U zw=Lt^7Y$e-yIru`Sy+RRbfpBEa;`gT12husrxF2V2~P|XY{tGSh$-m@Aex%c9z)H! zIkP0SH4F_|Krn-&((lvHLObZYYOH|W9t;>N>f1f%()Os9my|=b#kH&RY=Ee_C@=Bo zlwSZ^QQOCT07YxCpMH}qWcyYJhYmmWD=;_oQFKv9dbl_;_e+!kZAul$XH!?70?8-b zzM!Vy^RZKaI$IsO{IC}B{?>_OPeoO5q08j%k>}0xEGQOK8~Ryyj}S#&3`GPEz0%#p zj@~cM%;(?^OixL<2J9qovGUu7uKe&}S5s%;A_xku`CMhvk2r5y)rb@`!*?`pG@K5h z_<$`h*cay@f@3e*O~?1wT>yVB8-~*X=BRzQ%U0uw?rF<3i;a|_eu5kOZz(lQ&Cz$n zy#=K+QA1{nyb%WbPgPW=Tbbjiu-<5X*s$AF>3-0nBy((1M{L9)UXt4yPa$-Yk(rgh z?(B1eO&Q_x0Ka962tQ;yG+l#~r5bTkc9PG)Cpb+;YM9RsQqCJWk6|K{RpR&ZLp!~P z9`Vd|(6N7ONJs&Lc2u^U8cFH%gTprykzgv50mI6G}yW_VeTCteCvHmH3K zcXB02+xV&Zf`z)Ix&r_W)3EhB!iLWJ z(hZo0uK=rn7eKbzK|$rxaC5Q&Od1nf?#I_Wpad$i8d;@7HXw-1FrS$|8GQm*c}5wq z(&f@}9B4aIpWzAs;kILirF{>poJm926X(=yHSr%7!rmF;Ov)pyf0IJ`U%5AGS2s7~ zcptf^=S)HM#FQVQ`E|6xz^`dkcB|xP(R7`(c19rQUj~J|mU4BaM<3Zn9`82*BSl8~ zlW0{J_lp&Rj$fK_d|TerA5is?Uousi8tA-?}dE}a4vah1#>f7My{7OP1Hbuak{<5v)1=vB4e|O+>0ffX(*p7MoPtwC^GSwX0IZK{oT z-|!T0vY8B2iiu`@GVlf9;v7dRsVGPAr(!*NhbCs>F<9mG*a-sTE*$$j0{ z3THw@4HJ%fhrRCC#ZkVnvJ-P#n3Y5h4TZrYCbn9E@}`WKoyXm!H_&Vw7@ZWKx{zt` zvA^qEym9+vDQ2{5G4%AysNNxqJVtGyd!UEel-T+Auj`-b&lGcFCYE9NIJ^GisKFTx zPbi_ha~}@U*`TK|6J#yhl!9ib;zJB~kG@!1+P9sSI*PCmcKkA2q6)N!ldXl5%Zn

yLUZ<*h3qOP2 z??R9Oc`6NQ);)VHUA~~DZ-?TdD(ZV<5!Otj9QlFYh~<9=VopCZ%HE*oiyZC$5*6oxuHmTl z@M3DOg4J|XK!IH?YQX1AcL#94Te~*OJwcu46BbiXA@c(^mk8W7H*U^c>#e@ZcA!Ib zSj8={l>f@ws)DrW9hvviivUROgG&$IdqYV7Q9+bz4Uj=}K-A!=oJ-5%@!i=(CDjKq z-bsS;U&PzL{Y(z+W1y!>+R*(#aBAM+b0K4}!Uxpkc+TEk9P*_)ZdN5&6Qu$&6)XMz>dbDg$G~l-zE)e!!QsLJ__69bwNcw|I|GJSl9%YCRZNq2ZZxzLMe6$H8wv=^13U^LQDSs zdcXe4?%hE)yYh36eEVtrwVd0)>BO+vlZsX8@ow*;zy>%SrEa>Q7OT;`{9diH+My*0 zinGWiHmt>6;6ZzS36PGjrX;MIkZR|*uvQKphQhGv z(zy#ayn`fFjL=V>2ywP3_m3OYqN(lx+muwfd+Z2q8^3}n-nuV^aUk=pE#hT15tEly zin*IfL#Xp?e4C%mSo<((`kG0$hm(|s;iKi^w*41~YPe8?ecy(+Hu++ArN0wtf)HsI z)-ous`a1A`{qVkiXo~Vw>|D(-ZoWu#s(bD!$rb>(F#q(+ygbe5Gg9%hVAB?Oq}vF2 z$%9IPLxV=!PmrRPeKT}^^6nf2&~#YMqI(jz?^L>#Cm z*$*U=+j1%0Q*)P6A@l~-8;%-!emj`_^;rlQIiZBwC}P=h&5(|LR@ABu5xcIg^X+C& z36`;}qouHca|pC)Gb|0yR~|*>lyCm@Y=XG9%dA_Yymd_v9i7Tu43`~YZHJzCq1Gs2{|IY11OuG<7qzXKJjVe- z35C64LTsYY(?T&5_FKd8YIpGB^a@`We2nHZP@=qP(xaK;A)9NYe zP5s2aqyfbdDuiAV(8~n9tboHgDlRoj_bHMa>`MSC4}c0DC1Ma1k%@pWsBL7mg;jnTv?l` zMZ+vxM|ZuEVS)Q((Tqsxvu|l^_LnEKv0osuKglcak#}ZtRQgcM!MwgVMBjccn}isV zoZk#f8(lOGbIo@S=E7F#Lj5DVXDEa3HH3br3AYxqmHHe-B*wst2w z_!K*(D!KIJpq{G?BG;3`f`aU0)9V9=bMnm`V6b=*7VAgY-O%D%F>lM;D|?dYv{hc_ zIIF|dYo)o(%ATldkul(@)szcrG!HZt-@zJjBWbP@R;~$dXcrL$%G@A{!%4FbAV(!Zt5rv?O1;Feh(DEji8`&H;9Ih_Y8D;pxu9jpOM=x_5JQ_jcY zVq@O}gWm4?AXfRnqMfg>_w4e}o@1m1B?*TP-EPrQ{FrAK7OY++PI-EM{$-Rs3<6oW z*G=%CcEbr}un7FlSjja{sZu4sOQ}=mmoN9`uEA_#)DfRARKvyOn)K3>aVZXV_?9@J zoxXBGlX@|rCXt5W4tX(f+NniXJL0zNfAknpegBv<$FT@`eedLPOBeVymsvT}xEW1+J4$k({@K@uvo35hj*AZS(cx?!;}P=32avsvNGf z9S?l%PQYu(71?)7Pa!Uu@KAs~n}|glJ@K2e=&s$OW7s_c!o~Bp{2yL0UtZ{+565oR z6*U=`Hk7}*@uQ@^#%l$~5oH3|>5|WRb8GIvXcE8M5SpgPqN26X!Ry&C(Vyx>eQq9$ z#`_C+45lr11os$G@Ii+RDq;>KRYnEagSONw3fiiwz-aM_Txqm97m+dHtwVx2jbYvz z971&#rF)be;rP@D#vB86`vi^^+$0&3b6($gH|UajKb0#Z+|AwrNVeKSeGG&OlG45E6R#S55vMpmDDDbAOhu=S)-#TW43b8dU zC3z8G=|zlJ-%9pOtpqh7Hk9z4(QSiEB-l2^`F?R!`STP00z^JXY$v`2vMc8kuj60i zj)J@>6sQNqvxLkxg;%Rs5SC`S)YVjTXWv=wlH(tj3ZKxz*RN6i3M;OF+Y;i@z?J$* zN$imuD4P{K$iSN^+FB|#`_r$29d#kabT0z&GZT295YnTsuB7b_8#q?q%J*oLNf1by zHvp7jD+c1)w?ovjgMqN^;nhkTjp`rTY<;ueP#lI@Y6K+XaPNy~4x14SSQjTh-j*^iZo1wfP8qbwlHXA^iH69aY0#&;!u#S!8V$EZTYZq2*%g$$1luz%`ysZ(z6IKpDlO+GL=J+ zpYMrLMAcR0Agn8R4`=>XD-Isz5=s|QVz{=)L7*a-;YtDq^N;zBo%>q`p#j zNXNht;QR22LZ!%;&plSR&N(oGPDvw~aAn{iro!0(etah7

XAP($WcVKSH!d)uBj z_8mgaWzt+{+|iHfIDcra8=IsA!i^tov<6;``2hRFNV6Lt{&}`@i(hW8CV|oUx8rUs zqC!KD!~lI8vF~w_VV!a@Dwb&M05F?X7QzJ2rPRhTVoNA}oS6~GDmVDi$@I14(`r=U z#j>zJ+$f&5lhx)3>1?I;RD&sCC7p~N#gW#+~<0`+>!aF&6eC^9a${i3VakN#mf9NipV*;fibwfh=ox_&}%@=>nxq2MKNU z3x{gnEd;7!1bvsMOpeFsq)4nZFMI-zcpW{R#f&9(-Gq+ppO;rcxDhUWqrZfkKhpm1 z|Dkw~%6Uw3Xl>fuML)#BN(EI>VRV7r>bl@yWfG-wGkoa@V-+1AuFhx$Yt+r)qxr3` zOFI)!X)OnuvxX>}K+@X;9P-Wc_mVp7&_n6S@Oi^!>1#ssmRS9nOR~3xqt)YaS^+;e z>f8qXSDp;_?x9E1%KZr3HITyskF&n_k~e02;!|3>Pi93iI2Bg7=BbB1zV`BsOwzy* z5c@MiL(+Z+p1kpH?YYq*wThf=aNAnGM=~9AzU%8 zUpcP={PKP5e(+oRQiY!$?+a|D8vIaqaC}^5 zhoZg_yuh7zch@@VAf7jZkI7G*U(UVGIk!x328?-=UC^RSesChUQAXJL0A0~1hqaplca>K@!j3$0+S8#l(r4y6M5X1*JL5(y@XJI)|8zV`M zu@tVsY%-33n$VWD?qLiY4Pw2tZ?$6FKfPDe>^C$`Vvcuo>eO9mqVTg`oh z&3o4>H1i1%A;b`Eha|iB{WlT+Ut@p*VFD0pDc#8kF7bsDqh(r(p_~pQ{TKHNy$Y3mG+2!f-Xu352`iaLlIU4ML7KOW9>}*!gU?1Rk#Dm+$bw)LVaox&Umq3W3ck_mKLA;NFBPp*^DMvlEUfl zlCp28K;ue}s_khqGFdiy6YD0;OJQ6YgJN<(!h7b_nV^WCpjPQuOQ&WMzKlj4S@IG0 zfr(J!S~``PNQE% zq{#`0*iAKHiIJTgLI`8-{0Y^7a$P>VMGVGad$A%Ts&uG0xS~XKm9qsmP=<_-Edop< zmRVi;_i(#(5hfF}KjO(~7yNu+9A!2m8rhO&hmWEgNx8zy{B#AT(J7?m4|Hf(_Rc~W z%d4mG+(H1AX~pDeu8d&fh>7r%D3olRMx*0?C4)ahV=)vdO-l!nol^&AXYLiVj1Kpl z>HVSRJ-v^@Yac!Liwtv>8oJ<`dVpv9G0g17^i2-uKg5|JW2HL*GeXdnv+T}wvt*Qg zbC^ZM;sSwi@~~5{J;+e4iIdqxpdcMi38IMdioU`d@3se9gJ0DA-V zn919WaT&9oX5Bm+%wW0_{)%xancmX1xIKUBAbstURG+8s#fv{i$S5jNVanhVPJ~G5 zO~P31u{_|a}z-p z&6Bs%?!?|y*6BpH5m~kfZ%r{qF)TP7+yZs}O+vw2x-b&CR^Dq>Z;DB;f$dt z;}Sev>nIoYg>2stvJ34baYl_kgx6*&70?4|vWI2a@5f+Wk<7+c!LD`{wVj~FC}4LZ zpPci-M8``d?i7uGOQVmxRI*$3_holA~3e#oHM4?LPfMoSaM5UI^> z$rxU~YD{x_w_O9suG*gJ5i{e{P7IzyphBLJGNFALS;*J>@b(4SO_P1Hhg~c#(+21A zQvcO}rv68zF6%epqMpKZfF3jiQZrO>sz4PWx>pTGN?Phj!`N-d)W@NLRZ5|}!s{^-y9!U_hFRZFk#dY@(g z{BM8xyZ_0{Q_KO(czMd4`#TW!KYdCYv~hgH(u9xx+s8K^c(mCFby8Qszd*GfroWW`ajK>vGn||?ORna)IH5YOp+|qICzr3`rmbwr;qQKq%?he zr;qPf1oU(nH(kbk)j#?Fp@s2y!lc#RaK?-o^yh}H-RSO>&Ai6;-}FybaJ` + +### Writing Your First Training Loop +The following steps will walk you through how you can create a sample GPT model split across tensors (Tensor model parallel ) on 2 GPUS, and run a forward pass through it using a MockGPT dataset helper class that we created in Megatron core. + +
+ +**NOTE: All of the following steps are already put into a script [run_simple_mcore_train_loop.py](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/run_simple_mcore_train_loop.py) which you can run as follows** +``` +PYTHONPATH=$PYTHON_PATH:./megatron torchrun --nproc-per-node 2 examples/run_simple_mcore_train_loop.py +``` + +
+ +**STEP 1 - Initialize Distributed Training and Model parallel setup** +The following utility when called initalizes your distributed setup. + +```python +import os +import torch +from megatron.core import parallel_state + +def initialize_distributed(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1): + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size) +``` +
+ +**STEP 2 - GPT Model Setup** +The following step shows you how you can quickly create a GPT model. For a list of other configs that you can pass into the model look into [transformer_config.py](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/transformer/transformer_config.py) +``` +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec + +def model_provider(): + """Build the model.""" + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + use_cpu_initialization=True, + pipeline_dtype=torch.float32) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=100, + max_sequence_length=64) + + return gpt_model +``` +
+ +**STEP 3 - GPT Mock dataset setup** +The following shows you how you can quickly get started with a mock dataset utility we created. In order to train with your data, please use the actual GPTDataset class in [gpt_dataset.py](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/datasets/gpt_dataset.py) + +To find more information about megatron core data pipeline please refer to [this](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/datasets/readme.md?ref_type=heads) + +``` +import torch +from torch.utils.data import DataLoader + +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset +from megatron.training.tokenizer.tokenizer import _NullTokenizer +from megatron.core.datasets.utils import compile_helpers + +_SEQUENCE_LENGTH = 64 + +def get_train_data_iterator(): + if torch.distributed.is_available() and torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + compile_helpers() + torch.distributed.barrier() + else: + compile_helpers() + + config = GPTDatasetConfig( + random_seed=0, + sequence_length=_SEQUENCE_LENGTH, + reset_position_ids=False, + reset_attention_mask=False, + eod_mask_loss=False, + tokenizer=_NullTokenizer(vocab_size=_SEQUENCE_LENGTH), + ) + + datasets = BlendedMegatronDatasetBuilder( + MockGPTDataset, [1000, None, None], lambda: True, config + ).build() + + train_dataloader = DataLoader(datasets[0], batch_size=8, shuffle=True) + + train_iterator = iter(train_dataloader) + + return train_iterator + +``` +
+ +**STEP 4 - Forward Step Function** +In megatron core, we use [schedules.py](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/pipeline_parallel/schedules.py) to run the model. So it is sufficient to define a forward step function which takes as input the data iterator and the model and produces as output the output tensor and a loss function + +```python +from functools import partial + +def forward_step_func(data_iterator, model): + + def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor): + + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + # If you have data parallel reduce loss across data parallel groups. + # If pipeline parallel, loss computation is done only in last stage. + + return loss, {'lm loss': loss} + + data = next(data_iterator) + tokens = data['tokens'].to(device) + attention_mask = data['attention_mask'].to(device) + position_ids = data['position_ids'].to(device) + labels = data['labels'].to(device) + loss_mask = data['loss_mask'].to(device) + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) +``` +
+ +**STEP 5 - Load and Save Distributed Checkpoint** +Megatron core uses distributed checkpoint for loading and saving model. This gives you the flexiblity to convert model from one model parallel setting to another when you load a model (i.e A model trained with tensor parallel size 2, can now be loaded as tensor model parallel size 4 etc.) + +```python +from megatron.core import dist_checkpointing + +def save_distributed_checkpoint(checkpoint_path, gpt_model): + sharded_state_dict = gpt_model.sharded_state_dict(prefix='') + dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + +def load_distributed_checkpoint(checkpoint_path, gpt_model): + sharded_state_dict=gpt_model.sharded_state_dict(prefix='') + checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + gpt_model.load_state_dict(checkpoint) + return gpt_model +``` +
+ +**STEP 6 - Main Function** +The following is the main function that needs to go into your script. + +```python +from pathlib import Path +from torch.optim import Adam +from megatron.core.pipeline_parallel.schedules import get_forward_backward_func +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed + +if __name__ == "__main__": + initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) + model_parallel_cuda_manual_seed(123) + + gpt_model = model_provider() + device = torch.device("cuda") + gpt_model.to(device) + + optim = Adam(gpt_model.parameters()) + + train_iterator = get_train_data_iterator() + + forward_backward_func = get_forward_backward_func() + + # Running the model for 5 iterations + for _ in range(5): + optim.zero_grad() + + losses_reduced = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=train_iterator, + model=gpt_model, + num_microbatches=1, + seq_length=64, + micro_batch_size=8, + decoder_seq_length=64, + forward_only=False) + + optim.step() + + print(f'Losses reduced : {losses_reduced}') + + # Saving the model + save_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path='/workspace/ckpt') + + # Loading the model + gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path='/workspace/ckpt') + gpt_model.to(device) + print('Successfully loaded the model') +``` +
+ + + +### Extending Further +The above example introduced you to a basic training loop in MCore. To see more advanced examples please look at [pretrain_gpt.py]. That will show you how you can write more complex training loops, involving pipeline parallel, context parallel, rope embeddings, mixture of experts and all other functionalities present in mcore. diff --git a/megatron/core/README.md b/megatron/core/README.md new file mode 100644 index 0000000..38970b0 --- /dev/null +++ b/megatron/core/README.md @@ -0,0 +1,14 @@ +# Megatron-Core + +Megatron-Core is an open-source PyTorch-based library that contains GPU-optimized techniques and cutting-edge system-level optimizations. It abstracts them into composable and modular APIs, allowing full flexibility for developers and model researchers to train custom transformers at-scale on NVIDIA accelerated computing infrastructure. This library is compatible with all NVIDIA Tensor Core GPUs, including FP8 acceleration support for [NVIDIA Hopper architectures](https://www.nvidia.com/en-us/data-center/technologies/hopper-architecture/). + +Megatron-Core offers core building blocks such as attention mechanisms, transformer blocks and layers, normalization layers, and embedding techniques. Additional functionality like activation re-computation, distributed checkpointing is also natively built-in to the library. The building blocks and functionality are all GPU optimized, and can be built with advanced parallelization strategies for optimal training speed and stability on NVIDIA Accelerated Computing Infrastructure. Another key component of the Megatron-Core library includes advanced model parallelism techniques (tensor, sequence, pipeline, context, and MoE expert parallelism). + +Megatron-Core can be used with [NVIDIA NeMo](https://www.nvidia.com/en-us/ai-data-science/products/nemo/), an enterprise-grade AI platform. Alternatively, you can explore Megatron-Core with the native PyTorch training loop [here](https://github.com/NVIDIA/Megatron-LM/tree/main/examples). Visit [Megatron-Core documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html) to learn more. + +## Quick links + +- [Benchmark using NVIDIA NeMo](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html#performance-benchmarks) +- [Multimodal example (LLaVA training pipeline)](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/multimodal) +- [Mixture-of-Experts](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/transformer/moe) +- [Training Mamba-based Language Models](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/mamba) diff --git a/megatron/core/README_STRAGGLER.md b/megatron/core/README_STRAGGLER.md new file mode 100644 index 0000000..fe9062c --- /dev/null +++ b/megatron/core/README_STRAGGLER.md @@ -0,0 +1,93 @@ +## StragglerDetector for a TP Group + +The file `megatron/core/utils.py` has a class named `StragglerDetector` which supports Python Contexts. +It can be used to find straggling TP group based on the RTT of the ranks in the TP Group. It also collects +Power/Temp/Utilization for GPUs, which can additionally be used to narrow down to the exact GPU in the TP Group, +assuming the straggling was caused by hardware anomaly in a given GPU.
+This class supports collecting timing events for various steps of a given iteration. It +keeps collecting such timing events on a per rank basis, and when the reporter is invoked +during a logging interval, it computes the min and max of certain metric across all +ranks and logs the observed metric and the rank as follows + +``` + 0: INFO:megatron.core.utils:[2024-03-14 23:07:56] | MnRtt/Rnk: 3453.08ms/8 | MxRtt/Rnk: 3468.20ms/0 | MnPwr/Rnk: 601796W/8 | MxPwr/Rnk: 683801W/18 | MnTmp/Rnk: 52C/0 | MxTmp/Rnk: 65C/21 | MnUtl/Rnk: 97%/8 | MxUtl/Rnk: 100%/6 | MnClk/Rnk: 1950MHz/28 | MxClk/Rnk: 1980MHz/0 | MnDRtt/Rnk: 14.27ms/23 | MxDRtt/Rnk: 34.65ms/3 | MnEtpt/Rnk: 296.02TF/0 | MxEtpt/Rnk: 297.32TF/8 +``` +


+ +### Description of the metrics + +Each metric is prefixed with `Mn` or `Mx` to represent `Minimum` or `Maximum`. Each metric is also suffixed with the rank where the metric was measured. The metrics are averaged over the logging interval. Between the prefix and the rank is the name of the metric as follows + +- Rtt : RoundTrip Time (time spent in all the traced ops per iteration) +- Pwr : GPU Power +- Tmp : GPU Temperature +- Utl : GPU Utilization +- Clk : GPU Clock +- DRtt: get_batch latency +- Etpt: Estimated throughput. This is derived from actual computed throughput dividied by Rtt. Since we do not collect timing for backward pass, the value is further divided by three to come up with estimated throughput. +
+ +### Command Line activation +To start using the StragglerDetector, need to pass the following argument `--log-straggler`. It optionally also takes two additional parameters. Default disabled +- `--disable-straggler-on-startup` - whether to keept the StragglerDetector disabled on startup and enable later. Default enabled +- `--straggler-ctrlr-port` - The StragglerDetector can toggle between on/off just by sending `curl Rank0Host:port`. Default port is 65535. Every time it is turned +- `--straggler-minmax-count` - If set to > 1 (N), it prints N Top and Bottom Etpt/Rank pairs as shown below +``` + 0: INFO:megatron.core.utils:^^^^ Bottom 4 Ranks with lowest Etpt(TF): 296.02/0, 296.17/2, 296.23/1, 296.23/4, + 0: INFO:megatron.core.utils:^^^^ Top 4 Ranks with highest Etpt(TF): 297.28/15, 297.28/11, 297.32/12, 297.32/8, +``` +
+ +### Programming the StragglerDetector +The StragglerDetector class supports context, and its implementation is a Singleton. +- Initialization + +``` + # initialization, where StragglerDetector will be used + from megatron.core.utils import StragglerDetector + stimer = StragglerDetector() +``` + +- One time for each rank + +``` + # one time before the training loop starts + stimer.configure(world, rank, enabled=True, port=65545) + + # Arguments to configure + # world : World Size + # rank : The rank of this trainer + # mmcnt : (Optional) Number of ranks to print for showing Min/Max Etpt + # amp : (Optional) Set to 3.0 if we only use timers in fwd pass + # port : (Optional) control port, useful only for rank-0 + # prefill : (Optional) howmany Events to pre-populate + # enabled : (Optional) whether or not collection is enabled on startup +``` + +- To Capture time + +``` + # whereever timing need to be captured + with stimer: + do_operation() + + # special case for get_batch + with stimer(bdata=True): + input,... = get_batch(iterator,...) +``` + +- Logging in main training loop + +``` + # logging + total_flops = 0.0 + iteration = 0 + # inside the main training loop + while training: + iteration += 1 + do_step() + total_flops += get_computed_flops() + if iteration % log_interval: + stimer.report(total_flops, log_interval) + total_flops = 0.0 +``` diff --git a/megatron/core/__init__.py b/megatron/core/__init__.py new file mode 100644 index 0000000..902bdd9 --- /dev/null +++ b/megatron/core/__init__.py @@ -0,0 +1,35 @@ +import megatron.core.tensor_parallel +import megatron.core.utils +from megatron.core import parallel_state +from megatron.core.distributed import DistributedDataParallel +from megatron.core.inference_params import InferenceParams +from megatron.core.model_parallel_config import ModelParallelConfig +from megatron.core.num_microbatches_calculator import init_num_microbatches_calculator +from megatron.core.package_info import ( + __contact_emails__, + __contact_names__, + __description__, + __download_url__, + __homepage__, + __keywords__, + __license__, + __package_name__, + __repository_url__, + __shortversion__, + __version__, +) +from megatron.core.timers import Timers + +# Alias parallel_state as mpu, its legacy name +mpu = parallel_state + +__all__ = [ + "parallel_state", + "tensor_parallel", + "utils", + "DistributedDataParallel", + "InferenceParams", + "init_num_microbatches_calculator", + "ModelParallelConfig", + "Timers", +] diff --git a/megatron/core/datasets/Makefile b/megatron/core/datasets/Makefile new file mode 100644 index 0000000..8f9db76 --- /dev/null +++ b/megatron/core/datasets/Makefile @@ -0,0 +1,9 @@ +CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color +CPPFLAGS += $(shell python3 -m pybind11 --includes) +LIBNAME = helpers +LIBEXT = $(shell python3-config --extension-suffix) + +default: $(LIBNAME)$(LIBEXT) + +%$(LIBEXT): %.cpp + $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ diff --git a/megatron/core/datasets/__init__.py b/megatron/core/datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/datasets/bert_dataset.py b/megatron/core/datasets/bert_dataset.py new file mode 100644 index 0000000..657cc6a --- /dev/null +++ b/megatron/core/datasets/bert_dataset.py @@ -0,0 +1,199 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import numpy + +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.masked_dataset import ( + MaskedWordPieceDataset, + MaskedWordPieceDatasetConfig, +) +from megatron.core.datasets.utils import Split + + +@dataclass +class BERTMaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig): + """Configuration object for Megatron Core BERT WordPiece datasets""" + + classification_head: bool = None + """Option to perform the next sequence prediction during sampling""" + + def __post_init__(self) -> None: + """Do asserts and set fields post init + """ + super().__post_init__() + + assert self.classification_head is not None + + +class BERTMaskedWordPieceDataset(MaskedWordPieceDataset): + """The BERT dataset that assumes WordPiece tokenization + + Args: + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset + + dataset_path (str): The real path on disk to the dataset, for bookkeeping + + indexed_indices (numpy.ndarray): The set of the documents indices to expose + + num_samples (Optional[int]): The number of samples to draw from the indexed dataset. When None, build as many samples as correspond to one epoch. + + index_split (Split): The indexed_indices Split + + config (BERTMaskedWordPieceDatasetConfig): The config + """ + + def __init__( + self, + indexed_dataset: IndexedDataset, + dataset_path: str, + indexed_indices: numpy.ndarray, + num_samples: Optional[int], + index_split: Split, + config: BERTMaskedWordPieceDatasetConfig, + ) -> None: + super().__init__( + indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config + ) + + self.token_lookup = list(self.config.tokenizer.inv_vocab.keys()) + # Account for the single and two token ids + self.sample_index = self._build_sample_index( + self.config.sequence_length - 3, 2 if self.config.classification_head else 1 + ) + + @staticmethod + def _key_config_attributes() -> List[str]: + """Inherited method implementation + + Returns: + List[str]: The key config attributes + """ + return super( + BERTMaskedWordPieceDataset, BERTMaskedWordPieceDataset + )._key_config_attributes() + ["classification_head",] + + def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: + """Abstract method implementation + + Args: + idx (int): The index into the dataset + + Returns: + Dict[str, Union[int, numpy.ndarray]]: The + """ + idx_beg, idx_end, target_sequence_length = self.sample_index[idx] + sample = [self.dataset[i] for i in range(idx_beg, idx_end)] + numpy_random_state = numpy.random.RandomState( + seed=(self.config.random_seed + idx) % 2 ** 32 + ) + + assert target_sequence_length <= self.config.sequence_length + + # Split the sample into contiguous subsegments A and B + pivot = len(sample) + is_next_random = False + if self.config.classification_head: + assert len(sample) > 1, "the sample must contain at least two sentences" + pivot = 1 + if len(sample) >= 3: + pivot = numpy_random_state.randint(low=1, high=len(sample)) + is_next_random = numpy_random_state.random() < 0.5 + split_A = [] + for sample_a in sample[:pivot]: + split_A.extend(sample_a) + split_B = [] + for sample_b in sample[pivot:]: + split_B.extend(sample_b) + if is_next_random: + split_A, split_B = split_B, split_A + + # Trim the subsegments from either end to a desired joint length + length_A = len(split_A) + length_B = len(split_B) + if length_A + length_B <= target_sequence_length: + truncated = False + else: + while length_A + length_B > target_sequence_length: + split = split_A if length_A > length_B else split_B + if numpy_random_state.random() < 0.5: + del split[0] + else: + del split[-1] + length_A = len(split_A) + length_B = len(split_B) + truncated = True + + # Merge the subsegments and create the token assignment labels + tokens = [ + self.config.tokenizer.cls, + *split_A, + self.config.tokenizer.sep, + ] + assignments = [0 for _ in range(1 + len(split_A) + 1)] + if split_B: + tokens += [*split_B, self.config.tokenizer.sep] + assignments += [1 for _ in range(len(split_B) + 1)] + + # Masking + tokens, masked_positions, masked_labels, _, _ = self._create_masked_lm_predictions( + tokens, target_sequence_length, numpy_random_state + ) + + # Pad the sequences and convert to NumPy + length_toks = len(tokens) + length_pads = self.config.sequence_length - length_toks + assert length_pads >= 0 + + tokens = numpy.array(tokens, dtype=numpy.int64) + tokens = numpy.pad(tokens, (0, length_pads), constant_values=self.config.tokenizer.pad) + + assignments = numpy.array(assignments, dtype=numpy.int64) + assignments = numpy.pad( + assignments, (0, length_pads), constant_values=self.config.tokenizer.pad + ) + + # Get the padding mask + mask_pads = numpy.ones(length_toks, dtype=numpy.int64) + mask_pads = numpy.pad( + mask_pads, (0, length_pads), constant_values=self.config.tokenizer.pad + ) + + # Mask the labels + labels = numpy.zeros(self.config.sequence_length, dtype=numpy.int64) - 1 + labels[masked_positions] = masked_labels + + # Get the loss mask + mask_loss = numpy.zeros(self.config.sequence_length, dtype=numpy.int64) + mask_loss[masked_positions] = 1 + + return { + "text": tokens, + "types": assignments, + "labels": labels, + "is_random": int(is_next_random), + "padding_mask": mask_pads, + "loss_mask": mask_loss, + "truncated": int(truncated), + } + + def _get_token_mask(self, numpy_random_state: numpy.random.RandomState) -> Optional[int]: + """Abstract method implementation + + 80% of the time, replace the token id with mask token id. 10% of the time, replace token id + with a random token id from the vocabulary. 10% of the time, do nothing. + + Args: + numpy_random_state (RandomState): The NumPy random state + + Returns: + Optional[int]: The replacement token id or None + """ + if numpy_random_state.random() < 0.8: + return self.config.tokenizer.mask + else: + if numpy_random_state.random() >= 0.5: + return self.token_lookup[numpy_random_state.randint(0, len(self.token_lookup))] + return None diff --git a/megatron/core/datasets/blended_dataset.py b/megatron/core/datasets/blended_dataset.py new file mode 100644 index 0000000..f262b05 --- /dev/null +++ b/megatron/core/datasets/blended_dataset.py @@ -0,0 +1,205 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import hashlib +import json +import logging +import os +import time +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple, Union + +import numpy +import torch + +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.megatron_dataset import MegatronDataset +from megatron.core.datasets.utils import normalize +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + +_VERBOSE = False + + +class BlendedDataset(torch.utils.data.Dataset): + """Conjugating class for a set of MegatronDataset instances + + Args: + datasets (List[MegatronDataset]): The MegatronDataset instances to blend + + weights (List[Union[int, float]]): The weights that determine the dataset blend ratios + + size (Optional[int]): The number of samples to draw from the blend. If None, for each dataset index idx draw exactly weights[idx] samples from datasets[idx]. + + config (BlendedMegatronDatasetConfig): The config + + Raises: + RuntimeError: When the dataset has fewer or more samples than 'size' post-initialization + """ + + def __init__( + self, + datasets: List[MegatronDataset], + weights: List[Union[int, float]], + size: Optional[int], + config: BlendedMegatronDatasetConfig, + ) -> None: + assert len(datasets) == len(weights) + assert len(datasets) < 32767 + assert all(map(lambda _: type(_) == type(datasets[0]), datasets)) + assert all(map(lambda _: _.index_split == datasets[0].index_split, datasets)) + assert all(map(lambda _: _ > 0, weights)) + assert all(map(lambda _: type(_) == type(weights[0]), weights)) + if size is None and isinstance(weights[0], float): + assert all(map(lambda _: _ == int(_), weights)) + + # Alert user to unnecessary blending + if len(datasets) == 1: + log_single_rank( + logger, logging.WARNING, f"Building a BlendedDataset for a single MegatronDataset" + ) + + if size is not None: + weights = normalize(weights) + + self.datasets = datasets + self.split = self.datasets[0].index_split + self.weights = weights + self.size = size + self.config = config + + unique_identifiers = OrderedDict() + unique_identifiers["class"] = type(self).__name__ + unique_identifiers["datasets"] = [dataset.unique_identifiers for dataset in self.datasets] + unique_identifiers["split"] = self.split.name + unique_identifiers["weights"] = self.weights + unique_identifiers["size"] = self.size + + self.unique_description = json.dumps( + unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers + ) + self.unique_description_hash = hashlib.md5( + self.unique_description.encode("utf-8") + ).hexdigest() + + self.built_anew_on_cache_miss = False + + self.dataset_index, self.dataset_sample_index = self._build_indices() + + def __len__(self) -> int: + return self.dataset_index.shape[0] + + def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: + dataset_id = self.dataset_index[idx] + dataset_sample_id = self.dataset_sample_index[idx] + return { + "dataset_id": dataset_id, + **self.datasets[dataset_id][dataset_sample_id], + } + + def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]: + """Build and optionally cache the dataset index and the dataset sample index + + The dataset index is a 1-D mapping which determines the dataset to query. The dataset + sample index is a 1-D mapping which determines the sample to request from the queried + dataset. + + Returns: + Tuple[numpy.ndarray, numpy.ndarray]: The dataset index and the dataset sample index + """ + path_to_cache = self.config.path_to_cache + + if path_to_cache: + get_path_to = lambda suffix: os.path.join( + path_to_cache, + f"{self.unique_description_hash}-{type(self).__name__}-{self.split.name}-{suffix}", + ) + path_to_description = get_path_to("description.txt") + path_to_dataset_index = get_path_to("dataset_index.npy") + path_to_dataset_sample_index = get_path_to("dataset_sample_index.npy") + cache_hit = all( + map( + os.path.isfile, + [path_to_description, path_to_dataset_index, path_to_dataset_sample_index], + ) + ) + else: + cache_hit = False + + if not path_to_cache or (not cache_hit and torch.distributed.get_rank() == 0): + log_single_rank( + logger, + logging.INFO, + f"Build and save the {type(self).__name__} indices", + ) + self.built_anew_on_cache_miss = True + + # Build the dataset and dataset sample indexes + log_single_rank( + logger, logging.INFO, f"\tBuild and save the dataset and dataset sample indexes" + ) + t_beg = time.time() + from megatron.core.datasets import helpers + + if self.size is not None: + dataset_index = numpy.zeros(self.size, dtype=numpy.int16) + dataset_sample_index = numpy.zeros(self.size, dtype=numpy.int64) + helpers.build_blending_indices( + dataset_index, + dataset_sample_index, + self.weights, + len(self.datasets), + self.size, + _VERBOSE, + ) + else: + size = sum(self.weights) + dataset_index = numpy.zeros(size, dtype=numpy.int16) + dataset_sample_index = numpy.zeros(size, dtype=numpy.int64) + helpers.build_exhaustive_blending_indices( + dataset_index, dataset_sample_index, self.weights, len(self.datasets) + ) + + if path_to_cache: + os.makedirs(path_to_cache, exist_ok=True) + # Write the description + with open(path_to_description, "wt") as writer: + writer.write(self.unique_description) + # Save the indexes + numpy.save(path_to_dataset_index, dataset_index, allow_pickle=True) + numpy.save(path_to_dataset_sample_index, dataset_sample_index, allow_pickle=True) + else: + log_single_rank( + logger, + logging.WARNING, + f"Unable to save the {type(self).__name__} indexes because path_to_cache is None", + ) + + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + return dataset_index, dataset_sample_index + + log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} indices") + + log_single_rank( + logger, logging.INFO, f"\tLoad the dataset index from {path_to_dataset_index}" + ) + t_beg = time.time() + dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode='r') + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank( + logger, + logging.INFO, + f"\tLoad the dataset sample index from {path_to_dataset_sample_index}", + ) + t_beg = time.time() + dataset_sample_index = numpy.load( + path_to_dataset_sample_index, allow_pickle=True, mmap_mode='r' + ) + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + return dataset_index, dataset_sample_index diff --git a/megatron/core/datasets/blended_megatron_dataset_builder.py b/megatron/core/datasets/blended_megatron_dataset_builder.py new file mode 100644 index 0000000..4a4dd8d --- /dev/null +++ b/megatron/core/datasets/blended_megatron_dataset_builder.py @@ -0,0 +1,528 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import logging +import math +from concurrent.futures import ThreadPoolExecutor +from typing import Any, Callable, Iterable, List, Optional, Type, Union + +import numpy +import torch + +from megatron.core.datasets.blended_dataset import BlendedDataset +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset +from megatron.core.datasets.utils import Split, normalize +from megatron.core.parallel_state import get_virtual_pipeline_model_parallel_rank +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + +MidLevelDataset = MegatronDataset + +TopLevelDataset = Union[BlendedDataset, MidLevelDataset] + +DistributedDataset = Union[ + TopLevelDataset, MidLevelDataset, LowLevelDataset, torch.utils.data.Dataset +] + + +class BlendedMegatronDatasetBuilder(object): + """Builder class for the BlendedDataset and MegatronDataset classes + + Args: + cls (Type[MegatronDataset]): The class to instantiate, must inherit from MegatronDataset + + sizes (List[Optional[int]]): The minimum total number of samples to draw, or None, per split + + is_built_on_rank (Callable): A callable which returns True if the dataset should be built on the current rank and False otherwise. It should be Megatron Core parallelism aware i.e. global rank, local group rank, and virtual rank may inform its return value. + + config (BlendedMegatronDatasetConfig): The config object which informs dataset creation + """ + + def __init__( + self, + cls: Type[MidLevelDataset], + sizes: List[int], + is_built_on_rank: Callable, + config: BlendedMegatronDatasetConfig, + ): + self.cls = cls + self.sizes = sizes + self.is_built_on_rank = is_built_on_rank + self.config = config + + log_single_rank( + logger, + logging.INFO, + f"Building dataset splits with cls={cls.__name__}, sizes={self.sizes}, and config={self.config}", + ) + + if not self.config.mock: + for split in Split: + size_is_none = self.sizes[split.value] is None + if self.config.blend_per_split is None: + weights_are_none = self.config.blend[1] is None + else: + if self.config.blend_per_split[split.value] is None: + continue + weights_are_none = self.config.blend_per_split[split.value][1] is None + if size_is_none: + assert ( + weights_are_none + ), f"size_is_none => weights_are_none fails for {split.name} split" + + if torch.distributed.is_initialized(): + gb_rank = torch.distributed.get_rank() + vp_rank = get_virtual_pipeline_model_parallel_rank() + if gb_rank == 0 and (vp_rank == 0 or vp_rank is None): + assert ( + self.is_built_on_rank() + ), "is_built_on_rank must return True when global rank = 0 and vp rank = 0" + + def build(self) -> List[Optional[TopLevelDataset]]: + """Build all dataset splits according to the provided blend(s) + + This method is distributed-aware and must be called on all ranks. + + The dataset splits returned can vary according to the config. Supply config.blend and + config.split to build BlendedDataset and/or MegatronDataset splits from the same + distribution. Supply config.blend_per_split to build BlendedDataset and/or MegatronDataset + splits from separate distributions. In either case, for each split, handle the following + cases: + + (1) The split is None + - do nothing + + (2) The split has one contributing dataset, and... + + (a) 'size' is not None + - Build a mid-level dataset with low-level dataset sampling in proportion to the size + + (b) 'size' is None + - Build mid-level datasets with no excess low-level dataset sampling + + (3) The split has multiple contributing datasets, and... + + (a) 'weights' is not None and 'size' is not None + - Build mid-level datasets with low-level dataset sampling in proportion to their weights and the size + - Build a top-level dataset of length marginally greater than 'size' with mid-level dataset sampling in proportion to their weights and the size + + (b) 'weights' is not None and 'size' is None + - Error + + (c) 'weights' is None and 'size' is not None + - Build mid-level datasets with no excess low-level dataset sampling + - Build a top-level dataset of length 'size' with mid-level dataset sampling in proportion to their lengths and the size + + - The 'size' of the top-level dataset is capped at the sum of the mid-level dataset lengths + + (d) 'weights' is None and 'size' is None + - Build mid-level datasets with no excess low-level dataset sampling + - Build a top-level dataset with no excess mid-level dataset sampling + + Returns: + List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split + """ + datasets = self._build_blended_dataset_splits() + + for dataset in datasets: + if dataset is not None and len(dataset) > 0: + if isinstance(dataset, BlendedDataset): + if dataset.built_anew_on_cache_miss or any( + x.built_anew_on_cache_miss for x in dataset.datasets + ): + log_single_rank( + logger, + logging.INFO, + f"Verifying NumPy indices for {type(dataset).__name__} {dataset.split.name} split", + ) + else: + log_single_rank( + logger, + logging.INFO, + f"NumPy indices for {type(dataset).__name__} {dataset.split.name} split are fully cached, skipping verification", + ) + continue + # Check blend size + assert dataset.size is None or dataset.size == dataset.dataset_index.shape[0] + # Check blend access of mid-level datasets + _, sizes = numpy.unique(dataset.dataset_index, return_counts=True) + for i, dataset_and_size in enumerate(zip(dataset.datasets, sizes)): + if len(dataset_and_size[0]) < dataset_and_size[1]: + raise IndexError( + f"{type(dataset).__name__} blend goes out of bounds for {type([dataset_and_size[0]]).__name__} {i} for {dataset.split.name} split" + ) + + return datasets + + def _build_blended_dataset_splits( + self, + ) -> List[Optional[TopLevelDataset]]: + """Build all dataset splits according to the provided blend(s) + + See the BlendedMegatronDatasetBuilder.build alias for more information. + + Returns: + List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split + """ + ## + # Return fake "mock" datasets + ## + if self.config.mock: + split = self.config.split_matrix + try: + return self._build_megatron_dataset_splits(None, split, self.sizes) + except Exception as error: + raise Exception( + f"{self.cls.__name__} failed to build as a mock data generator" + ) from error + + ## + # All splits come from the same distribution + ## + elif self.config.blend: + prefixes, weights = self.config.blend + if weights is not None: + weights = normalize(weights) + + split = self.config.split_matrix + + # Blend consists of a single prefix + if len(prefixes) == 1 and weights is None: + return self._build_megatron_dataset_splits(prefixes[0], split, self.sizes) + + # Build the mid-level datasets + if weights is None: + sizes_per_dataset = [[None for split in Split] for prefix in prefixes] + else: + sizes_per_dataset = _get_size_per_split_per_dataset(weights, self.sizes) + + # build each dataset in parallel + megatron_datasets = self._build_megatron_datasets_parallel( + prefixes, split, sizes_per_dataset + ) + + # Build the top-level datasets + blended_datasets = [None] * len(Split) + for i in range(len(Split)): + if split[i] is not None: + weights_i = weights + if weights_i is not None and self.sizes[i] is not None: + size_i = sum(list(zip(*sizes_per_dataset))[i]) + elif weights_i is None: + try: + weights_i = [ + len(megatron_dataset) for megatron_dataset in megatron_datasets[i] + ] + except TypeError: + weights_i = [0 for _ in prefixes] + if self.sizes[i] is not None: + size_i = min(self.sizes[i], sum(weights_i)) + else: + size_i = None # => the size will be sum(weights_i) + else: + raise RuntimeError + blended_datasets[i] = self.build_generic_dataset( + BlendedDataset, + self.is_built_on_rank, + True, # synchronize_ranks, default behavior to build on rank-0 first + megatron_datasets[i], + weights_i, + size_i, + self.config, + ) + + return blended_datasets + + ## + # Each split comes from a separate distribution + ## + else: + blended_datasets = [None] * len(Split) + for i in range(len(Split)): + split_spoof = [None] * len(Split) + split_spoof[i] = (0.0, 1.0) + sizes_spoof = [0] * len(Split) + sizes_spoof[i] = self.sizes[i] + + # Blend is provided for the split + blend = self.config.blend_per_split[i] + if blend is not None: + prefixes, weights = blend + if weights is not None: + weights = normalize(weights) + + # Blend consists of a sigle prefix + if len(prefixes) == 1: + blended_datasets[i] = self._build_megatron_dataset_splits( + prefixes[0], split_spoof, sizes_spoof + )[i] + continue + + # Build mid-level datasets + if weights is None: + sizes_per_dataset = [[None for split in Split] for prefix in prefixes] + else: + sizes_per_dataset = _get_size_per_split_per_dataset(weights, sizes_spoof) + + # build each dataset in parallel + megatron_datasets = self._build_megatron_datasets_parallel( + prefixes, split_spoof, sizes_per_dataset + )[i] + + # Build top-level dataset + if weights is not None and self.sizes[i] is not None: + size = list(map(sum, zip(*sizes_per_dataset)))[i] + elif weights is None: + try: + weights = [ + len(megatron_dataset) for megatron_dataset in megatron_datasets + ] + except TypeError: + weights = [0 for _ in prefixes] + if self.sizes[i] is not None: + size = min(self.sizes[i], sum(weights)) + else: + size = None # => the size will be sum(weights) + else: + raise RuntimeError + blended_datasets[i] = self.build_generic_dataset( + BlendedDataset, + self.is_built_on_rank, + True, # synchronize_ranks, default behavior to build on rank-0 first + megatron_datasets, + weights, + size, + self.config, + ) + + return blended_datasets + + def _build_megatron_datasets_parallel( + self, + prefixes: List[str], + split: List[float], + sizes_per_dataset: List[List[int]], + ) -> List[List[Optional[MegatronDataset]]]: + """Build the megatron datasets for a list of prefixes in parallel + + Args: + prefixes (List[str]): The list of prefix strings + + split (List[float]): The dataset split ratios (must sum to 1.00) + + sizes_per_dataset (List[List[int]]): The number of samples to request + per MegatronDataset per spilt + + Returns: + List[List[Optional[MegatronDataset]]]: For each split, have a list of + MegatronDataset per prefix + """ + + # Helper function to wrap the threading logic + def _threading_helper( + megatron_datasets: List[List[Optional[MegatronDataset]]], + num_workers: int, + prefixes: List[str], + split: List[float], + sizes_per_dataset: List[List[int]], + ) -> None: + with ThreadPoolExecutor(max_workers=num_workers) as executor: + all_futures = [] + for i in range(len(prefixes)): + all_futures.append( + executor.submit( + self._build_megatron_dataset_splits, + prefixes[i], + split, + sizes_per_dataset[i], + False, # synchronize_ranks, barrier is called in this function + ) + ) + for future in all_futures: + try: + megatron_datasets_split = future.result() + for j in range(len(megatron_datasets_split)): + megatron_datasets[j].append(megatron_datasets_split[j]) + except Exception as err: + raise err + return megatron_datasets + + megatron_datasets = [[] for _ in range(len(Split))] + num_dataset_builder_threads = self.config.num_dataset_builder_threads + + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + # First, build on rank 0 + if rank == 0: + num_workers = num_dataset_builder_threads + if num_workers > 1: + # since only rank 0 is running, scale up the thread count + # but not too much to avoid overloading storage on miss path. + # if user set num_dataset_builder_threads to 1, + # i.e. meant for serial build, do not scale up. + num_workers *= min(2, max(1, torch.cuda.device_count())) + _threading_helper( + megatron_datasets, + num_workers, + prefixes, + split, + sizes_per_dataset, + ) + + torch.distributed.barrier() + + # Then, build on other ranks; guaranteed to be data_cache hit + if rank != 0: + _threading_helper( + megatron_datasets, + num_dataset_builder_threads, + prefixes, + split, + sizes_per_dataset, + ) + else: + _threading_helper( + megatron_datasets, + num_dataset_builder_threads, + prefixes, + split, + sizes_per_dataset, + ) + + return megatron_datasets + + def _build_megatron_dataset_splits( + self, + dataset_path: Optional[str], + split: List[float], + sizes: List[int], + synchronize_ranks: bool = True, + ) -> List[Optional[MidLevelDataset]]: + """Build each MidLevelDataset split from a single LowLevelDataset + + Args: + dataset_path (Optional[str]): The path on disk which defines the underlying LowLevelDataset, or None for mock dataset classes + + split (List[Tuple[float, float]]): The dataset split matrix + + sizes (List[int]): The number of total samples to draw from each split + + synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level. + + Returns: + List[Optional[MidLevelDataset]]: The MidLevelDataset (or None) per split + """ + # Build the low level dataset + low_level_dataset = self.cls.build_low_level_dataset(dataset_path, self.config) + + # Build the split indices for the low level dataset + num_elements = self.cls.numel_low_level_dataset(low_level_dataset) + split_indices = [] + for i, _ in enumerate(Split): + if split[i] is not None: + beg = int(round(split[i][0] * float(num_elements))) + end = int(round(split[i][1] * float(num_elements))) + split_indices.append(numpy.arange(start=beg, stop=end, step=1, dtype=numpy.int32)) + else: + split_indices.append(None) + + # Build the mid level dataset + mid_level_datasets = [] + for i, _split in enumerate(Split): + if split[i] is None: + mid_level_datasets.append(None) + else: + mid_level_datasets.append( + self.build_generic_dataset( + self.cls, + self.is_built_on_rank, + synchronize_ranks, + low_level_dataset, + dataset_path, + split_indices[i], + sizes[i], + _split, + self.config, + ) + ) + + return mid_level_datasets + + @staticmethod + def build_generic_dataset( + cls: Union[Type[DistributedDataset], Callable], + is_built_on_rank: Callable, + synchronize_ranks: bool, + *args: Any, + ) -> Optional[Union[DistributedDataset, Iterable]]: + """Build the DistributedDataset + + Return None if and only if the underlying dataset class is not built on the current rank + and torch.distributed is initialized. + + Args: + cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be built. In special cases, e.g. when we are building the low level dataset for a RawMegatronDataset instance, we can accept a Callable which returns an Iterable. + + synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level. + + args (Tuple[Any]): The positional arguments used to build the provided DistributedDataset class + + Raises: + Exception: When the dataset constructor raises an OSError + + Returns: + Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantion, the Iterable instantiation, or None + """ + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + + dataset = None + + # First, build on rank 0 + if rank == 0 and is_built_on_rank(): + try: + dataset = cls(*args) + except OSError as err: + log = ( + f"Failed to write dataset materials to the data cache directory. " + + f"Please supply a directory to which you have write access via " + + f"the path_to_cache attribute in BlendedMegatronDatasetConfig and " + + f"retry. Refer to the preserved traceback above for more information." + ) + raise Exception(log) from err + + if synchronize_ranks: + torch.distributed.barrier() + + # After, build on other ranks + if rank != 0 and is_built_on_rank(): + dataset = cls(*args) + + return dataset + + return cls(*args) + + +def _get_size_per_split_per_dataset( + normalized_weights: List[float], target_size_per_split: List[int] +) -> List[List[int]]: + """Determine the contribution of the MegatronDataset splits to the BlendedDataset splits + + Args: + normalized_weights (List[float]): e.g. [0.3, 0.7] + + target_size_per_split (List[int]): The number of samples to target for each BlendedDataset split + + Returns: + List[List[int]]: The number of samples to request per MegatronDataset per split + """ + assert numpy.isclose(sum(normalized_weights), 1.0) + + # Use 0.5% target margin to ensure we satiate the request + sizes_per_dataset = [ + [int(math.ceil(target_size * weight * 1.005)) for target_size in target_size_per_split] + for weight in normalized_weights + ] + + return sizes_per_dataset diff --git a/megatron/core/datasets/blended_megatron_dataset_config.py b/megatron/core/datasets/blended_megatron_dataset_config.py new file mode 100644 index 0000000..10cd590 --- /dev/null +++ b/megatron/core/datasets/blended_megatron_dataset_config.py @@ -0,0 +1,172 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import functools +import logging +import re +from dataclasses import dataclass, field +from typing import List, Optional, Tuple + +from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer +from megatron.core.datasets.utils import Split, log_single_rank, normalize + +logger = logging.getLogger(__name__) + + +@dataclass +class BlendedMegatronDatasetConfig: + """Configuration object for Megatron Core datasets""" + + random_seed: int + """The seed for all RNG during dataset creation.""" + + sequence_length: int + """The sequence length.""" + + blend: Optional[Tuple[List[str], Optional[List[float]]]] = None + """The blend, consisting of a list of dataset prefixes and optionally a list of dataset + weights. For example, [["dataset-path1", "dataset-path2"], [0.3, 0.7]]. When the weights are + None, they are inferred from the lengths of the contributing datasets. Not to be used with + 'blend_per_split'. Defaults to None. + """ + + blend_per_split: Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]] = None + """A set of blends, as defined above, one for each split distribution. Not to be used with + 'blend'. Defauls to None. + """ + + split: Optional[str] = None + """The split string, a comma separated weighting for the dataset splits when drawing samples + from a single distribution. Not to be used with 'blend_per_split'. Defaults to None. + """ + + split_matrix: Optional[List[Tuple[float, float]]] = field(init=False, default=None) + """The split matrix consisting of non-overlapping book-ends of each split in order. For more + information, refer to 'convert_split_vector_to_split_matrix'. Created automatically from + 'split'. Not to be passed in to the constructor. + """ + + num_dataset_builder_threads: int = 1 + """The number of threads to use for dataset building.""" + + path_to_cache: Optional[str] = None + """Where all re-useable dataset indices are to be cached.""" + + mmap_bin_files: bool = True + """Whether to mmap the .bin files or use file pointers.""" + + mock: bool = field(init=False, default=False) + """Whether to bypass real data loading and validation in favor of mock data generation. + Created automatically from 'blend' and 'blend_per_split'. Not to be passed in to the + constructor. + """ + + tokenizer: Optional[MegatronTokenizer] = None + """The MegatronTokenizer instance or None. Required for datasets which do online tokenization.""" + + def __post_init__(self) -> None: + """Do asserts and set fields post init + """ + if self.blend_per_split is not None and any(self.blend_per_split): + assert self.blend is None, "blend and blend_per_split are incompatible" + assert self.split is None, "split and blend_per_split are incompatible" + assert len(self.blend_per_split) == len( + Split + ), f"blend_per_split must contain {len(Split)} blends" + for split in Split: + if self.blend_per_split[split.value] is None: + log_single_rank( + logger, logging.INFO, f"blend not provided for {split.name} split" + ) + else: + assert self.blend_per_split[split.value][1] is None or len( + self.blend_per_split[split.value][0] + ) == len( + self.blend_per_split[split.value][1] + ), "blend per split prefixes and weights must be equal in number" + else: + if self.blend is not None: + assert self.blend[1] is None or len(self.blend[0]) == len( + self.blend[1] + ), "blend prefixes and weights must be equal in number" + assert self.split is not None, "split must be provided when blend is not None" + else: + self.mock = True + log_single_rank( + logger, + logging.INFO, + f"Let mock = True, as both blend and blend_per_split are None", + ) + self.split = "1,1,1" + log_single_rank( + logger, + logging.INFO, + f"Let split = {self.split}, an arbitrarily even split, as mock is True", + ) + split_vector = parse_and_normalize_split(self.split) + self.split_matrix = convert_split_vector_to_split_matrix(split_vector) + log_single_rank(logger, logging.INFO, f"Let split_matrix = {self.split_matrix}") + + +def parse_and_normalize_split(split: str) -> List[float]: + """Parse the dataset split ratios from a string + + Args: + split (str): The train valid test split string e.g. "99,1,0" + + Returns: + List[float]: The trian valid test split ratios e.g. [0.99, 0.01, 0.0] + """ + split = list(map(float, re.findall(r"[.0-9]+", split))) + split = split + [0.0 for _ in range(len(Split) - len(split))] + + assert len(split) == len(Split) + assert all(map(lambda _: _ >= 0.0, split)) + + split = normalize(split) + + return split + + +def convert_split_vector_to_split_matrix( + vector_a: List[float], vector_b: Optional[List[float]] = None +) -> List[Optional[Tuple[float, float]]]: + """Build the split matrix from one or optionally two contributing split vectors. + + Ex. a standard conversion: + + [0.99, 0.01, 0.0] -> [(0, 0.99), (0.99, 1.0), None] + + Ex. a conversion for Retro when Retro pretraining uses a [0.99, 0.01, 0.0] split and Retro + preprocessing used a [0.98, 0.02, 0.0] split: + + [0.99, 0.01, 0.0], [0.98, 0.02, 0.0] -> [(0, 0.98), (0.99, 1.0), None] + + Args: + vector_a (List[float]): The primary split vector + + vector_b (Optional[List[float]]): An optional secondary split vector which constrains the primary split vector. Defaults to None. + + Returns: + List[Tuple[float, float]]: The split matrix consisting of book-ends of each split in order + """ + if vector_b is None: + vector_b = vector_a + + # [.900, .090, .010] -> [0.00, .900, .990, 100] + expansion_a = functools.reduce(lambda a, b: a + [a[len(a) - 1] + b], [[0], *vector_a]) + expansion_b = functools.reduce(lambda a, b: a + [a[len(a) - 1] + b], [[0], *vector_b]) + + # [0.00, .900, .990, 100.0] -> [(0.00, .900), (.900, .990), (.990, 100)] + bookends_a = list(zip(expansion_a[:-1], expansion_a[1:])) + bookends_b = list(zip(expansion_b[:-1], expansion_b[1:])) + + # gather per-split overlap or None + matrix = [] + for bookend_a, bookend_b in zip(bookends_a, bookends_b): + if min(bookend_a[1], bookend_b[1]) <= max(bookend_a[0], bookend_b[0]): + overlap = None + else: + overlap = (max(bookend_a[0], bookend_b[0]), min(bookend_a[1], bookend_b[1])) + matrix.append(overlap) + + return matrix diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py new file mode 100644 index 0000000..6bcb013 --- /dev/null +++ b/megatron/core/datasets/gpt_dataset.py @@ -0,0 +1,780 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import logging +import os +import time +from dataclasses import dataclass +from typing import Dict, Optional, Tuple + +import numpy +import torch + +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.megatron_dataset import MegatronDataset +from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer +from megatron.core.datasets.utils import Split +from megatron.core.datasets.utils_s3 import S3Config, is_s3_path +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + +_PAD_TOKEN_ID = -1 + + +@dataclass +class GPTDatasetConfig(BlendedMegatronDatasetConfig): + """Configuration object for Megatron Core GPT datasets""" + + reset_position_ids: bool = None + """Option to reset the position IDs in the dataset at an interval""" + + reset_attention_mask: bool = None + """Option to reset the attention mask from the dataset""" + + eod_mask_loss: bool = None + """Option to enable the EOD mask loss""" + + create_attention_mask: bool = True + """Option to enable the attention masks generation. Can be disabled if attention kernel + generates masks by itself. + """ + + drop_last_partial_validation_sequence: bool = True + """Option to drop the last partial validation sequence""" + + add_extra_token_to_sequence: bool = True + """Option to draw sequences with one extra token to ensure the sample input tokens and sample + output tokens are both of the desired sequence length + """ + + s3_cache_path: str = None + """Path for caching indices for s3 dataloading.""" + + def __post_init__(self) -> None: + """Do asserts and set fields post init""" + super().__post_init__() + + assert self.tokenizer is not None + + assert self.reset_position_ids is not None + assert self.reset_attention_mask is not None + assert self.eod_mask_loss is not None + + +class GPTDataset(MegatronDataset): + """The base GPT dataset + + Args: + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the GPTDataset + + dataset_path (Optional[str]): The real path on disk to the dataset, for bookkeeping + + indexed_indices (numpy.ndarray): The set of the documents indices to expose + + num_samples (Optional[int]): The number of samples to draw from the indexed dataset. When None, build as many samples as correspond to one epoch. + + index_split (Split): The indexed_indices Split + + config (GPTDatasetConfig): The config + """ + + def __init__( + self, + indexed_dataset: IndexedDataset, + dataset_path: Optional[str], + indexed_indices: numpy.ndarray, + num_samples: Optional[int], + index_split: Split, + config: GPTDatasetConfig, + ) -> None: + super().__init__( + indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config + ) + self.masks_and_position_ids_are_cacheable = not any( + [ + self.config.reset_position_ids, + self.config.reset_attention_mask, + self.config.eod_mask_loss, + ] + ) + self.masks_and_position_ids_are_cached = False + self.cached_attention_mask = None + self.cached_loss_mask = None + self.cached_position_ids = None + + try: + self._pad_token_id = self.config.tokenizer.pad + except: + self._pad_token_id = _PAD_TOKEN_ID + + ( + self.document_index, + self.sample_index, + self.shuffle_index, + ) = self._build_document_sample_shuffle_indices() + + @staticmethod + def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int: + """Abstract method implementation + + For GPT, the underlying IndexedDataset should be split by sequence, as opposed to, say, + BERT, which should be split by document + + Args: + low_level_dataset (IndexedDataset): The underlying IndexedDataset + + Returns: + int: The number of unique elements in the underlying IndexedDataset + """ + return low_level_dataset.sequence_lengths.shape[0] + + @staticmethod + def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfig) -> IndexedDataset: + """Abstract method implementation + + Args: + dataset_path (str): The real path prefix to the IndexedDataset .bin and .idx files + + config (GPTDatasetConfig): The config + + Returns: + IndexedDataset: The underlying IndexedDataset + """ + if is_s3_path(dataset_path): + return IndexedDataset( + dataset_path, + multimodal=False, + mmap=config.mmap_bin_files, + s3_config=S3Config(path_to_idx_cache=config.s3_cache_path), + ) + return IndexedDataset(dataset_path, multimodal=False, mmap=config.mmap_bin_files) + + def __len__(self) -> int: + """Abstract method implementation + + Returns: + int: The length of the dataset + """ + return self.sample_index.shape[0] - 1 + + def __getitem__(self, idx: Optional[int]) -> Dict[str, torch.Tensor]: + """Abstract method implementation + + Args: + idx (Optioal[int]): The index into the dataset + + Returns: + Dict[str, torch.Tensor]: The sample information wrapped in a dictionary + """ + if idx is None: + # Batch padding sequence so the index does not matter + text, _ = self._query_document_sample_shuffle_indices(0) + else: + text, _ = self._query_document_sample_shuffle_indices(idx) + + text = torch.from_numpy(text).long() + if self.config.add_extra_token_to_sequence: + tokens = text[:-1].contiguous() + labels = text[1:].contiguous() + else: + tokens = text + labels = torch.roll(text, shifts=-1, dims=0) + labels[-1] = self._pad_token_id + + if ( + not self.masks_and_position_ids_are_cacheable + or not self.masks_and_position_ids_are_cached + ): + attention_mask, loss_mask, position_ids = _get_ltor_masks_and_position_ids( + tokens, + self.config.tokenizer.eod, + self.config.reset_position_ids, + self.config.reset_attention_mask, + self.config.eod_mask_loss, + self.config.create_attention_mask, + ) + if self.masks_and_position_ids_are_cacheable: + self.cached_attention_mask = attention_mask + self.cached_loss_mask = loss_mask + self.cached_position_ids = position_ids + self.masks_and_position_ids_are_cached = True + else: + attention_mask = self.cached_attention_mask + loss_mask = self.cached_loss_mask + position_ids = self.cached_position_ids + + # For padded sequences, mask the loss + loss_mask[labels == self._pad_token_id] = 0.0 + + # For padded sequences, ensure the embedding layer can map the token ID + tokens[tokens == self._pad_token_id] = 0 + labels[labels == self._pad_token_id] = 0 + + # Batch padding sequence so we mask the loss + if idx is None: + loss_mask = torch.zeros_like(loss_mask) + + if self.config.create_attention_mask: + return { + "tokens": tokens, + "labels": labels, + "attention_mask": attention_mask, + "loss_mask": loss_mask, + "position_ids": position_ids, + } + else: + return { + "tokens": tokens, + "labels": labels, + "loss_mask": loss_mask, + "position_ids": position_ids, + } + + def _query_document_sample_shuffle_indices( + self, idx: int + ) -> Tuple[numpy.ndarray, numpy.ndarray]: + """Get the text (token ids) and document ids for a given index + + Args: + idx (int): The index into the dataset + + Returns: + Tuple[numpy.ndarray, numpy.ndarray]: The text ids and document ids + """ + # Do the shuffle mapping + idx = self.shuffle_index[idx] + + # Get the beginning and end documents and offsets + doc_index_beg, doc_index_beg_offset = self.sample_index[idx] + doc_index_end, doc_index_end_offset = self.sample_index[idx + 1] + + document_ids = [] + sample_parts = [] + + # Sample spans a single document + if doc_index_beg == doc_index_end: + # Add the document id + document_ids.append(self.document_index[doc_index_beg]) + + # Add the entire sample + sample_parts.append( + self.dataset.get( + self.document_index[doc_index_beg], + offset=doc_index_beg_offset, + length=doc_index_end_offset + - doc_index_beg_offset + + self.config.add_extra_token_to_sequence, + ) + ) + + # Sample spans multiple documents + else: + for i in range(doc_index_beg, doc_index_end + 1): + # Add the document id + document_ids.append(self.document_index[i]) + + # Add the sample part + offset = 0 if i > doc_index_beg else doc_index_beg_offset + length = ( + None + if i < doc_index_end + else doc_index_end_offset + self.config.add_extra_token_to_sequence + ) + sample_parts.append( + self.dataset.get(self.document_index[i], offset=offset, length=length) + ) + assert len(document_ids) == len( + sample_parts + ), f"len(document_ids) ({len(document_ids)}) != len(sample_parts) ({len(sample_parts)})" + + length = sum(map(len, sample_parts)) + + # Pad the sample if necessary + if length < (self.config.sequence_length + self.config.add_extra_token_to_sequence): + sample_parts.append( + [self._pad_token_id] + * (self.config.sequence_length + self.config.add_extra_token_to_sequence - length) + ) + + return ( + numpy.concatenate(sample_parts, dtype=numpy.int64), + numpy.array(document_ids, dtype=numpy.int64), + ) + + def _build_document_sample_shuffle_indices( + self, + ) -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]: + """Build the document index, the sample index, and the shuffle index + + The document index: + -- 1-D + -- An ordered array of document ids + + The sample index: + -- 2-D + -- The document indices and offsets which mark the start of every sample + + The shuffle index: + -- 1-D + -- A random permutation of index range of the sample index + + Returns: + Tuple[numpy.ndarray, numpy.ndarray]: The document index, the sample index, and the shuffle index + """ + path_to_cache = self.config.path_to_cache + if path_to_cache is None and not self.config.mock: + path_to_cache = os.path.join( + self.dataset.path_prefix, "cache", f"{type(self).__name__}_indices" + ) + + if path_to_cache: + get_path_to = lambda suffix: os.path.join( + path_to_cache, + f"{self.unique_description_hash}-{type(self).__name__}-{self.index_split.name}-{suffix}", + ) + path_to_description = get_path_to("description.txt") + path_to_document_index = get_path_to("document_index.npy") + path_to_sample_index = get_path_to("sample_index.npy") + path_to_shuffle_index = get_path_to("shuffle_index.npy") + cache_hit = all( + map( + os.path.isfile, + [ + path_to_description, + path_to_document_index, + path_to_sample_index, + path_to_shuffle_index, + ], + ) + ) + else: + cache_hit = False + + if not path_to_cache or ( + not cache_hit + and (not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0) + ): + + log_single_rank( + logger, + logging.INFO, + f"Build and save the {type(self).__name__} {self.index_split.name} indices", + ) + self.built_anew_on_cache_miss = True + t_beg = time.time() + + sequence_length = self.config.sequence_length + num_tokens_per_epoch = self._get_num_tokens_per_epoch() + num_epochs = self._get_num_epochs(num_tokens_per_epoch) + + if num_epochs == 1: + separate_final_epoch = False + else: + # Get the number of samples for the last epoch + num_samples_sans_final_epoch = ( + (num_epochs - 1) * num_tokens_per_epoch + - self.config.add_extra_token_to_sequence + ) // sequence_length + num_samples_from_final_epoch = self.num_samples - num_samples_sans_final_epoch + num_samples_per_epoch = ( + num_tokens_per_epoch - self.config.add_extra_token_to_sequence + ) // sequence_length + + # num_samples_from_final_epoch should be non-negative + assert num_samples_from_final_epoch >= 0 + + # num_samples_from_final_epoch should not exceed max value + assert num_samples_from_final_epoch <= num_samples_per_epoch + 1 + + # Separate the final epoch if it falls below the threshold + threshold = 0.80 + separate_final_epoch = num_samples_from_final_epoch < int( + threshold * num_samples_per_epoch + ) + + log_single_rank( + logger, + logging.DEBUG, + f"> num_samples_from_final_epoch: {num_samples_from_final_epoch}", + ) + log_single_rank(logger, logging.DEBUG, f"> threshold: {threshold}") + log_single_rank( + logger, logging.DEBUG, f"> num_samples_per_epoch: {num_samples_per_epoch}" + ) + + log_single_rank( + logger, logging.DEBUG, f"> separate_final_epoch: {separate_final_epoch}" + ) + + numpy_random_state = numpy.random.RandomState(self.config.random_seed) + + # Build the document index + document_index = _build_document_index( + self.indices, num_epochs, numpy_random_state, separate_final_epoch + ) + + drop_last_partial_sequence = True + if self.index_split == Split.valid: + drop_last_partial_sequence = self.config.drop_last_partial_validation_sequence + + # Build the sample index + from megatron.core.datasets import helpers + + if self.index_split == Split.valid: + drop_last_partial_sequence = self.config.drop_last_partial_validation_sequence + else: + drop_last_partial_sequence = True + + assert document_index.dtype == numpy.int32 + assert self.dataset.sequence_lengths.dtype == numpy.int32 + if len(document_index) * 2 > len(self.dataset.sequence_lengths): + # Heuristic: if "access density" of sequence_lengths is relatively high, + # force loading the mmap-ed array into memory by taking a copy. + # System performance benefits come from two aspects: + # 1. **sequentially** pre-loading the whole file if we're gonna read a large fraction anyways. + # 2. GIL is held when calling into c++ code; making the c++ func faster improves parallelism. + sequence_lengths_for_cpp = self.dataset.sequence_lengths.copy() + else: + sequence_lengths_for_cpp = self.dataset.sequence_lengths + sample_index = helpers.build_sample_idx( + sequence_lengths_for_cpp, + document_index, + sequence_length, + num_epochs, + num_tokens_per_epoch, + drop_last_partial_sequence, + self.config.add_extra_token_to_sequence, + ) + + # Build the shuffle index + if separate_final_epoch: + shuffle_index = _build_shuffle_index( + num_samples_sans_final_epoch, sample_index.shape[0] - 1, numpy_random_state + ) + else: + shuffle_index = _build_shuffle_index( + sample_index.shape[0] - 1, sample_index.shape[0] - 1, numpy_random_state + ) + + if path_to_cache: + os.makedirs(path_to_cache, exist_ok=True) + # Write the description + with open(path_to_description, "wt") as writer: + writer.write(self.unique_description) + numpy.save(path_to_document_index, document_index, allow_pickle=True) + numpy.save(path_to_sample_index, sample_index, allow_pickle=True) + numpy.save(path_to_shuffle_index, shuffle_index, allow_pickle=True) + else: + log_single_rank( + logger, + logging.WARNING, + f"Unable to save the {type(self).__name__} indexes because path_to_cache is None", + ) + + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank( + logger, logging.INFO, f"> total number of samples: {sample_index.shape[0] - 1}" + ) + log_single_rank(logger, logging.INFO, f"> total number of epochs: {num_epochs}") + + return document_index, sample_index, shuffle_index + + log_single_rank( + logger, logging.INFO, f"Load the {type(self).__name__} {self.index_split.name} indices" + ) + + log_single_rank( + logger, + logging.INFO, + f"\tLoad the document index from {os.path.basename(path_to_document_index)}", + ) + t_beg = time.time() + document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode='r') + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank( + logger, + logging.INFO, + f"\tLoad the sample index from {os.path.basename(path_to_sample_index)}", + ) + t_beg = time.time() + sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode='r') + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank( + logger, + logging.INFO, + f"\tLoad the shuffle index from {os.path.basename(path_to_shuffle_index)}", + ) + t_beg = time.time() + shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode='r') + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank( + logger, logging.INFO, f"> total number of samples: {sample_index.shape[0] - 1}" + ) + + return document_index, sample_index, shuffle_index + + def _get_num_tokens_per_epoch(self) -> int: + """Calculate the number of tokens in a single epoch + + Returns: + int: The number of tokens in a single epoch + """ + return int(numpy.sum(self.dataset.sequence_lengths[self.indices])) + + def _get_num_epochs(self, num_tokens_per_epoch: int) -> int: + """Calculate the number of epochs + + Args: + num_tokens_per_epoch (int): The number of tokens in a single epoch + + Returns: + int: The number of epochs + """ + num_epochs = 1 + num_tokens = num_tokens_per_epoch + if self.num_samples is None: + return num_epochs + else: + num_tokens_requested = ( + self.num_samples * self.config.sequence_length + ) + self.config.add_extra_token_to_sequence + while num_tokens < num_tokens_requested: + num_epochs += 1 + num_tokens += num_tokens_per_epoch + return num_epochs + + +def _build_document_index( + documents: numpy.ndarray, + num_epochs: int, + numpy_random_state: numpy.random.RandomState, + separate_final_epoch: bool, +) -> numpy.ndarray: + """Build an array with length = num epochs * num documents + + Args: + documents (numpy.ndarray): the subset of exposed document indices + + num_epochs (int): The number of epochs + + numpy_random_state (numpy.random.RandomState): The NumPy random state + + separate_final_epoch (bool): Whether to exclude the last epoch from the global shuffle + + Returns: + numpy.ndarray: The document index + """ + if not separate_final_epoch or num_epochs == 1: + document_index = numpy.mgrid[0:num_epochs, 0 : len(documents)][1] + document_index[:] = documents + document_index = document_index.reshape(-1) + document_index = document_index.astype(numpy.int32) + numpy_random_state.shuffle(document_index) + return document_index + + doc_idx_first = _build_document_index(documents, num_epochs - 1, numpy_random_state, False) + doc_idx_last = _build_document_index(documents, 1, numpy_random_state, False) + return numpy.concatenate((doc_idx_first, doc_idx_last)) + + +def _build_shuffle_index( + num_samples: int, total_size: int, numpy_random_state: numpy.random.RandomState +) -> numpy.ndarray: + """Build the range [0, size) and shuffle + + Args: + num_samples (int): The size of the first shuffle range [0, num_samples) + + total_size (int): The size of the entire index. If larger than 'num_samples', it defines the second shuffle range [num_samples, total_size) + + numpy_random_state (numpy.random.RandomState): The NumPy random state + + Returns: + numpy.ndarray: The shuffle index + """ + dtype_ = numpy.uint32 + if total_size >= (numpy.iinfo(numpy.uint32).max - 1): + dtype_ = numpy.int64 + + shuffle_idx_first = numpy.arange(start=0, stop=num_samples, step=1, dtype=dtype_) + numpy_random_state.shuffle(shuffle_idx_first) + if num_samples == total_size: + return shuffle_idx_first + + shuffle_idx_last = numpy.arange(start=num_samples, stop=total_size, step=1, dtype=dtype_) + numpy_random_state.shuffle(shuffle_idx_last) + + return numpy.concatenate((shuffle_idx_first, shuffle_idx_last)) + + +def _get_ltor_masks_and_position_ids( + data: torch.Tensor, + eod_token: int, + reset_position_ids: bool, + reset_attention_mask: bool, + eod_mask_loss: bool, + create_attention_mask: bool, +): + """Build masks and position id for left to right model. + + Args: + data (torch.Tensor): The data tenor that holds the tokens from the dataset + + eod_token (int): ID of the token to that is considered the EOD + + reset_position_ids (bool): Switch to reset the document position ID's + + reset_attention_mask (bool): Switch to reset the attention mask + + eod_mask_loss (bool): Switch to enable the EOD mask loss + + create_attention_mask (bool): Switch to enable the attention masks generation. Can be disabled if attention kernel generates masks by itself. + + Returns: + torch.Tensor: Attention mask needed to be used for Attention + + torch.Tensor: The mask used for loss value during training + + torch.Tensor: The position ID's of the token + """ + seq_length = data.numel() + + if create_attention_mask: + attention_mask = torch.tril( + torch.ones((seq_length, seq_length), device=data.device) + ).unsqueeze(0) + else: + attention_mask = None + + # Loss mask. + loss_mask = torch.ones(seq_length, dtype=torch.float, device=data.device) + if eod_mask_loss: + loss_mask[data == eod_token] = 0.0 + + # Position ids. + position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device) + # We need to clone as the ids will be modifed based on batch index. + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Find indices where EOD token is. + eod_index = position_ids[data == eod_token] + # Detach indices from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indices: + prev_index = 0 + for j in range(eod_index.numel()): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask and attention_mask is not None: + attention_mask[0, (i + 1) :, : (i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[(i + 1) :] -= i + 1 - prev_index + prev_index = i + 1 + + if attention_mask is not None: + # Convert attention mask to binary: + attention_mask = attention_mask < 0.5 + + return attention_mask, loss_mask, position_ids + + +class MockGPTLowLevelDataset: + + seed: int = 0 + size: int = 100000 + max_sequence_length: int = 4096 + + def __init__(self, tokenizer: MegatronTokenizer) -> None: + self.tokenizer = tokenizer + rng = numpy.random.default_rng(seed=self.seed) + self.sequence_lengths = rng.integers( + low=1, high=self.max_sequence_length, size=self.size, dtype=numpy.int32 + ) + + def __len__(self) -> int: + return self.size + + def __getitem__(self, idx: int) -> numpy.number: + length = self.sequence_lengths[idx] + sample = numpy.int64( + numpy.concatenate([numpy.arange(length - 1) + 1, [self.tokenizer.eod]]) + ) + return sample + + def get(self, idx: int, offset: int = 0, length: Optional[int] = None) -> numpy.ndarray: + if length is None: + length = self.sequence_lengths[idx] - offset + return self[idx][offset : offset + length] + + +class MockGPTDataset(GPTDataset): + """The mock GPT dataset + + Args: + indexed_dataset (MockGPTLowLevelDataset): The MockGPTLowLevelDataset around which to build the MockGPTDataset + + dataset_path (Optional[str]): This argument is of no consequence for the MockGPTDataset + + indices (numpy.ndarray): The set of the dataset indices to expose + + num_samples (int): The number of samples to draw from the dataset + + index_split (Split): The indices Split + + config (GPTDatasetConfig): The config + """ + + def __init__( + self, + dataset: MockGPTLowLevelDataset, + dataset_path: Optional[str], + indices: numpy.ndarray, + num_samples: int, + index_split: Split, + config: GPTDatasetConfig, + ) -> None: + assert config.mock + + super().__init__(dataset, dataset_path, indices, num_samples, index_split, config) + + @staticmethod + def numel_low_level_dataset(low_level_dataset: MockGPTLowLevelDataset) -> int: + """Abstract method implementation + + Args: + low_level_dataset (MockGPTLowLevelDataset): The underlying MockGPTLowLevelDataset + + Returns: + int: The number of unique elements in the underlying MockGPTLowLevelDataset + """ + return len(low_level_dataset) + + @staticmethod + def build_low_level_dataset( + dataset_path: Optional[str], config: GPTDatasetConfig + ) -> MockGPTLowLevelDataset: + """Abstract method implementation + + Args: + dataset_path (Optional[str]): This argument is of no consequence for the MockGPTLowLevelDataset + + config (GPTDatasetConfig): The config + + Returns: + MockGPTLowLevelDataset: The underlying MockGPTLowLevelDataset + """ + return MockGPTLowLevelDataset(config.tokenizer) diff --git a/megatron/core/datasets/helpers.cpp b/megatron/core/datasets/helpers.cpp new file mode 100644 index 0000000..0b05f09 --- /dev/null +++ b/megatron/core/datasets/helpers.cpp @@ -0,0 +1,839 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ + +/* Helper methods for fast index mapping builds */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +using namespace std; + +const int32_t LONG_SENTENCE_LEN = 512; + + +void build_exhaustive_blending_indices(py::array_t &dataset_index, py::array_t &dataset_sample_index, const py::array_t &sizes, const int32_t num_datasets) { + /* + Build blending indices by sampling exactly as many samples from dataset[i] + as is requested by sizes[i] for all i in the range [0, num_datasets). + */ + auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); + auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); + auto sizes_ptr = sizes.unchecked<1>(); + + int64_t total_size = 0; + int64_t dataset_sample_counts[num_datasets]; + std::set dataset_unspent_indices; + for (int32_t i = 0; i < num_datasets; ++i) { + total_size += sizes_ptr[i]; + dataset_sample_counts[i] = 0; + dataset_unspent_indices.insert(i); + } + + // still need fractional weights to sample in proportion to sizes + double weights[num_datasets]; + for (int32_t i = 0; i < num_datasets; ++i) { + weights[i] = sizes_ptr[i] / static_cast(total_size); + } + + int64_t index_sample = 0; + while (dataset_unspent_indices.size() > 0) { + double index_sample_double = std::max(static_cast(index_sample), 1.0); + + int64_t error_argmax; + double error_max = std::numeric_limits::lowest(); + + for (int32_t index_dataset : dataset_unspent_indices) { + double error = weights[index_dataset] * index_sample_double - static_cast(dataset_sample_counts[index_dataset]); + if (error > error_max) { + error_argmax = index_dataset; + error_max = error; + } + } + + // Populate the indices. + dataset_index_ptr[index_sample] = static_cast(error_argmax); + dataset_sample_index_ptr[index_sample] = dataset_sample_counts[error_argmax]; + + // Update the total samples. + dataset_sample_counts[error_argmax] += 1; + + if (sizes_ptr[error_argmax] - static_cast(dataset_sample_counts[error_argmax]) == 0) { + dataset_unspent_indices.erase(error_argmax); + } + + index_sample += 1; + } +} + +void build_blending_indices(py::array_t &dataset_index, + py::array_t &dataset_sample_index, + const py::array_t &weights, + const int32_t num_datasets, + const int64_t size, const bool verbose) +{ + /* Given multiple datasets and a weighting array, build samples + such that it follows those wieghts.*/ + + if (verbose) + { + std::cout << "> building indices for blended datasets ..." << std::endl; + } + + // Get the pointer access without the checks. + auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); + auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); + auto weights_ptr = weights.unchecked<1>(); + + // Initialize buffer for number of samples used for each dataset. + int64_t current_samples[num_datasets]; + for (int64_t i = 0; i < num_datasets; ++i) + { + current_samples[i] = 0; + } + + // For each sample: + for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx) + { + + // Determine where the max error in sampling is happening. + auto sample_idx_double = std::max(static_cast(sample_idx), 1.0); + int64_t max_error_index = 0; + double max_error = weights_ptr[0] * sample_idx_double - + static_cast(current_samples[0]); + for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) + { + double error = weights_ptr[dataset_idx] * sample_idx_double - + static_cast(current_samples[dataset_idx]); + if (error > max_error) + { + max_error = error; + max_error_index = dataset_idx; + } + } + + // Populate the indices. + dataset_index_ptr[sample_idx] = static_cast(max_error_index); + dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; + + // Update the total samples. + current_samples[max_error_index] += 1; + } + + // print info + if (verbose) + { + std::cout << " > sample ratios:" << std::endl; + for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) + { + auto ratio = static_cast(current_samples[dataset_idx]) / + static_cast(size); + std::cout << " dataset " << dataset_idx << ", input: " << weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl; + } + } +} + +py::array build_sample_idx(const py::array_t &sizes_, + const py::array_t &doc_idx_, + const int32_t seq_length, + const int32_t num_epochs, + const int64_t tokens_per_epoch, + const bool drop_last_partial_sequence = true, + const int add_extra_token_to_sequence = 1) +{ + /* Sample index (sample_idx) is used for gpt2 like dataset for which + the documents are flattened and the samples are built based on this + 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] + where [..., 0] contains the index into `doc_idx` and [..., 1] is the + starting offset in that document.*/ + + // Consistency checks. + assert(seq_length > 1); + assert(num_epochs > 0); + assert(tokens_per_epoch > 1); + + // Remove bound checks. + auto sizes = sizes_.unchecked<1>(); + auto doc_idx = doc_idx_.unchecked<1>(); + + // Mapping and it's length (1D). + int64_t num_samples = 0; + if (drop_last_partial_sequence == true) + { + num_samples = (num_epochs * tokens_per_epoch - add_extra_token_to_sequence) / seq_length; + } + else + { + num_samples = ceil(float(num_epochs * tokens_per_epoch - add_extra_token_to_sequence) / seq_length); + } + int64_t *sample_idx = new int64_t[2 * (num_samples + 1)]; + + // Index into sample_idx. + int64_t sample_index = 0; + // Index into doc_idx. + int64_t doc_idx_index = 0; + // Begining offset for each document. + int32_t doc_offset = 0; + // Start with first document and no offset. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + + while (sample_index <= num_samples) + { + // Start with a fresh sequence. + int32_t remaining_seq_length = seq_length + add_extra_token_to_sequence; + while (remaining_seq_length != 0) + { + // Get the document length. + auto doc_id = doc_idx[doc_idx_index]; + auto doc_length = sizes[doc_id] - doc_offset; + // And add it to the current sequence. + remaining_seq_length -= doc_length; + // If we have more than a full sequence, adjust offset and set + // remaining length to zero so we return from the while loop. + // Note that -1 here is for the same reason we have -1 in + // `_num_epochs` calculations. + if (remaining_seq_length <= 0) + { + doc_offset += (remaining_seq_length + doc_length - add_extra_token_to_sequence); + remaining_seq_length = 0; + } + else + { + // Otherwise, start from the begining of the next document. + if (doc_idx_index == (doc_idx_.shape(0) - 1)) + { + // If we have reached the end of the documents, break. + assert(sample_index == num_samples); + doc_offset = sizes[doc_idx[doc_idx_index]] - add_extra_token_to_sequence; + break; + } + ++doc_idx_index; + doc_offset = 0; + } + } + // Record the sequence. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + } + + // Method to deallocate memory. + py::capsule free_when_done(sample_idx, [](void *mem_) + { + int64_t *mem = reinterpret_cast(mem_); + delete[] mem; }); + + // Return the numpy array. + const auto byte_size = sizeof(int64_t); + return py::array(std::vector{num_samples + 1, 2}, // shape + {2 * byte_size, byte_size}, // C-style contiguous strides + sample_idx, // the data pointer + free_when_done); // numpy array references +} + +inline int32_t get_target_sample_len(const int32_t short_seq_ratio, + const int32_t max_length, + std::mt19937 &rand32_gen) +{ + /* Training sample length. */ + if (short_seq_ratio == 0) + { + return max_length; + } + const auto random_number = rand32_gen(); + if ((random_number % short_seq_ratio) == 0) + { + return 2 + random_number % (max_length - 1); + } + return max_length; +} + +template +py::array build_mapping_impl(const py::array_t &docs_, + const py::array_t &sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const double short_seq_prob, + const int32_t seed, + const bool verbose, + const int32_t min_num_sent) +{ + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(short_seq_prob >= 0.0); + assert(short_seq_prob <= 1.0); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + + // For efficiency, convert probability to ratio. Note: rand() generates int. + int32_t short_seq_ratio = 0; + if (short_seq_prob > 0) + { + short_seq_ratio = static_cast(round(1.0 / short_seq_prob)); + } + + if (verbose) + { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl + << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << endl + << std::flush; + cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl + << std::flush; + cout << " total number of sentences: " << num_sentences << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl + << std::flush; + cout << " maximum number of samples: " << max_num_samples << endl + << std::flush; + cout << " maximum sequence length: " << max_seq_length << endl + << std::flush; + cout << " short sequence probability: " << short_seq_prob << endl + << std::flush; + cout << " short sequence ration (1/prob): " << short_seq_ratio << endl + << std::flush; + cout << " seed: " << seed << endl + << std::flush; + } + + // Mapping and it's length (1D). + int64_t num_samples = -1; + DocIdx *maps = NULL; + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. + bool second = false; + for (int32_t iteration = 0; iteration < 2; ++iteration) + { + + // Set the seed so both iterations produce the same results. + std::mt19937 rand32_gen(seed); + + // Set the flag on second iteration. + second = (iteration == 1); + + // Counters: + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + + // Current map index. + uint64_t map_index = 0; + + // For each epoch: + for (int32_t epoch = 0; epoch < num_epochs; ++epoch) + { + if (map_index >= max_num_samples) + { + if (verbose && (!second)) + { + cout << " reached " << max_num_samples << " samples after " + << epoch << " epochs ..." << endl + << std::flush; + } + break; + } + // For each document: + for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) + { + + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + + // At the begining of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) + { + if (num_remain_sent == 0) + { + ++empty_docs; + } + if (num_remain_sent == 1) + { + ++one_sent_docs; + } + } + + // Detect documents with long sentences. + bool contains_long_sentence = false; + if (num_remain_sent > 1) + { + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + if (sizes[sent_index] > LONG_SENTENCE_LEN) + { + if ((epoch == 0) && (!second)) + { + ++long_sent_docs; + } + contains_long_sentence = true; + break; + } + } + } + + // If we have more than two sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) + { + + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + auto target_seq_len = get_target_sample_len(short_seq_ratio, + max_seq_length, + rand32_gen); + + // Loop through sentences. + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + + // Add the size and number of sentences. + seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and if not only one sentence is left in the document. + // and if we have at least two sentneces. + // and if we have reached end of the document. + if (((seq_len >= target_seq_len) && + (num_remain_sent > 1) && + (num_sent >= min_num_sent)) || + (num_remain_sent == 0)) + { + + // Check for overflow. + if ((3 * map_index + 2) > + std::numeric_limits::max()) + { + cout << "number of samples exceeded maximum " + << "allowed by type int64: " + << std::numeric_limits::max() + << endl; + throw std::overflow_error("Number of samples"); + } + + // Populate the map. + if (second) + { + const auto map_index_0 = 3 * map_index; + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(target_seq_len); + } + + // Update indices / counters. + ++map_index; + prev_start_index = sent_index + 1; + target_seq_len = get_target_sample_len(short_seq_ratio, + max_seq_length, + rand32_gen); + seq_len = 0; + num_sent = 0; + } + + } // for (auto sent_index=sent_index_first; ... + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) + { + if (verbose) + { + cout << " number of empty documents: " << empty_docs << endl + << std::flush; + cout << " number of documents with one sentence: " << one_sent_docs << endl + << std::flush; + cout << " number of documents with long sentences: " << long_sent_docs << endl + << std::flush; + cout << " will create mapping for " << map_index << " samples" << endl + << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[3 * map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. + std::mt19937_64 rand64_gen(seed + 1); + for (auto i = (num_samples - 1); i > 0; --i) + { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 3 * i; + const auto j0 = 3 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void *mem_) + { + DocIdx *mem = reinterpret_cast(mem_); + delete[] mem; }); + + // Return the numpy array. + const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 3}, // shape + {3 * byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references +} + +py::array build_mapping(const py::array_t &docs_, + const py::array_t &sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const double short_seq_prob, + const int seed, + const bool verbose, + const int32_t min_num_sent) +{ + + if (sizes_.size() > std::numeric_limits::max()) + { + if (verbose) + { + cout << " using uint64 for data mapping..." << endl + << std::flush; + } + return build_mapping_impl(docs_, sizes_, num_epochs, + max_num_samples, max_seq_length, + short_seq_prob, seed, verbose, + min_num_sent); + } + else + { + if (verbose) + { + cout << " using uint32 for data mapping..." << endl + << std::flush; + } + return build_mapping_impl(docs_, sizes_, num_epochs, + max_num_samples, max_seq_length, + short_seq_prob, seed, verbose, + min_num_sent); + } +} + +template +py::array build_blocks_mapping_impl(const py::array_t &docs_, + const py::array_t &sizes_, + const py::array_t &titles_sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const int32_t seed, + const bool verbose, + const bool use_one_sent_blocks) +{ + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + auto titles_sizes = titles_sizes_.unchecked<1>(); + + if (verbose) + { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl + << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << endl + << std::flush; + cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl + << std::flush; + cout << " total number of sentences: " << num_sentences << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl + << std::flush; + cout << " maximum number of samples: " << max_num_samples << endl + << std::flush; + cout << " maximum sequence length: " << max_seq_length << endl + << std::flush; + cout << " seed: " << seed << endl + << std::flush; + } + + // Mapping and its length (1D). + int64_t num_samples = -1; + DocIdx *maps = NULL; + + // Acceptable number of sentences per block. + int min_num_sent = 2; + if (use_one_sent_blocks) + { + min_num_sent = 1; + } + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. + bool second = false; + for (int32_t iteration = 0; iteration < 2; ++iteration) + { + + // Set the flag on second iteration. + second = (iteration == 1); + + // Current map index. + uint64_t map_index = 0; + + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + // For each epoch: + for (int32_t epoch = 0; epoch < num_epochs; ++epoch) + { + // assign every block a unique id + int32_t block_id = 0; + + if (map_index >= max_num_samples) + { + if (verbose && (!second)) + { + cout << " reached " << max_num_samples << " samples after " + << epoch << " epochs ..." << endl + << std::flush; + } + break; + } + // For each document: + for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) + { + + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + const auto target_seq_len = max_seq_length - titles_sizes[doc]; + + // At the begining of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) + { + if (num_remain_sent == 0) + { + ++empty_docs; + } + if (num_remain_sent == 1) + { + ++one_sent_docs; + } + } + // Detect documents with long sentences. + bool contains_long_sentence = false; + if (num_remain_sent >= min_num_sent) + { + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + if (sizes[sent_index] > LONG_SENTENCE_LEN) + { + if ((epoch == 0) && (!second)) + { + ++long_sent_docs; + } + contains_long_sentence = true; + break; + } + } + } + // If we have enough sentences and no long sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) + { + + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + + // Loop through sentences. + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + + // Add the size and number of sentences. + seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and there are an acceptable number of sentences left + // and if we have at least the minimum number of sentences. + // or if we have reached end of the document. + if (((seq_len >= target_seq_len) && + (num_remain_sent >= min_num_sent) && + (num_sent >= min_num_sent)) || + (num_remain_sent == 0)) + { + + // Populate the map. + if (second) + { + const auto map_index_0 = 4 * map_index; + // Each sample has 4 items: the starting sentence index, ending sentence index, + // the index of the document from which the block comes (used for fetching titles) + // and the unique id of the block (used for creating block indexes) + + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(doc); + maps[map_index_0 + 3] = static_cast(block_id); + } + + // Update indices / counters. + ++map_index; + ++block_id; + prev_start_index = sent_index + 1; + seq_len = 0; + num_sent = 0; + } + } // for (auto sent_index=sent_index_first; ... + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) + { + if (verbose) + { + cout << " number of empty documents: " << empty_docs << endl + << std::flush; + cout << " number of documents with one sentence: " << one_sent_docs << endl + << std::flush; + cout << " number of documents with long sentences: " << long_sent_docs << endl + << std::flush; + cout << " will create mapping for " << map_index << " samples" << endl + << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[4 * map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. + std::mt19937_64 rand64_gen(seed + 1); + for (auto i = (num_samples - 1); i > 0; --i) + { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 4 * i; + const auto j0 = 4 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + swap(maps[i0 + 3], maps[j0 + 3]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void *mem_) + { + DocIdx *mem = reinterpret_cast(mem_); + delete[] mem; }); + + // Return the numpy array. + const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 4}, // shape + {4 * byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references +} + +py::array build_blocks_mapping(const py::array_t &docs_, + const py::array_t &sizes_, + const py::array_t &titles_sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const int seed, + const bool verbose, + const bool use_one_sent_blocks) +{ + + if (sizes_.size() > std::numeric_limits::max()) + { + if (verbose) + { + cout << " using uint64 for data mapping..." << endl + << std::flush; + } + return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, + num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); + } + else + { + if (verbose) + { + cout << " using uint32 for data mapping..." << endl + << std::flush; + } + return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, + num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); + } +} + +PYBIND11_MODULE(helpers, m) +{ + m.def("build_mapping", &build_mapping); + m.def("build_blocks_mapping", &build_blocks_mapping); + m.def("build_sample_idx", &build_sample_idx); + m.def("build_blending_indices", &build_blending_indices); + m.def("build_exhaustive_blending_indices", &build_exhaustive_blending_indices); +} diff --git a/megatron/core/datasets/indexed_dataset.py b/megatron/core/datasets/indexed_dataset.py new file mode 100644 index 0000000..ae05bcb --- /dev/null +++ b/megatron/core/datasets/indexed_dataset.py @@ -0,0 +1,864 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# Essentially re-written in entirety + +import logging +import os +import shutil +import struct +import time +from abc import ABC, abstractmethod +from enum import Enum +from functools import lru_cache +from itertools import accumulate +from types import TracebackType +from typing import List, Optional, Tuple, Type, Union + +try: + import boto3 +except ModuleNotFoundError: + pass +import numpy +import torch + +from megatron.core.datasets.utils_s3 import ( + S3Config, + is_s3_path, + maybe_download_file, + object_exists, + parse_s3_path, +) +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + +_INDEX_HEADER = b"MMIDIDX\x00\x00" + + +class DType(Enum): + """The NumPy data type Enum for writing/reading the IndexedDataset indices""" + + uint8 = 1 + int8 = 2 + int16 = 3 + int32 = 4 + int64 = 5 + float64 = 6 + float32 = 7 + uint16 = 8 + + @classmethod + def code_from_dtype(cls, value: Type[numpy.number]) -> int: + """Get the code from the dtype + + Args: + value (Type[numpy.number]): The dtype + + Returns: + int: The code + """ + return cls[value.__name__].value + + @classmethod + def dtype_from_code(cls, value: int) -> Type[numpy.number]: + """Get the dtype from the code + + Args: + value (int): The code + + Returns: + Type[numpy.number]: The dtype + """ + return getattr(numpy, cls(value).name) + + @staticmethod + def size(key: Union[int, Type[numpy.number]]) -> int: + """Get the size of the dtype/code in bytes + + Args: + key (Union[int, Type[numpy.number]]): The dtype or code + + Raises: + ValueError: If the key is neither dtype nor integer code + + Returns: + int: The size of the dtype/code in in bytes + """ + if isinstance(key, int): + return DType.dtype_from_code(key)().itemsize + elif numpy.number in key.__mro__: + return key().itemsize + else: + raise ValueError + + @staticmethod + def optimal_dtype(cardinality: Optional[int]) -> Type[numpy.number]: + """Get the dtype to use for an index of a certain cardinality + + Args: + cardinality (Optional[int]): The number of elements to be indexed + + Returns: + Type[numpy.number]: The dtype to use for the index + """ + if cardinality is not None and cardinality < 65500: + return numpy.uint16 + else: + return numpy.int32 + + +class _IndexWriter(object): + """Object class to write the index (.idx) file + + Args: + idx_path (str): The path to the index file + + dtype (Type[numpy.number]): The dtype of the index file + """ + + def __init__(self, idx_path: str, dtype: Type[numpy.number]) -> None: + self.idx_path = idx_path + self.dtype = dtype + + def __enter__(self) -> "_IndexWriter": + """Enter the context introduced by the 'with' keyword + + Returns: + _IndexWriter: The instance + """ + self.idx_writer = open(self.idx_path, "wb") + # fixed, vestigial practice + self.idx_writer.write(_INDEX_HEADER) + # fixed, vestigial practice + self.idx_writer.write(struct.pack(" Optional[bool]: + """Exit the context introduced by the 'with' keyword + + Args: + exc_type (Optional[Type[BaseException]]): Exception type + + exc_val (Optional[BaseException]): Exception value + + exc_tb (Optional[TracebackType]): Exception traceback object + + Returns: + Optional[bool]: Whether to silence the exception + """ + self.idx_writer.close() + + def write( + self, + sequence_lengths: List[int], + sequence_modes: Optional[List[int]], + document_indices: List[int], + ) -> None: + """Write the index (.idx) file + + Args: + sequence_lengths (List[int]): The length of each sequence + + sequence_modes (Optional[List[int]]): The mode of each sequences + + document_indices (List[int]): The seqyebce indices demarcating the end of each document + """ + sequence_pointers = self._sequence_pointers(sequence_lengths) + + # the number of sequences in the dataset + sequence_count = len(sequence_lengths) + self.idx_writer.write(struct.pack(" List[int]: + """Build the sequence pointers per the sequence lengths and dtype size + + Args: + sequence_lengths (List[int]): The length of each sequence + + Returns: + List[int]: The pointer to the beginning of each sequence + """ + itemsize = DType.size(self.dtype) + curr_ptr = 0 + list_ptr = [] + for length in sequence_lengths: + list_ptr.append(curr_ptr) + curr_ptr += length * itemsize + return list_ptr + + +class _IndexReader(object): + """Object class to read the index (.idx) file + + Args: + idx_path (str): The path to the index file + + multimodal (bool): Whether the dataset is multimodal + """ + + def __init__(self, idx_path: str, multimodal: bool) -> None: + + log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} from {idx_path}") + + with open(idx_path, "rb") as stream: + header = stream.read(9) + assert header == _INDEX_HEADER, f"bad header, cannot read: {idx_path}" + + version = struct.unpack(" time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank(logger, logging.INFO, f"\tExtract the sequence pointers") + t_beg = time.time() + self.sequence_pointers = numpy.frombuffer( + self.bin_buffer, + dtype=numpy.int64, + count=self.sequence_count, + offset=offset + self.sequence_lengths.nbytes, + ) + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank(logger, logging.INFO, f"\tExtract the document indices") + t_beg = time.time() + self.document_indices = numpy.frombuffer( + self.bin_buffer, + dtype=numpy.int64, + count=self.document_count, + offset=offset + self.sequence_lengths.nbytes + self.sequence_pointers.nbytes, + ) + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + self.sequence_modes = None + if multimodal: + log_single_rank(logger, logging.INFO, f"\tExtract the sequence modes") + t_beg = time.time() + self.sequence_modes = numpy.frombuffer( + self.bin_buffer, + dtype=numpy.int8, + count=self.sequence_count, + offset=offset + + self.sequence_lengths.nbytes + + self.sequence_pointers.nbytes + + self.document_indices.nbytes, + ) + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + assert self.sequence_lengths.shape[0] == len(self) + assert self.sequence_lengths.shape[0] == self.sequence_count + assert self.sequence_lengths.shape[0] == self.document_indices[-1] + + log_single_rank(logger, logging.INFO, f"> total number of sequences: {len(self)}") + log_single_rank( + logger, + logging.INFO, + f"> total number of documents: {self.document_indices.shape[0] - 1}", + ) + + def __del__(self) -> None: + """Clean up the object""" + if hasattr(self, "bin_buffer_mmap"): + self.bin_buffer_mmap._mmap.close() + del self.bin_buffer_mmap + + def __len__(self) -> int: + """Return the length of the dataset + + Returns: + int: The length of the dataset + """ + return self.sequence_count + + @lru_cache(maxsize=8) + def __getitem__(self, idx: int) -> Tuple[numpy.int32, numpy.int64, Optional[numpy.int8]]: + """Return the pointer, length, and mode at the index + + Args: + idx (int): The index into the dataset + + Returns: + Tuple[numpy.int32, numpy.int64, Optional[numpy.int8]]: The pointer, length and mode at the index + """ + return ( + self.sequence_pointers[idx], + self.sequence_lengths[idx], + self.sequence_modes[idx] if self.sequence_modes is not None else None, + ) + + +class _BinReader(ABC): + """Abstract class to read the data (.bin) file""" + + @abstractmethod + def read(self, dtype: Type[numpy.number], count: int, offset: int) -> numpy.ndarray: + """Read bytes into a numpy array. + + Args: + dtype (Type[numpy.number]): Data-type of the returned array. + + count (int): Number of items to read. + + offset (int): Start reading from this offset (in bytes). + + Returns: + numpy.ndarray: An array with `count` items and data-type `dtype` constructed from reading bytes from the data file starting at `offset`. + """ + pass + + +class _MMapBinReader(_BinReader): + """A _BinReader that memory maps the data (.bin) file + + Args: + bin_path (str): bin_path (str): The path to the data (.bin) file. + """ + + def __init__(self, bin_path: str) -> None: + self._bin_buffer_mmap = numpy.memmap(bin_path, mode="r", order="C") + self._bin_buffer = memoryview(self._bin_buffer_mmap) + + def read(self, dtype: Type[numpy.number], count: int, offset: int) -> numpy.ndarray: + """Read bytes into a numpy array. + + Args: + dtype (Type[numpy.number]): Data-type of the returned array. + + count (int): Number of items to read. + + offset (int): Start reading from this offset (in bytes). + + Returns: + numpy.ndarray: An array with `count` items and data-type `dtype` constructed from reading bytes from the data file starting at `offset`. + """ + return numpy.frombuffer( + self._bin_buffer, + dtype=dtype, + count=count, + offset=offset, + ) + + def __del__(self) -> None: + """Clean up the object.""" + if self._bin_buffer_mmap is not None: + self._bin_buffer_mmap._mmap.close() + del self._bin_buffer_mmap + + +class _FileBinReader(_BinReader): + """A _BinReader that reads from the data (.bin) file using a file pointer + + Args: + bin_path (str): bin_path (str): The path to the data (.bin) file. + """ + + def __init__(self, bin_path: str) -> None: + self._bin_path = bin_path + + def read(self, dtype: Type[numpy.number], count: int, offset: int) -> numpy.ndarray: + """Read bytes into a numpy array. + + Args: + dtype (Type[numpy.number]): Data-type of the returned array. + + count (int): Number of items to read. + + offset (int): Start reading from this offset (in bytes). + + Returns: + numpy.ndarray: An array with `count` items and data-type `dtype` constructed from reading bytes from the data file starting at `offset`. + """ + sequence = numpy.empty(count, dtype=dtype) + with open(self._bin_path, mode='rb', buffering=0) as bin_buffer_file: + bin_buffer_file.seek(offset) + bin_buffer_file.readinto(sequence) + return sequence + + +class _S3BinReader(_BinReader): + """A _BinReader that reads from the data (.bin) file from S3 + + Args: + bin_path (str): bin_path (str): The path to the data (.bin) file. + + bin_chunk_nbytes (int, optional): If not None, then maintain an in-memory cache to speed up calls to the `read` method. Furthermore, on a cache miss, download this number of bytes to refresh the cache. Otherwise (None), do not maintain an in-memory cache. A class that inherits from _BinReader may not implement caching in which case it should assert that `bin_chunk_nbytes` is None at initialization. + """ + + def __init__(self, bin_path: str, bin_chunk_nbytes: int) -> None: + assert bin_chunk_nbytes > 0 + self._client = boto3.client("s3") + self._s3_bucket, self._s3_key = parse_s3_path(bin_path) + self._cache = None + self._cache_bytes_start = None + self._cache_bytes_end = None + self._cache_nbytes = bin_chunk_nbytes + + def _extract_from_cache(self, offset: int, size: int) -> bytes: + """Extract `size` bytes starting at `offset` bytes into the cache""" + start = offset - self._cache_bytes_start + assert start >= 0 + end = start + size + assert end <= len(self._cache) + return self._cache[start:end] + + def read(self, dtype: Type[numpy.number], count: int, offset: int) -> numpy.ndarray: + """Read bytes into a numpy array. + + Let `size` be the `count` * `DType.size(dtype)`. If the requested span of bytes [`offset`, + `offset` + `size`) is covered by the in-memory cache maintained by this class, then this + function extracts the requested span from that cache and returns it. Otherwise, this + function first refreshes the cache and then extracts the requested span from the refreshed + cache and returns it. + + The cache is refreshed based on `offset` and `size`. In particular, we divide all the bytes + in an S3 object into blocks, where each block contains `bin_chunk_nbytes` bytes. We assign + each block an index starting from 0. We take the block with index (`offset` // + `bin_chunk_nbytes`) to refresh the cache. If this new block still does not cover the + requested span, we extend it just enough to include `offset` + `size`. + + Args: + dtype (Type[numpy.number]): Data-type of the returned array. + + count (int): Number of items to read. + + offset (int): Start reading from this offset (in bytes). + + Returns: + numpy.ndarray: An array with `count` items and data-type `dtype` constructed from reading bytes from the data file starting at `offset`. + """ + size = count * DType.size(dtype) + if ( + self._cache is not None + and offset >= self._cache_bytes_start + and offset + size <= self._cache_bytes_end + ): + return numpy.frombuffer(self._extract_from_cache(offset, size), dtype=dtype) + + bytes_start = (offset // self._cache_nbytes) * self._cache_nbytes + assert bytes_start >= 0 + assert offset >= bytes_start + bytes_end = max(bytes_start + self._cache_nbytes, offset + size) + assert bytes_end >= 1 + self._cache = self._client.get_object( + Bucket=self._s3_bucket, + Key=self._s3_key, + # Subtract 1, because the end of Range is inclusive. + Range=f'bytes={bytes_start}-{bytes_end-1}', + )['Body'].read() + self._cache_bytes_start = bytes_start + self._cache_bytes_end = bytes_end + return numpy.frombuffer(self._extract_from_cache(offset, size), dtype=dtype) + + def __del__(self) -> None: + """Clean up the object""" + self._client.close() + + +class IndexedDataset(torch.utils.data.Dataset): + """The low-level interface dataset class + + Args: + path_prefix (str): The index (.idx) and data (.bin) prefix + + multimodal (bool): Whether the dataset is multimodal. Defaults to False. + + mmap (bool): Whether to mmap the .bin files. Defaults to True. + + s3_config (Optional[S3Config]): Supplied only for data stored on S3. IndexedDataset downloads the index (.idx) file to `s3_config.path_to_idx_cache` and streams data from the data (.bin) file in `s3_config.bin_chunk_nbytes` blocks. Note that `mmap` must be disabled for S3 data loading. Defaults to None. + """ + + def __init__( + self, + path_prefix: str, + multimodal: bool = False, + mmap: bool = True, + s3_config: Optional[S3Config] = None, + ) -> None: + super().__init__() + self.path_prefix = None + self.multimodal = None + self.mmap = None + self.s3_config = None + + self.index = None + self.bin_reader = None + + if is_s3_path(path_prefix) and s3_config is not None: + idx_path = get_idx_path(path_prefix) + cache_idx_path = os.path.join(s3_config.path_to_idx_cache, os.path.basename(idx_path)) + maybe_download_file(idx_path, cache_idx_path) + + self.initialize(path_prefix, multimodal, mmap, s3_config) + + def initialize( + self, path_prefix: str, multimodal: bool, mmap: bool, s3_config: Optional[S3Config] + ) -> None: + """Initialize the dataset + + This method is called by IndexedDataset.__init__ during object creation and by + IndexedDataset.__setstate__ during un-pickling + + Args: + path_prefix (str): The index (.idx) and data (.bin) prefix + + multimodal (bool): Whether the dataset is multimodal + + mmap (bool): Whether to mmap the .bin file + + s3_config (Optional[S3Config]): See IndexedDataset docstring for details. + """ + idx_path = get_idx_path(path_prefix) + bin_path = get_bin_path(path_prefix) + if s3_config is None: + assert os.path.exists(idx_path) and os.path.exists( + bin_path + ), f"One or both of the .idx and .bin files cannot be found at the path prefix {path_prefix}" + self.path_prefix = path_prefix + self.multimodal = multimodal + self.mmap = mmap + self.s3_config = s3_config + if mmap: + assert not s3_config + self.bin_reader = _MMapBinReader(bin_path) + elif s3_config: + assert not mmap + self.bin_reader = _S3BinReader(bin_path, s3_config.bin_chunk_nbytes) + idx_path = os.path.join( + s3_config.path_to_idx_cache, os.path.basename(get_idx_path(path_prefix)) + ) + else: + self.bin_reader = _FileBinReader(bin_path) + self.index = _IndexReader(idx_path, self.multimodal) + + def __getstate__(self) -> Tuple[str, bool, bool, Optional[S3Config]]: + """Get the state during pickling + + Returns: + Tuple[str, bool, bool, Optional[S3Config]]: The state tuple + """ + return self.path_prefix, self.multimodal, self.mmap, self.s3_config + + def __setstate__(self, state: Tuple[str, bool, bool, Optional[S3Config]]) -> None: + """Set the state during un-pickling + + Args: + state (Tuple[str, bool, bool, Optional[S3Config]]): The state tuple + """ + path_prefix, multimodal, mmap, s3_config = state + self.initialize(path_prefix, multimodal, mmap, s3_config) + + def __del__(self) -> None: + """Clean up the object""" + del self.bin_reader + del self.index + + def __len__(self) -> int: + """Return the length of the dataset i.e. the number of sequences in the index + + Returns: + int: The length of the dataset + """ + return len(self.index) + + def __getitem__( + self, idx: Union[int, numpy.integer, slice] + ) -> Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]: + """Return from the dataset + + Args: + idx (Union[int, numpy.integer, slice]): The index or index slice into the dataset + + Raises: + ValueError: When the index slice is non-contiguous + + TypeError: When the index is of an unexpected type + + Returns: + Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]: The sequence tokens and modes at the index or index slice + """ + if isinstance(idx, (int, numpy.integer)): + sequence_pointer, sequence_length, sequence_mode = self.index[idx] + sequence = self.bin_reader.read( + dtype=self.index.dtype, + count=sequence_length, + offset=sequence_pointer, + ) + return (sequence, sequence_mode) if sequence_mode is not None else sequence + elif isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + if step != 1: + raise ValueError("Slices into indexed_dataset must be contiguous") + sequence_lengths = self.index.sequence_lengths[idx] + sequence_modes = self.index.sequence_modes[idx] if self.multimodal else None + sequence_offsets = list(accumulate(sequence_lengths)) + sequences = numpy.split( + self.bin_reader.read( + dtype=self.index.dtype, + count=sum(sequence_lengths), + offset=self.index.sequence_pointers[start], + ), + sequence_offsets[:-1], + ) + return (sequences, sequence_modes) if sequence_modes is not None else sequences + else: + raise TypeError("Unexpected type received for idx: {}".format(type(idx))) + + def get(self, idx: int, offset: int = 0, length: Optional[int] = None) -> numpy.ndarray: + """Retrieve a single item from the dataset with the option to only + return a portion of the item. + + get(idx) is the same as [idx] but get() does not support slicing. + + Args: + idx (Union[int, numpy.integer]): The index into the dataset + + offset (int): The integer token offset in the sequence + + length (int): The number of tokens to grab from the sequence + + Returns: + Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]: The sequence tokens and modes at the index + """ + sequence_pointer, sequence_length, sequence_mode = self.index[idx] + if length is None: + length = sequence_length - offset + sequence_pointer += offset * DType.size(self.index.dtype) + sequence = self.bin_reader.read( + dtype=self.index.dtype, count=length, offset=sequence_pointer + ) + return (sequence, sequence_mode) if sequence_mode is not None else sequence + + @property + def sequence_lengths(self) -> numpy.ndarray: + """Get the sequence lengths + + Returns: + numpy.ndarray: The sequence lengths + """ + return self.index.sequence_lengths + + @property + def document_indices(self) -> numpy.ndarray: + """Get the document indices + + Returns: + numpy.ndarray: The document indices + """ + return self.index.document_indices + + def get_document_indices(self) -> numpy.ndarray: + """Get the document indices + + This method is slated for deprecation. + + Returns: + numpy.ndarray: The document indices + """ + return self.index.document_indices + + def set_document_indices(self, document_indices: numpy.ndarray) -> None: + """Set the document indices + + This method is slated for deprecation. + + Args: + document_indices (numpy.ndarray): The document indices + """ + self.index.document_indices = document_indices + + @property + def sequence_modes(self) -> numpy.ndarray: + """Get the sequence modes + + Returns: + numpy.ndarray: The sequence modes + """ + return self.index.sequence_modes + + @staticmethod + def exists(path_prefix: str) -> bool: + """Return whether the IndexedDataset exists on disk at the prefix + + Args: + path_prefix (str): The prefix to the index (.idx) and data (.bin) files + + Returns: + bool: Whether the IndexedDataset exists on disk at the prefix + """ + if is_s3_path(path_prefix): + s3_client = boto3.client("s3") + return object_exists(s3_client, get_idx_path(path_prefix)) and object_exists( + s3_client, get_bin_path(path_prefix) + ) + return os.path.exists(get_idx_path(path_prefix)) and os.path.exists( + get_bin_path(path_prefix) + ) + + +class IndexedDatasetBuilder(object): + """Builder class for the IndexedDataset class + + Args: + bin_path (str): The path to the data (.bin) file + + dtype (Type[numpy.number], optional): The dtype of the index file. Defaults to numpy.int32. + + multimodal (bool, optional): Whether the dataset is multimodal. Defaults to False. + """ + + def __init__( + self, bin_path: str, dtype: Type[numpy.number] = numpy.int32, multimodal: bool = False + ) -> None: + self.data_file = open(bin_path, "wb") + self.dtype = dtype + self.multimodal = multimodal + + self.sequence_lengths = [] + self.document_indices = [0] + self.sequence_modes = [] if self.multimodal else None + + def add_item(self, tensor: torch.Tensor, mode: int = 0) -> None: + """Add a single item to the dataset + + Args: + tensor (torch.Tensor): The item to add to the data file + + mode (int, optional): The mode for the item. Defaults to 0. + """ + np_array = numpy.array(tensor.numpy(), dtype=self.dtype) + self.data_file.write(np_array.tobytes(order="C")) + self.sequence_lengths.append(np_array.size) + if self.multimodal: + self.sequence_modes.append(mode) + + def add_document( + self, tensor: torch.Tensor, lengths: List[int], modes: Optional[List[int]] = None + ) -> None: + """Add an entire document to the dataset + + Args: + tensor (torch.Tensor): The document to add + + lengths (List[int]): The lengths of each item in the document + + modes (Optional[List[int]], optional): The modes for each item in the document. Defaults to None. + """ + np_array = numpy.array(tensor, dtype=self.dtype) + self.data_file.write(np_array.tobytes(order="C")) + self.sequence_lengths.extend(lengths) + self.document_indices.append(len(self.sequence_lengths)) + if self.multimodal: + self.sequence_modes.extend(modes if modes is not None else [0] * lengths) + + def end_document(self) -> None: + """Finalize the document, for use with IndexedDatasetBuilder.add_item""" + self.document_indices.append(len(self.sequence_lengths)) + + def add_index(self, path_prefix: str) -> None: + """Add an entire IndexedDataset to the dataset + + Args: + path_prefix (str): The index (.idx) and data (.bin) prefix + """ + # Concatenate index + index = _IndexReader(get_idx_path(path_prefix), multimodal=self.multimodal) + assert index.dtype == self.dtype + + offset = len(self.sequence_lengths) + self.sequence_lengths.extend(index.sequence_lengths) + self.document_indices.extend((offset + index.document_indices)[1:]) + + if self.multimodal: + self.sequence_modes.extend(index.sequence_modes) + + # Concatenate data + with open(get_bin_path(path_prefix), "rb") as f: + shutil.copyfileobj(f, self.data_file) + + def finalize(self, idx_path: str) -> None: + """Clean up and write the index (.idx) file + + Args: + idx_path (str): The path to the index file + """ + self.data_file.close() + with _IndexWriter(idx_path, self.dtype) as writer: + writer.write(self.sequence_lengths, self.sequence_modes, self.document_indices) + + +def get_idx_path(path_prefix: str) -> str: + """Get the path to the index file from the prefix + + Args: + path_prefix (str): The prefix + + Returns: + str: The path to the index file + """ + return path_prefix + ".idx" + + +def get_bin_path(path_prefix: str) -> str: + """Get the path to the data file from the prefix + + Args: + path_prefix (str): The prefix + + Returns: + str: The path to the data file + """ + return path_prefix + ".bin" diff --git a/megatron/core/datasets/masked_dataset.py b/megatron/core/datasets/masked_dataset.py new file mode 100644 index 0000000..081d585 --- /dev/null +++ b/megatron/core/datasets/masked_dataset.py @@ -0,0 +1,431 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import logging +import os +import time +from abc import abstractmethod +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy +import torch + +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.megatron_dataset import MegatronDataset +from megatron.core.datasets.utils import Split +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + + +@dataclass +class MaskedWordPieceDatasetConfig(BlendedMegatronDatasetConfig): + """Configuration object for Megatron Core Masked WordPiece datasets""" + + masking_probability: float = None + """The probability we mask a candidate N-gram""" + + short_sequence_probability: float = None + """The probability we return a sequence shorter than the target sequence length""" + + masking_max_ngram: int = None + """The maximum length N-gram to consider masking or permuting""" + + masking_do_full_word: bool = None + """Whether we mask the the whole word or its component parts""" + + masking_do_permutation: bool = None + """Whether we shuffle a subset of candidate N-grams in addition""" + + masking_use_longer_ngrams: bool = None + """Whether to favor longer N-grams over shorter N-grams""" + + masking_use_geometric_distribution: bool = None + """Whether to draw the size of the N-gram from a geometric distribution according to SpanBERT + https://arxiv.org/abs/1907.10529 (Section 3.1) + """ + + def __post_init__(self) -> None: + """Do asserts and set fields post init""" + super().__post_init__() + + assert self.tokenizer is not None + + assert self.masking_probability is not None + assert self.short_sequence_probability is not None + assert self.masking_max_ngram is not None + assert self.masking_do_full_word is not None + assert self.masking_do_permutation is not None + assert self.masking_use_longer_ngrams is not None + assert self.masking_use_geometric_distribution is not None + + assert self.masking_probability > 0 and self.masking_probability < 1.0 + assert self.short_sequence_probability >= 0 and self.short_sequence_probability <= 1.0 + assert self.masking_max_ngram > 0 + assert not (self.masking_use_geometric_distribution and self.masking_do_permutation) + + if self.masking_use_geometric_distribution and self.masking_use_longer_ngrams: + log_single_rank( + logger, + logging.WARNING, + "The use of a geometric distribution overrides the default distribution", + ) + + +class MaskedWordPieceDataset(MegatronDataset): + """The semi-abstract base class for masked WordPiece datasets + + This implementation makes the rigid assumption that all inheritor datasets are built upon the + IndexedDataset class. This assumption may be pushed down to the inheritors in future if + necessary. + + NB: WordPiece tokenization prepends a double hash "##" to all tokens/pieces in a word, save the + first token/piece. + + Args: + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset + + dataset_path (str): The real path on disk to the dataset, for bookkeeping + + indexed_indices (numpy.ndarray): The set of the documents indices to expose + + num_samples (Optional[int]): The number of samples to draw from the indexed dataset. When None, build as many samples as correspond to one epoch. + + index_split (Split): The indexed_indices Split + + config (MaskedWordPieceDatasetConfig): The config + """ + + def __init__( + self, + indexed_dataset: IndexedDataset, + dataset_path: str, + indexed_indices: numpy.ndarray, + num_samples: Optional[int], + index_split: Split, + config: MaskedWordPieceDatasetConfig, + ) -> None: + super().__init__( + indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config + ) + + @staticmethod + def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int: + return low_level_dataset.document_indices.shape[0] - 1 + + @staticmethod + def build_low_level_dataset( + dataset_path: str, config: MaskedWordPieceDatasetConfig + ) -> IndexedDataset: + return IndexedDataset(dataset_path) + + @staticmethod + def _key_config_attributes() -> List[str]: + """Inherited method implementation + + Returns: + List[str]: The key config attributes + """ + return super(MaskedWordPieceDataset, MaskedWordPieceDataset)._key_config_attributes() + [ + "masking_probability", + "short_sequence_probability", + "masking_max_ngram", + "masking_do_full_word", + "masking_do_permutation", + "masking_use_longer_ngrams", + "masking_use_geometric_distribution", + ] + + def __len__(self) -> int: + return self.sample_index.shape[0] + + def _build_sample_index( + self, sequence_length: int, min_sentences_per_sample: int + ) -> numpy.ndarray: + path_to_cache = self.config.path_to_cache + if path_to_cache is None: + path_to_cache = os.path.join( + self.dataset.path_prefix, "cache", f"{type(self).__name__}_indices" + ) + + get_path_to = lambda suffix: os.path.join( + path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}" + ) + path_to_description = get_path_to("description.txt") + path_to_sample_index = get_path_to("sample_index.npy") + cache_hit = all( + map( + os.path.isfile, + [ + path_to_description, + path_to_sample_index, + ], + ) + ) + + if self.num_samples is not None: + num_epochs = numpy.iinfo(numpy.int32).max - 1 + else: + num_epochs = 1 + + if not cache_hit and torch.distributed.get_rank() == 0: + log_single_rank( + logger, + logging.INFO, + f"Build and save the {type(self).__name__} {self.index_split.name} indices", + ) + self.built_anew_on_cache_miss = True + + os.makedirs(path_to_cache, exist_ok=True) + + # Write the description + with open(path_to_description, "wt") as writer: + writer.write(self.unique_description) + + # Build the sample index + log_single_rank( + logger, + logging.INFO, + f"\tBuild and save the sample index to {os.path.basename(path_to_sample_index)}", + ) + t_beg = time.time() + from megatron.core.datasets import helpers + + # Add +1 for access to document upper bound + indices = numpy.append(self.indices, self.indices[-1] + 1) + + sample_index = helpers.build_mapping( + self.dataset.document_indices[indices], + self.dataset.sequence_lengths, + num_epochs, + self.num_samples, + sequence_length, + self.config.short_sequence_probability, + self.config.random_seed, + False, + min_sentences_per_sample, + ) + numpy.save(path_to_sample_index, sample_index, allow_pickle=True) + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank( + logger, logging.INFO, f"> total number of samples: {sample_index.shape[0]}" + ) + log_single_rank(logger, logging.INFO, f"> total number of epochs: {num_epochs}") + + return sample_index + + log_single_rank( + logger, logging.INFO, f"Load the {type(self).__name__} {self.index_split.name} indices" + ) + + log_single_rank( + logger, + logging.INFO, + f"\tLoad the sample index from {os.path.basename(path_to_sample_index)}", + ) + t_beg = time.time() + sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode="r") + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + return sample_index + + def _create_masked_lm_predictions( + self, + token_ids: List[int], + target_sequence_length: int, + numpy_random_state: numpy.random.RandomState, + ) -> Tuple[List[int], List[int], List[int], List[int], List[Tuple[List[int], List[int]]]]: + """Creates the predictions for the masked LM objective + + Args: + token_ids (List[int]): The token ids + target_sequence_length (int): The target sequence length + numpy_random_state (numpy.random.RandomState): The NumPy random state + + Returns: + Tuple[List[int], List[int], List[int], List[int], List[Tuple[List[int], List[int]]]]: + 1. masked_token_ids -> The masked sequence + 2. masked_positions -> The indices for the masked token ids + 3. masked_labels -> The original token ids for the masked token ids + 4. boundaries -> The sentence and word boundaries for the sequence + 4. masked_spans -> The masked positions and labels with N-gram info intact + """ + # Build the token sentence and word boundaries and the masking candidates + # e.g. [cls, id, ##id, ##id, id, ##id, sep, id, ##id, sep] + # -> boundaries: [1, 1, 0, 0, 1, 0, 1, 1, 0, 1] + # -> candidates with whole word masking: [[1, 2, 3], [4, 5], [7, 8]] + # -> candidates sans whole word masking: [[1], [2], [3], [4], [5], [7], [8]] + boundaries = [] + candidates = [] + for i, token_id in enumerate(token_ids): + if token_id == self.config.tokenizer.cls or token_id == self.config.tokenizer.sep: + boundaries.append(1) + else: + if not self.config.tokenizer.inv_vocab[token_id].startswith("##"): + boundaries.append(1) + candidates.append([i]) + else: + boundaries.append(0) + if self.config.masking_do_full_word and len(candidates) > 0: + candidates[-1].append(i) + else: + candidates.append([i]) + + n_maskings = min( + self.config.masking_probability * target_sequence_length, + max(1, int(round(len(token_ids) * self.config.masking_probability))), + ) + + ngram_nvals = numpy.arange(self.config.masking_max_ngram, dtype=numpy.int64) + 1 + + # By default, the N-gram probabilites are inversely proportional to N + # e.g. N = 3 + # -> P = array([0.54545455, 0.27272727, 0.18181818]) + nprobs = 1.0 / ngram_nvals + nprobs = nprobs / nprobs.sum(keepdims=True) + if self.config.masking_use_longer_ngrams: + nprobs = nprobs[::-1] + + # Create a nested list of depth 3 + # layer 1: the candidate dimension + # layer 2: the N-gram dimension + # layer 3: the token dimension + candidate_ngrams = [ + [candidates[idx : idx + n] for n in ngram_nvals] for idx in range(len(candidates)) + ] + numpy_random_state.shuffle(candidate_ngrams) + + masked_token_ids = list(token_ids) + masked_positions_and_labels = [] + masked_spans = [] + masked_indices = set() + for candidate_idx in range(len(candidate_ngrams)): + n_ngrams = len(candidate_ngrams[candidate_idx]) + + # Stop when we hit our desired number of maskings + if len(masked_positions_and_labels) >= n_maskings: + break + + # Do nothing for candidates with no ngrams + if not candidate_ngrams[candidate_idx]: + continue + + # Choose the initial value of N + if self.config.masking_use_geometric_distribution: + # Sample N from a geometric distribution with p = 0.2 and clip + # i.e. SpanBERT + # -> https://arxiv.org/abs/1907.10529 (Section 3.1) + p = 0.2 + n = min(numpy_random_state.geometric(p), self.config.masking_max_ngram) + else: + p = nprobs[:n_ngrams] / nprobs[:n_ngrams].sum(keepdims=True) + n = numpy_random_state.choice(ngram_nvals[:n_ngrams], p=p) + + while True: + ngram_indices = sum(candidate_ngrams[candidate_idx][n - 1], []) + n = n - 1 + # Success: masking this N-gram puts us below the desired number of maskings + if n_maskings >= len(masked_positions_and_labels) + len(ngram_indices): + skip_candidate = False + break + # Failure: no N-grams remain for this candidate + if n == 0: + skip_candidate = True + break + + # Do nothing for candidates whose 1-gram is too long + if skip_candidate: + continue + + # Do nothing for candidate indices which have already been masked + if any(map(lambda idx: idx in masked_indices, ngram_indices)): + continue + + # Mask the tokens and record their original positions and values + for index in ngram_indices: + masked_indices.add(index) + mask = self._get_token_mask(numpy_random_state) + if mask is None: + masked_token_ids[index] = token_ids[index] + else: + masked_token_ids[index] = mask + masked_positions_and_labels.append((index, token_ids[index])) + + masked_spans.append((ngram_indices, [token_ids[index] for index in ngram_indices])) + + assert len(masked_positions_and_labels) <= n_maskings + + numpy_random_state.shuffle(candidate_ngrams) + + if self.config.masking_do_permutation: + + n_swappings = n_maskings + + permuted_indices = set() + for candidate_idx in range(len(candidate_ngrams)): + n_ngrams = len(candidate_ngrams[candidate_idx]) + + if len(permuted_indices) >= n_swappings: + break + + # Do nothing for candidates with no ngrams + if not candidate_ngrams[candidate_idx]: + continue + + p = nprobs[:n_ngrams] / nprobs[:n_ngrams].sum(keepdims=True) + n = numpy.random.choice(ngram_nvals[:n_ngrams], p=p) + + while True: + ngram_indices = sum(candidate_ngrams[candidate_idx][n - 1], []) + n = n - 1 + # Success: swapping this N-gram puts us below the desired number of swappings + if n_swappings >= len(permuted_indices) + len(ngram_indices): + skip_candidate = False + break + # Failure: no N-grams remain for this candidate + if n == 0: + skip_candidate = True + break + + # Do nothing for candidates whose 1-gram is too long + if skip_candidate: + continue + + # Do nothing for candidate indices which have already been masked or permuted + if any( + map(lambda idx: idx in masked_indices or idx in permuted_indices, ngram_indices) + ): + continue + + for index in ngram_indices: + permuted_indices.add(index) + + assert len(permuted_indices) <= n_swappings + + permuted_indices = sorted(permuted_indices) + permuted_indices_copy = list(permuted_indices) + numpy_random_state.shuffle(permuted_indices_copy) + masked_token_ids_copy = list(masked_token_ids) + + for idx, idx_copy in zip(permuted_indices, permuted_indices_copy): + masked_token_ids[idx] = masked_token_ids_copy[idx_copy] + masked_positions_and_labels.append((idx, masked_token_ids_copy[idx])) + + masked_positions_and_labels = sorted(masked_positions_and_labels, key=lambda x: x[0]) + masked_positions = [] + masked_labels = [] + for position, label in masked_positions_and_labels: + masked_positions.append(position) + masked_labels.append(label) + + masked_spans = sorted(masked_spans, key=lambda x: x[0][0]) + + return masked_token_ids, masked_positions, masked_labels, boundaries, masked_spans + + @abstractmethod + def _get_token_mask(self, numpy_random_state: numpy.random.RandomState) -> Optional[int]: + pass diff --git a/megatron/core/datasets/megatron_dataset.py b/megatron/core/datasets/megatron_dataset.py new file mode 100644 index 0000000..15a9a53 --- /dev/null +++ b/megatron/core/datasets/megatron_dataset.py @@ -0,0 +1,139 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import hashlib +import json +from abc import ABC, abstractmethod +from collections import OrderedDict +from typing import Any, Dict, Iterable, List, Optional, Union + +import numpy +import torch + +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.utils import Split + +LowLevelDataset = Union[IndexedDataset, Iterable] + + +class MegatronDataset(ABC, torch.utils.data.Dataset): + """The highest level wrapper class from which all dataset classes should inherit + + Args: + dataset (LowLevelDataset): The dataset around which to build the MegatronDataset + + dataset_path (Optional[str]): The real path on disk to the dataset, for bookkeeping + + indices (numpy.ndarray): The set of the documents indices to expose + + num_samples (Optional[int]): The minimum number of samples to build from the indexed dataset. When None, build as many samples as correspond to one epoch. + + index_split (Split): The indices Split + + config (BlendedMegatronDatasetConfig): The config + """ + + def __init__( + self, + dataset: LowLevelDataset, + dataset_path: Optional[str], + indices: numpy.ndarray, + num_samples: Optional[int], + index_split: Split, + config: BlendedMegatronDatasetConfig, + ) -> None: + self.dataset = dataset + self.dataset_path = dataset_path + self.indices = indices + self.num_samples = num_samples + self.index_split = index_split + self.config = config + + self.unique_identifiers = OrderedDict() + + self.unique_identifiers["class"] = type(self).__name__ + self.unique_identifiers["dataset_path"] = self.dataset_path + self.unique_identifiers["num_samples"] = self.num_samples + self.unique_identifiers["index_split"] = self.index_split.name + for attr in self._key_config_attributes(): + self.unique_identifiers[attr] = getattr(self.config, attr) + + self.unique_description = json.dumps( + self.unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers + ) + self.unique_description_hash = hashlib.md5( + self.unique_description.encode("utf-8") + ).hexdigest() + + self.built_anew_on_cache_miss = False + + @staticmethod + def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int: + """Return the number of elements in the underlying low level dataset for the purpose of + segregating the train/valid/test split indices + + It may be that the low level dataset can be split any number of ways, depending on the mid + level dataset it supports, which is why we define the "number of elements" function + separately from the __len__ function here in the mid level dataset class + + Args: + low_level_dataset (LowLevelDataset): The underlying low level dataset + + Returns: + int: The number of elements in the underlying low level dataset + """ + raise NotImplementedError + + @staticmethod + def build_low_level_dataset( + dataset_path: str, config: BlendedMegatronDatasetConfig + ) -> LowLevelDataset: + """Build the low level dataset via a function to be called from within + BlendedMegatronDatasetBuilder.build_generic_dataset + + It may be that the low level dataset spans any subset of train/valid/test splits, which is + why we define a static "build" function separately from the constructor in the mid level + dataset class + + Args: + dataset_path (str): The real path on disk to the dataset + + config (BlendedMegatronDatasetConfig): The dataset config + + Returns: + LowLevelDataset: The low level dataset + """ + raise NotImplementedError + + @staticmethod + def _key_config_attributes() -> List[str]: + """Return all config attributes which contribute to uniquely identifying the dataset. + + These attributes will be used to build a uniquely identifying string and MD5 hash which + will be used to cache/load dataset resources from run to run. + + Returns: + List[str]: The key config attributes + """ + return ["random_seed", "sequence_length", "split", "split_matrix", "tokenizer"] + + @abstractmethod + def __len__(self) -> int: + """Return the length of the dataset + + Returns: + int: See abstract implementation + """ + pass + + @abstractmethod + def __getitem__(self, idx: int) -> Dict[str, Union[torch.Tensor, numpy.ndarray]]: + """Return from the dataset + + Args: + idx (int): The index into the dataset + + Returns: + Dict[str, Union[torch.Tensor, numpy.ndarray]]: See abstract implementation + """ + pass diff --git a/megatron/core/datasets/megatron_tokenizer.py b/megatron/core/datasets/megatron_tokenizer.py new file mode 100644 index 0000000..b19bec0 --- /dev/null +++ b/megatron/core/datasets/megatron_tokenizer.py @@ -0,0 +1,141 @@ +import json +from abc import ABC, abstractmethod +from collections import OrderedDict +from typing import Any + +import numpy + + +class MegatronTokenizer(ABC): + """Abstract class for tokenizer + + Absent a config or class-specific tracking of which objects are uniquely identifying, we must + include all key word arguments as unique identifiers + + Args: + tokenizer_paths (Tuple[str]): All tokenizer source paths or prefixes + + tokenizer_options (Dict[str, Any]): All tokenizer options + """ + + def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any): + + self.unique_identifiers = OrderedDict() + self.unique_identifiers["class"] = type(self).__name__ + self.unique_identifiers["tokenizer_path"] = list(tokenizer_paths) + for option in tokenizer_options: + self.unique_identifiers[option] = str(tokenizer_options[option]) + + self.unique_description = json.dumps(self.unique_identifiers, indent=4) + + super().__init__() + + @abstractmethod + def tokenize(self, text: str) -> numpy.ndarray: + """Convert text to embedding ids + + Args: + text (str): The text to convert + + Returns: + numpy.ndarray: The converted embedding ids + """ + pass + + def detokenize(self, ids: numpy.ndarray) -> str: + """Convert embedding ids to text + + Args: + ids (numpy.ndarray): The ids to convert + + Returns: + str: The converted text + + Raises: + NotImplementedError: Non-abstract, optional method + """ + raise NotImplementedError("{} has no method 'detokenize'".format(type(self).__name__)) + + @property + @abstractmethod + def vocab(self): + """Dictionary from vocab text token to id token + """ + pass + + @property + @abstractmethod + def inv_vocab(self): + """Dictionary from vocab id token to text token + """ + pass + + @property + @abstractmethod + def vocab_size(self): + """The vocabulary size + """ + pass + + @property + def cls(self): + """The CLS token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'cls'".format(type(self).__name__)) + + @property + def sep(self): + """The SEP token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'sep'".format(type(self).__name__)) + + @property + def pad(self): + """The PAD token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'pad'".format(type(self).__name__)) + + @property + def eod(self): + """The EOD token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'eod'".format(type(self).__name__)) + + @property + def bos(self): + """The BOS token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'bos'".format(type(self).__name__)) + + @property + def eos(self): + """The EOS token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'eos'".format(type(self).__name__)) + + @property + def mask(self): + """The MASK token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'mask'".format(type(self).__name__)) diff --git a/megatron/core/datasets/multimodal_dataset.py b/megatron/core/datasets/multimodal_dataset.py new file mode 100644 index 0000000..0a3e93a --- /dev/null +++ b/megatron/core/datasets/multimodal_dataset.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass +from typing import Callable, Dict + +import torch + +from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset + + +@dataclass +class MultimodalDatasetConfig(GPTDatasetConfig): + """Configuration object for Megatron Core Multimodal datasets. + + Note: This is unused at the moment and may be missing features. Follow-up changes will use this. + """ + + image_h: int = None + """Image height.""" + + image_w: int = None + """Image width.""" + + # Function to preprocess the data sample to a format expected by a specific model. By default, do nothing. + preprocess_func: Callable[[Dict[str, torch.Tensor]], Dict[str, torch.Tensor]] = lambda x: x + """Optional function to preprocess data samples for a specific model.""" + + def __post_init__(self) -> None: + super().__post_init__() + + assert self.image_h is not None + assert self.image_w is not None + + +class MockMultimodalDataset(MockGPTDataset): + """Mock multimodal dataset. + + + This is unused at the moment and may be missing features. Follow-up changes will use this. + """ + + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: + """Return a sample that contains a dummy image, text sequence and the associated labels and cost and attention masks. + + Args: + idx (int): The integer seed for mock data generation. + + Returns: + Dict[str, torch.Tensor]: The mock data. + """ + # Get a text sample. + sample = super().__getitem__(idx) + + # Add mock input image. + sample["image"] = torch.zeros( + (3, self.config.image_h, self.config.image_w), dtype=torch.float32 + ) + + # Run optional data preprocessing. + preprocess_func = self.config.preprocess_func + + return preprocess_func(sample) diff --git a/megatron/core/datasets/readme.md b/megatron/core/datasets/readme.md new file mode 100644 index 0000000..12ade94 --- /dev/null +++ b/megatron/core/datasets/readme.md @@ -0,0 +1,193 @@ +# Data Pipeline + +## Data pre-processing + +Data preprocessing is built around the following classes: + +1. `IndexedDatasetBuilder` +2. `IndexedDataset` + +At the moment, an end-to-end data preprocessing implementation is left to the user. See the class docstring(s) for more details. + +#### IndexedDatasetBuilder + +The `IndexedDatasetBuilder` is capable of building and merging `IndexedDataset` instances. + +#### IndexedDataset + +The `IndexedDataset` class is the lowest-level data interface in Megatron Core. Internally, an `IndexedDataset` instance references two binaries: the data file (`.bin`) contains document/sequence data and the index file (`.idx`) contains document/sequence metadata. + +The index file stores dataset-level metadata first: +- The index header, for backward compatibility +- The index version, for backward compatibility +- A numeric code corresponding to the data type used to write data to the data file +- The number of sequences in the dataset +- The number of documents in the dataset + +The index file stores document-level and sequence-level metadata second: +- In order, the number of elements per sequence +- In order, the byte offset (pointer) per sequence +- In order, the consecutive sequence index range `[...)` per document +- In order, the mode per sequence (in the multimodal case) + +## Data loading: construction + +Building the data loaders is a distributed-aware process built around the following classes: + +1. `BlendedMegatronDatasetConfig` +2. `BlendedMegatronDatasetBuilder` +3. `IndexedDataset` +3. `MegatronDataset` +4. `BlendedDataset` + +See the class docstrings for more details. + +#### BlendedMegatronDatasetConfig (extendable) + +The `BlendedMegatronDatasetConfig` class parameterizes the `BlendedMegatronDatasetBuilder` and in turn the `MegatronDataset` and `BlendedDataset`. + +Different training/inference regimes will require different extensions e.g. the `GPTDatasetConfig` + +#### BlendedMegatronDatasetBuilder + +The `BlendedMegatronDatasetBuilder` class builds the highest-level data interfaces in Megatron Core. + +**NB:** All ranks should attempt to build the dataset via the `BlendedMegatronDatasetBuilder` or the program will hang. Which ranks follow through on their attempts can be controlled via the `BlendedMegatronDatasetConfig`. + +#### IndexedDataset + +The `IndexedDataset` class is the lowest-level data interface in Megatron Core. + +The `IndexedDataset` should already exist on disk before attempting to build any of the high-level data interfaces. + + +#### MegatronDataset (extendable) + +The `MegatronDataset` abstract class is a high-level data interface in Megatron Core. It is an abstraction built upon the `IndexedDataset`. + +Different training/inference regimes will require different extensions e.g. the `GPTDataset` + +#### BlendedDataset + +The `BlendedDataset` class is a high-level data interface in Megatron Core. It is an abstraction built upon the `MegatronDataset`. + +The `BlendedDataset` is only necessary when a blend multiple data distributions, i.e. multiple `MegatronDataset` instances, should contribute to a certain dataset split. The blend can be controlled via the `BlendedMegatronDatasetConfig`. + +## Data loading: implementation + +### GPTDataset + +The `GPTDataset` is parameterized by the following variables: the underlying `IndexedDataset` instance `indexed_dataset`, the split indices `indexed_indices` (the congituous subset of document or sequence indices used for training, validation, and testing), the number of samples `N`, the sequence length `S`, and the random seed `R`. + +The `GPTDataset` creates three index mappings to facilitate lookup: (1) the document index, (2) the sample index, and (3) the shuffle index. + +1. The document index _Do_idx_ is a 1-D array mapping from _i_ to document index of length `E * |indexed_indices|` where `E` corresponds to the minimum number of epochs such that `E * |indexed_indices| >= N`. The document index is shuffled according to `R`. + + ``` + Given: + + N = 15 + indexed_indices = [5, 6, 7, 8, 9] + E = 3 + + Then, for example: + + Do_idx = [8, 8, 9, 6, 7, 5, 8, 5, 6, 6, 5, 9, 7, 7, 9] + ``` + +2. The sample index _Sa_idx_ is a 2-D array mapping from _j_ to pairs of (_i_, _Do_idx_[ _i_ ] offset) of shape `[N + 1, 2]`. The rows _j_ and _j_ + 1 serve as the left and right bounds for the _j_-th sample. + + ``` + Given: + + S = 1024 + + Then, for example: + + Sa_idx[0] = (0, 0) + Sa_idx[1] = (0, 1024) => Do_idx[0] has length greater than S + Sa_idx[2] = (1, 512) => Do_idx[0] has length 1536 + Sa_idx[3] = (2, 0) => Do_idx[1] has length 1536 + Sa_idx[4] = (5, 300) => Do_idx[2:5] are shorter documents relative to Do_idx[0:2] + Sa_idx[5] = (6, 24) => Do_idx[5] has length 1300 + ``` + +3. The shuffle index _Sh_idx_ is a 1-D array mapping from _k_ to _j_ of length `N`. The shuffle index is shuffled according to `R`. + + ``` + Given + + N = 10 + + Then, for example: + + Sh_idx = [4, 0, 2, 6, 1, 9, 5, 8, 7, 3] + ``` + +To query the `GPTDataset` for the _k_-th sample we do the following + +- Use the shuffle index to get the index _j_ into the sample index. + + ``` + j = Sh_idx[k] + ``` +- Use the sample index to get the left and right sample-bounding indices into the document index and the starting token offset for each document. + + ``` + i, offset = Sa_idx[j] + i_next, offset_next = Sa_idx[j + 1] + ``` +- Use the document index to retrieve `S` tokens from consecutive (in the document index) documents. + + ``` + sample = [] + sample += indexed_dataset[Do_idx[i]][offset:] + if i != i_next: + sample += indexed_dataset[Do_idx[i + 1:i_next]] + sample += indexed_dataset[Do_idx[i_next]][:offset_next] + ``` + +To save time during initialization, each index is built/cached sequentially on one process rank and subsequently loaded in parallel on other process ranks. The cached indices are unique to a hash generated in the `MegatronDataset.__init__` function. + +### BlendedDataset + +The `BlendedDataset` is parameterized by the following variables: the underlying `MegatronDataset` instances `D`, the weights `W` (one per dataset), and the size `S`. The `BlendedDataset` will draw samples from contributing datasets in proportion to the weights until achieving a composite dataset of the desired size. During each sampling step, we draw a single sample from the dataset which has the greatest sampling error. + +The `BlendedDataset` creates two "blending" indices to facilitate lookup: (1) the dataset index and (2) the dataset sample index. + +1. The dataset index _Da_idx_ is a 1-D array mapping from _i_ to dataset index of length `S`. + + ``` + Given + + D = [d0, d1, d2] + W = [1/2, 1/4, 1/4] + S = 4 + + Then, for example: + + Da_idx = [0, 1, 2, 0] + + ``` + +2. The dataset sample index _Sa_idx_ is a 1-D mapping from _i_ to the sample index for dataset _Da_idx[i]_ of length `S`. + + ``` + Given + + Da_idx = [0, 1, 2, 0] + + Then, for example: + + Sa_idx = [0, 0, 0, 1] + ``` + +To query the `BlendedDataset` for the _k_-th sample we do the following + +- Use the dataset index to retrieve the corresponding dataset from `D` and the dataset sample index to retrieve the corresponding sample from that dataset. + + ``` + sample = D[Da_idx[k]][Sa_idx[k]] + ``` + +To save time during initialization, each index is built/cached sequentially on one process rank and subsequently loaded in parallel on other process ranks. The cached indices are unique to a hash generated in the `BlendedDataset.__init__` function. diff --git a/megatron/core/datasets/retro/__init__.py b/megatron/core/datasets/retro/__init__.py new file mode 100644 index 0000000..7ce970c --- /dev/null +++ b/megatron/core/datasets/retro/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from .config import RetroGPTChunkDatasets +from .query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig +from .query.retro_dataset import get_retro_datasets diff --git a/megatron/core/datasets/retro/config/__init__.py b/megatron/core/datasets/retro/config/__init__.py new file mode 100644 index 0000000..3635bed --- /dev/null +++ b/megatron/core/datasets/retro/config/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Exports: + + - Embedder: Base class for all Bert embedders. + - RetroBertEmbedders: Container class for in-memory and on-disk embedders. + - RetroPreprocessingConfig: Configuration class for all of Retro preprocessing. + - RetroGPTChunkDatasets: Container class for train, valid, and test datasets. + - RetroTokenizers: Container class for GPT and Bert tokenizers. +""" + +from .bert_embedders import Embedder, RetroBertEmbedders +from .config import RetroPreprocessingConfig +from .gpt_chunk_datasets import RetroGPTChunkDatasets +from .tokenizers import RetroTokenizers diff --git a/megatron/core/datasets/retro/config/bert_embedders.py b/megatron/core/datasets/retro/config/bert_embedders.py new file mode 100644 index 0000000..8f3fe85 --- /dev/null +++ b/megatron/core/datasets/retro/config/bert_embedders.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Container dataclass for holding both in-memory and on-disk Bert embedders.""" + +import abc +from dataclasses import dataclass +from typing import Any + +import numpy as np +import torch + + +class Embedder(abc.ABC): + """Base class for all Bert embedders. + + All embedders should be able to embed either an entire text dataset (to a 2D + numpy array), or a single text string (to a 1D numpy array). + """ + + @abc.abstractmethod + def embed_text_dataset(self, text_dataset: torch.utils.data.Dataset) -> np.ndarray: + """Embed a text dataset. + + Args: + text_dataset (torch.utils.data.Dataset): Text dataset to embed. Each sample of the text dataset should output a dict with a key 'text' and a string value. + + Returns: + A 2D ndarray with shape (len(text_dataset), dimension(embedder)). + """ + + @abc.abstractmethod + def embed_text(self, text: str) -> np.ndarray: + """Embed a simple string of text. + + Args: + text (str): A single text sample. + + Returns: + A 1D ndarray with shape (dimensions(embedder),). + """ + + +@dataclass +class RetroBertEmbedders: + """Container dataclass for in-memory and on-disk Bert embedders.""" + + disk: Embedder + mem: Embedder diff --git a/megatron/core/datasets/retro/config/config.py b/megatron/core/datasets/retro/config/config.py new file mode 100644 index 0000000..ac9ca84 --- /dev/null +++ b/megatron/core/datasets/retro/config/config.py @@ -0,0 +1,135 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Retro preprocessing config.""" + +from dataclasses import dataclass + +from megatron.core.transformer import TransformerConfig + +from .bert_embedders import RetroBertEmbedders +from .gpt_chunk_datasets import RetroGPTChunkDatasets +from .tokenizers import RetroTokenizers + + +@dataclass +class RetroPreprocessingConfig(TransformerConfig): + """Configuration object for Retro preprocessing. + + *Note* : Arguments prefixed with '--retro-gpt-*' or '--retro-bert-*' are + included and named as such to more easily handle managing both models + running at the same time. Megatron is not optimized to run two models at + once, so this naming convention makes it clearer. + + Args: + + retro_project_dir (str): Retro project directory, which contains the preprocessed data for for pretraining. This directory is built during preprocessing (see tools/retro/README.md), and contains subdirectories for the chunk database and pretraining neighbors. + retro_tasks (str): Comma-separated list of tasks to run. Run entire preprocesing pipeline by using '--retro-tasks build'. Alternatively, run individual stages with tasks (in this order) 'db-build', 'index-build', or 'query-pretraining-neighbors'. For example, '--retro-tasks db-build,index-build,query-pretraining-neighbors' is equivalent to '--retro-tasks build'; or the argument can contain a subset of these tasks. Stages must always be run in the correct order (listed above). + retro_task_validate (float): If defined, validate a randomly sampled subset of the existing results of the given task. Each task implements a 'validate' method that is responsible for sampling a `retro_task_validate` fraction of the existing results, and then checking for bitwise equality with the current code base. (E.g., `--retro-task-validate 0.01`.) + retro_block_size (int): Number of chunks to process at a time when generating Bert embeddings and querying the search index. Partial results for each block are generally saved to disk in separate files. + retro_doc_block_size (int): Number of documents to processe at time when processing token datasets into chunk databases. The partial chunk database for each block is saved into a separate file. + retro_gpt_seed (int): Random seed used for python, numpy, pytorch, and cuda. + retro_gpt_data_path (str): Path to the training dataset. Accepted format: 1) a single data path, 2) multiple datasets in the form: dataset1-weight dataset1-path dataset2-weight dataset2-path ... It is used with --split when a single dataset used for all three: train, valid and test. It is exclusive to the other --*-data-path args. + retro_gpt_data_cache_path (str): Path to a directory to hold cached index files. + retro_gpt_split (str): Comma-separated list of proportions for training, validation, and test split. For example the split `90,5,5` will use 90%% of data for training, 5%% for validation and 5%% for test. + retro_gpt_train_samples (int): Total number of samples to train over all training runs. + retro_gpt_eval_interval (int): GPT evaluation interval. + retro_gpt_eval_iters (int): GPT evaluation iterations. + retro_gpt_tokenizer_type (str): GPT tokenizer type. + retro_gpt_tokenizer_model (str): GPT tokenizer model file. + retro_gpt_vocab_file (str): GPT vocab file. + retro_gpt_merge_file (str): GPT merge file. + retro_gpt_seq_length (int): GPT sequence length. + retro_gpt_global_batch_size (int): GPT global batch size. + retro_gpt_chunk_length (int): GPT chunk length. + retro_bert_tokenizer_type (str): Bert tokenizer type (for when using '--bert-embedder-type megatron'). + retro_bert_vocab_file (str): Bert vocab file. + retro_bert_batch_size (int): Micro-batch size for processing Bert embeddings. + retro_bert_max_chunk_length (int): Maximum sequence length for Bert embeddings. (Named 'chunk' here in reference to these Bert sequences being converted from GPT chunks.) + retro_index_type (str): A 'faiss-base' index is a simple, un-optimized wrapper around a Faiss index. A 'faiss-par-add' index optimizes the 'add()' method by making it multi-node and multi-process, but with bit-wise equivalent results. + retro_index_str (str): Index string used for calling faiss.index_factory(). For example, 'IVF262144_HNSW32,Flat' or 'OPQ32_256,IVF4194304_HNSW32,PQ32'. + retro_index_ntrain (int): Number of database chunks to use for training the index. This value must be less or equal to the total number of chunks in the database. + retro_index_train_load_fraction (float): Fraction of sampled chunks to use for training the index. Useful when our total sampled embeddings use too much memory; lowering the load fraction is less costly than re-embedding a new sampled dataset from scratch. + retro_index_add_load_fraction (float): Fraction of database chunks to use for adding to the index. Useful when our total index size would use too much memory; lowering the load fraction is less costly than re-designing our token datasets. + retro_index_delete_training_embeddings (bool): Delete training embeddings for the search index. Useful for debugging. + retro_index_delete_added_codes (bool): Delete added codes for the search index. Useful for debugging. + retro_query_ef_search (int): Index ef-search parameter for Hierarchical Navigable Small Worlds (HNSW) during querying. + retro_query_nprobe (int): Index nprobe parameter for Inverted File (IVF) during querying. + retro_query_num_neighbors_query (int): Number of neighbors to retrieve when calling index.search(). + retro_query_num_neighbors_save (int): Number of neighbors to save to disk after the index's returned neighbors. If longer than target value, neighbors truncated; and if shorter than target value, neighbors are padded with -1's. + retro_bert_embedders (RetroBertEmbedders): Set of Bert embedders used for embedding chunks. Contains entries: 1) 'mem' for an in-memory embedder, and 2) 'disk' for an embedder that saves results in blocks to disk. + retro_gpt_chunk_datasets (RetroGPTChunkDatasets): GPT datasets for 'train', 'valid', and 'test'. + retro_tokenizers (RetroTokenizers): GPT ('gpt') and Bert ('bert') tokenizers. + """ + + # Basic. + retro_project_dir: str = None + retro_tasks: str = 'build' + retro_task_validate: float = None + retro_block_size: int = 100000 + retro_doc_block_size: int = 100000 + + # GPT. + retro_gpt_seed: int = 1234 + retro_gpt_data_path: list = None # basic list here, for parsing purposes + retro_gpt_data_cache_path: str = None + retro_gpt_split: str = '969,30,1' + retro_gpt_train_samples: int = None + retro_gpt_eval_interval: int = None + retro_gpt_eval_iters: int = None + retro_gpt_tokenizer_type: str = None + retro_gpt_tokenizer_model: str = None + retro_gpt_vocab_file: str = None + retro_gpt_merge_file: str = None + retro_gpt_seq_length: int = None + retro_gpt_global_batch_size: int = None + retro_gpt_chunk_length: int = 64 + + # Bert. + retro_bert_tokenizer_type: str = None + retro_bert_vocab_file: str = None + retro_bert_batch_size: int = 128 + retro_bert_max_chunk_length: int = 256 + + # Index. + retro_index_type: str = 'faiss-par-add' + retro_index_str: str = None + retro_index_ntrain: int = None + retro_index_train_load_fraction: float = 1.0 + retro_index_add_load_fraction: float = 1.0 + retro_index_delete_training_embeddings: bool = True + retro_index_delete_added_codes: bool = True + + # Query. + retro_query_ef_search: int = 256 + retro_query_nprobe: int = 65536 + retro_query_num_neighbors_query: int = 200 + retro_query_num_neighbors_save: int = 20 + + # Tools. + retro_bert_embedders: RetroBertEmbedders = None + retro_gpt_chunk_datasets: RetroGPTChunkDatasets = None + retro_tokenizers: RetroTokenizers = None + + def __post_init__(self) -> None: + """Validate Retro config.""" + + # Validate required attributes. + assert self.retro_project_dir is not None + assert self.retro_tasks is not None + assert self.retro_gpt_data_path is not None or self.retro_gpt_data_cache_path is not None + assert self.retro_gpt_train_samples is not None + assert self.retro_gpt_eval_interval is not None + assert self.retro_gpt_eval_iters is not None + assert self.retro_gpt_tokenizer_type is not None + assert self.retro_gpt_tokenizer_model is not None or ( + self.retro_gpt_vocab_file is not None and self.retro_gpt_merge_file is not None + ) + assert self.retro_gpt_seq_length is not None + assert self.retro_gpt_global_batch_size is not None + assert self.retro_bert_tokenizer_type is not None + assert self.retro_bert_vocab_file is not None + assert self.retro_index_str is not None + assert self.retro_index_ntrain is not None + + # Split retro tasks. + self.retro_tasks = self.retro_tasks.split(",") diff --git a/megatron/core/datasets/retro/config/gpt_chunk_datasets.py b/megatron/core/datasets/retro/config/gpt_chunk_datasets.py new file mode 100644 index 0000000..831b1d8 --- /dev/null +++ b/megatron/core/datasets/retro/config/gpt_chunk_datasets.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Container dataclass for GPT chunk datasets (train, valid, and test).""" + +from dataclasses import dataclass + + +@dataclass +class RetroGPTChunkDatasets: + """Container dataclass for GPT chunk datasets.""" + + # Each dict contains 'dataset', 'neighbor_dir', and 'num_active_chunks'. + train: dict = None + valid: dict = None + test: dict = None diff --git a/megatron/core/datasets/retro/config/tokenizers.py b/megatron/core/datasets/retro/config/tokenizers.py new file mode 100644 index 0000000..2e731c8 --- /dev/null +++ b/megatron/core/datasets/retro/config/tokenizers.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Container class for GPT and Bert tokenizers.""" + +from dataclasses import dataclass + +from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer + + +@dataclass +class RetroTokenizers: + """Container class for GPT and Bert tokenizers.""" + + gpt: MegatronTokenizer = None + bert: MegatronTokenizer = None diff --git a/megatron/core/datasets/retro/db/__init__.py b/megatron/core/datasets/retro/db/__init__.py new file mode 100644 index 0000000..f1f460b --- /dev/null +++ b/megatron/core/datasets/retro/db/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Exports: + + - build_db: Build a chunk database from a list of indexed datasets. +""" + +from .build import build_db diff --git a/megatron/core/datasets/retro/db/build.py b/megatron/core/datasets/retro/db/build.py new file mode 100644 index 0000000..1469c08 --- /dev/null +++ b/megatron/core/datasets/retro/db/build.py @@ -0,0 +1,631 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Build a chunk database from a list of indexed datasets. + +Building a chunk database consists of. + + - Breaking each document of each indexed dataset into consecutive + retro_gpt_chunk_length chunks. + - Re-tokenize each chunk into Bert, and discard any chunks with empty Bert + tokens. + - Save chunk offsets to disk for each indexed dataset. +""" + +import glob +import os +import types +from concurrent.futures import ProcessPoolExecutor, as_completed +from typing import Dict, List, Tuple + +import numpy as np +import torch +from tqdm import tqdm + +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.external_libs import h5py +from megatron.core.datasets.retro.utils import ( + extract_data_config, + get_blocks_by_rank, + log_retro_rank_0, + retro_makedir, +) + +from .utils import ( + get_indexed_dataset_infos, + get_indexed_dataset_infos_path, + get_individual_chunk_db, + get_individual_db_dir, + get_individual_db_paths, + get_individual_doc_offsets, + get_merged_db_path_map, + init_indexed_dataset_infos, + load_indexed_datasets, + save_indexed_dataset_infos, +) + + +def build_partial_db( + config: types.SimpleNamespace, + dataset_idx: int, + n_datasets: int, + indexed_dataset: IndexedDataset, + block_id: int, + n_blocks: int, + block: dict, + proc_id: int, + n_procs: int, +) -> Tuple[int, list, list, dict]: + """Process a document index range of the indexed dataset. + + The chunk database is built in parallel blocks, since de-tokenizing & + re-tokenizing for Bert-length computation is expensive. This method + iterates each document and extracts sequential 'chunk-length' sequences + from each document. + + Args: + config (types.SimpleNamespace): Subset of Retro config, containing 'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'. + dataset_idx (int): Index of this dataset out of all blended datasets. + n_datasets (int): Total number of blended datasets. + indexed_dataset (IndexedDataset): Indexed dataset to be chunked. + block_id (int): Block index out of all blocks to be processed. + n_blocks (int): Total number of blocks to be processed. + block (dict): Range information such as start/end points for chunking idnexed dataset. + proc_id (int): Process ID for tracking parallel process order. + n_procs (int): Total number of parallel processes. + + Returns: + A tuple containing: + + - Process ID. + - List of valid chunks. + - List of invalid chunks (i.e., chunks that converted to empty Bert embeddings.). + - Dict mapping document ID to number of valid chunks. + """ + + # Document start/end indexes. + doc_range = block["range"] + n_docs = doc_range[1] - doc_range[0] + n_docs_per_proc = int(np.ceil(n_docs / n_procs)) + doc_start_id = doc_range[0] + proc_id * n_docs_per_proc + doc_end_id = min(doc_range[1], doc_start_id + n_docs_per_proc) + + # Print progress. + progress_proc_ids = set(range(n_procs)) if torch.distributed.get_rank() == 0 else set() + if proc_id in progress_proc_ids: + log_retro_rank_0( + " > building partial chunk db, proc %d / %d, docs %d:%d / %d." + % (proc_id, n_procs, doc_start_id, doc_end_id, n_docs,) + ) + + # Progress bars (snapshot of overall progress). + doc_id_iter = range(doc_start_id, doc_end_id) + pbar = ( + tqdm(doc_id_iter, "parse doc chunks", miniters=len(doc_id_iter) // 20,) + if proc_id in progress_proc_ids + else doc_id_iter + ) + + # Iterate documents & parse chunks. + chunk_db_valid: List[Tuple] = [] + chunk_db_invalid: List[Tuple] = [] + doc_size_map = {} + for doc_id in pbar: + + # Progress description. + try: + pbar.set_description( + "%sds %d / %d, block %d / %d, proc %d / %d." + % ( + "" if config.task_validate is None else "[validate] ", + dataset_idx, + n_datasets, + block_id, + n_blocks, + proc_id, + n_procs, + ) + ) + except: + pass + + # Remove EOD token. + doc = indexed_dataset.get(doc_id) + if doc[-1].item() == config.gpt_eod: + doc = doc[:-1] + doc_len = len(doc) + + # Chunk start/end indexes. + chunk_start_idxs = list(range(0, doc_len, config.chunk_length)) + chunk_end_idxs = [min(doc_len, s + config.chunk_length) for s in chunk_start_idxs] + + # Re-tokenize each chunk to Bert/Wordpiece (empty bert -> 'invalid'). + doc_size_map[doc_id] = 0 + for i, chunk_start_idx in enumerate(chunk_start_idxs): + + # Re-tokenize. + chunk_end_idx = chunk_end_idxs[i] + gpt_token_ids = indexed_dataset.get( + idx=doc_id, offset=chunk_start_idx, length=chunk_end_idx - chunk_start_idx, + ) + text = config.gpt_detokenize(gpt_token_ids.tolist()) + bert_token_ids = config.bert_tokenize(text) + + # 'Valid' for non-empty Bert chunks; 'invalid' otherwise. + if len(bert_token_ids) == 0: + _chunk_db = chunk_db_invalid + else: + _chunk_db = chunk_db_valid + doc_size_map[doc_id] += 1 + _chunk_db.append((doc_id, chunk_start_idx, chunk_end_idx, len(bert_token_ids),)) + + return proc_id, chunk_db_valid, chunk_db_invalid, doc_size_map + + +def build_block_db( + config: RetroPreprocessingConfig, + dataset_idx: int, + n_datasets: int, + indexed_dataset: IndexedDataset, + n_procs: int, + executor: ProcessPoolExecutor, + n_missing_blocks: int, + block_idx: int, + block: dict, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Split each document within block into consecutive retro_gpt_chunk_length size chunks. + + Args: + config (RetroPreprocessingConfig): For DB building, we make use of attributes 'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'. + dataset_idx (int): Index of this dataset out of all blended datasets. + n_datasets (int): Total number of blended datasets. + indexed_dataset (IndexedDataset): Indexed dataset to be chunked. + n_procs (int): Total number of parallel processes. + executor (ProcessPoolExecutor): Executor for launching parallel processes. + n_missing_blocks (int): Total number of blocks to be processed. + block_idx (int): Block index out of all blocks to be processed. + block (dict): Range information such as start/end points for chunking idnexed dataset. + + Returns: + A tuple containing: + + - List of valid chunks. + - List of invalid chunks (i.e., chunks that converted to empty Bert embeddings.). + - Dict mapping document ID to number of valid chunks. + """ + + # Build partial dbs. + log_retro_rank_0(' > build partial dbs.') + futures = [] + for proc_id in range(n_procs): # not true process id + futures.append( + executor.submit( + build_partial_db, + types.SimpleNamespace( + chunk_length=config.retro_gpt_chunk_length, + gpt_eod=config.retro_tokenizers.gpt.eod, + gpt_detokenize=config.retro_tokenizers.gpt.detokenize, + bert_tokenize=config.retro_tokenizers.bert.tokenize, + task_validate=config.retro_task_validate, + ), + dataset_idx, + n_datasets, + indexed_dataset, + block_idx, + n_missing_blocks, + block, + proc_id, + n_procs, + ) + ) + partial_chunk_dbs = [] + for future in as_completed(futures): + partial_chunk_dbs.append(future.result()) + + # Concatenate chunks. + partial_chunk_dbs.sort(key=lambda item: item[0]) # sort by proc_id + chunk_db_valid = [ + item for partial_chunk_db in partial_chunk_dbs for item in partial_chunk_db[1] + ] + chunk_db_invalid = [ + item for partial_chunk_db in partial_chunk_dbs for item in partial_chunk_db[2] + ] + + # Convert to numpy. + log_retro_rank_0(' > converting chunk db to numpy.') + chunk_db_valid = np.array(chunk_db_valid, dtype="uint32") + chunk_db_invalid = np.array(chunk_db_invalid, dtype="uint32") + + # Document offsets. + doc_sizes = [ + (d, s) for partial_chunk_db in partial_chunk_dbs for d, s in partial_chunk_db[3].items() + ] + doc_sizes.sort(key=lambda item: item[0]) + doc_offsets = np.cumsum([item[1] for item in doc_sizes]).astype("uint64") + doc_offsets = np.stack( + (np.array([item[0] for item in doc_sizes], dtype="uint64"), doc_offsets), axis=1 + ) + + return chunk_db_valid, chunk_db_invalid, doc_offsets + + +def save_block_db( + block: dict, chunk_db_valid: np.ndarray, chunk_db_invalid: np.ndarray, doc_offsets: np.ndarray, +) -> None: + """Save block of chunked tokens to disk. These blocks are later used for + training and adding to the vector index. + + Args: + block (dict): Range information such as start/end points for chunking idnexed dataset. + chunk_db_valid (np.ndarray): Array of valid chunk indexes. + chunk_db_invalid (np.ndarray): Array of invalid chunk indexes. + doc_offsets (np.ndarray): Array of document offsets by chunks. + """ + log_retro_rank_0(" > saving individual db.") + with h5py.File(block["path"], "w") as f: + dset = f.create_dataset("chunks_valid", data=chunk_db_valid) + dset = f.create_dataset("chunks_invalid", data=chunk_db_invalid) + dset = f.create_dataset("doc_offsets", data=doc_offsets) + + +def build_individual_db( + config: RetroPreprocessingConfig, dataset_idx: int, n_datasets: int, dataset_info: dict, +) -> None: + """Process a single indexed dataset & extract chunks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + dataset_idx (int): Dataset index within blended dataset. + n_datasets (int): Total number of datasets within blended dataset. + dataset_info (dict): Metadata for dataset (see `save_indexed_dataset_infos()` in `utils.py` for more detail). + """ + + # Make directory. + db_dir = get_individual_db_dir(config.retro_project_dir, dataset_info["prefix"]) + retro_makedir(config, db_dir) + + # Indexed dataset. + indexed_dataset = dataset_info["dataset"] + + # Missing DB blocks (split by documents). + blocks = get_blocks_by_rank( + db_dir, + len(indexed_dataset), + config.retro_doc_block_size, + validate=lambda f: f["chunks_valid"].shape == (0,) or f["chunks_valid"].shape[1] == 4, + sample=config.retro_task_validate, + ) + if config.retro_task_validate is None: + active_blocks = blocks.missing + else: + assert blocks.n_missing_world == 0 + active_blocks = blocks.existing + + # Prevent missing-path-write race condition. + torch.distributed.barrier() + + # Nothing to do? + if config.retro_task_validate is None and not active_blocks: + return + + # Num processes. + if blocks.n_missing_world == 1: + n_procs = 128 + elif blocks.n_missing_world <= 2: + n_procs = 64 + elif blocks.n_missing_world <= 4: + n_procs = 32 + elif blocks.n_missing_world <= 8: + n_procs = 16 + else: + n_procs = 8 + + # Process documents in parallel. + with ProcessPoolExecutor(max_workers=n_procs) as executor: + for block_idx, block in enumerate(active_blocks): + + if block is not None: + + # Build block DB. + chunk_db_valid, chunk_db_invalid, doc_offsets = build_block_db( + config=config, + dataset_idx=dataset_idx, + n_datasets=n_datasets, + indexed_dataset=indexed_dataset, + n_procs=n_procs, + executor=executor, + n_missing_blocks=len(active_blocks), + block_idx=block_idx, + block=block, + ) + + if config.retro_task_validate is None: + # Save block DB. + save_block_db( + block=block, + chunk_db_valid=chunk_db_valid, + chunk_db_invalid=chunk_db_invalid, + doc_offsets=doc_offsets, + ) + + else: + + # Load existing block DB. + with h5py.File(block["path"]) as f: + existing_chunks_valid = np.copy(f["chunks_valid"]) + existing_chunks_invalid = np.copy(f["chunks_invalid"]) + existing_doc_offsets = np.copy(f["doc_offsets"]) + + # Check equality. + log_retro_rank_0(" > validate.") + assert np.array_equal(existing_chunks_valid, chunk_db_valid) + assert np.array_equal(existing_chunks_invalid, chunk_db_invalid) + assert np.array_equal(existing_doc_offsets, doc_offsets) + + # Wait for all ranks to finish block. + log_retro_rank_0(" > waiting for all ranks to finish block.") + torch.distributed.barrier() + + log_retro_rank_0(" > finished saving individual db.") + + +def build_individual_dbs( + config: RetroPreprocessingConfig, indexed_dataset_infos: List[Dict], +) -> None: + """Iterate each indexed dataset & process its chunks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset. + """ + + # Build individual DBs. + log_retro_rank_0(" > build individual chunk dbs.") + for ds_idx, ds_info in enumerate(indexed_dataset_infos): + + # Progress. + log_retro_rank_0( + " > building individual db, dataset %d / %d ... '%s'." + % (ds_idx, len(indexed_dataset_infos), ds_info["prefix"],) + ) + + # Process single dataset. + build_individual_db(config, ds_idx, len(indexed_dataset_infos), ds_info) + + +def update_chunk_counts( + config: RetroPreprocessingConfig, indexed_dataset_infos: List[Dict] +) -> None: + """Set n_chunks_train & n_chunks sampled for each individual DB. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset (i.e., 'prefix', 'ratio', 'n_chunks', etc.). + """ + + if torch.distributed.get_rank() != 0: + return + + # Data ratio sum (for setting index training chunks). + data_ratio_sum = sum([d["ratio"] for d in indexed_dataset_infos]) + + # Training split size (split at document level). + train_fraction = float(extract_data_config(config).split.split(",")[0]) / 100 + assert train_fraction > 0 and train_fraction <= 1 + + # Set n_chunks (including n_chunks_sampled for unambiguity). + log_retro_rank_0(" > compute n_chunks.") + for ds_index, ds_info in enumerate(indexed_dataset_infos): + + db_paths = get_individual_db_paths(config.retro_project_dir, ds_info["prefix"]) + + # Update counts. + ds_info["n_docs"] = len(ds_info["dataset"].document_indices) - 1 + ds_info["n_docs_train"] = int(train_fraction * ds_info["n_docs"]) + ds_info["n_chunks"] = 0 # previously, 'n_chunks_valid' + ds_info["n_chunks_train"] = 0 + ds_info["n_chunks_invalid"] = 0 + for db_path in tqdm( + db_paths, "%d/%d, %s" % (ds_index, len(indexed_dataset_infos), ds_info["prefix"]) + ): + with h5py.File(db_path, "r") as f: + ds_info["n_chunks"] += len(f["chunks_valid"]) + ds_info["n_chunks_invalid"] += len(f["chunks_invalid"]) + ds_info["n_chunks_train"] += ( + (np.copy(f["chunks_valid"][:, 0]) < ds_info["n_docs_train"]).sum().item() + ) + + ds_info["n_chunks_sampled"] = int( + config.retro_index_ntrain * ds_info["ratio"] / data_ratio_sum + ) + + # Verify counts. + assert ds_info["n_chunks_train"] <= ds_info["n_chunks"], "n_train (%d) > n_total (%d)." % ( + ds_info["n_chunks_train"], + ds_info["n_chunks"], + ) + assert ds_info["n_chunks_sampled"] <= ds_info["n_chunks_train"], ( + "n_sampled (%d) > n_train (%d)." + % (ds_info["n_chunks_sampled"], ds_info["n_chunks_train"]) + ) + + +def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str) -> None: + """Merge individual DBs into single DB. + + Args: + project_dir (str): Retro project dir. + indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset (i.e., 'prefix', 'ratio', 'n_chunks', etc.). + db_type (str): DB type (e.g., 'sampled', 'train', or 'valid'). + """ + + if torch.distributed.get_rank() != 0: + return + + log_retro_rank_0(" > build %s chunk db." % db_type) + + # Count chunks. + if db_type == "sampled": + n_chunks_key = "n_chunks_sampled" + n_docs_key = None + elif db_type == "train": + n_chunks_key = "n_chunks_train" + n_docs_key = "n_docs_train" + elif db_type == "valid": + n_docs_key = None + else: + raise Exception("handle db_type '%s'." % db_type) + + if db_type == "valid": + n_chunks = sum(m["n_chunks"] - m["n_chunks_train"] for m in indexed_dataset_infos) + else: + n_chunks = sum(m[n_chunks_key] for m in indexed_dataset_infos) + n_docs = None if n_docs_key is None else sum(m[n_docs_key] for m in indexed_dataset_infos) + + # DB path. + db_path = get_merged_db_path_map(project_dir)[db_type] + + # Delete existing chunk db if incorrect size. + if os.path.exists(db_path): + + try: + + f = h5py.File(db_path) + n_alloc = len(f["chunks"]) # total allocated + n_written = f["n_written"][0].item() # total written + f.close() + + if n_chunks != n_alloc or n_chunks != n_written: + os.remove(db_path) + + except Exception as e: + if isinstance(e, OSError): + os.remove(db_path) + elif isinstance(e, KeyError): + f.close() + os.remove(db_path) + else: + raise e + + # Build merged chunk db. + if not os.path.exists(db_path): + + os.makedirs(os.path.dirname(db_path), exist_ok=True) + f = h5py.File(db_path, "w") + + # Initialize output arrays. + merged_chunk_db: np.ndarray = f.create_dataset("chunks", (n_chunks, 5), dtype="uint32") + merged_doc_offsets: np.ndarray = ( + None + if n_docs_key is None + else f.create_dataset("doc_offsets", (n_docs, 3), dtype="uint64") + ) + n_written = f.create_dataset("n_written", (1,), dtype="uint64") + n_written[0] = 0 + + # Iterate indexed datasets & collect chunks. + chunk_start_index = 0 + doc_start_index = 0 + doc_start_offset = 0 + for ds_idx, ds_info in enumerate(indexed_dataset_infos): + log_retro_rank_0( + " > merging dbs; '%s', dataset %d / %d ... '%s'." + % (db_type, ds_idx, len(indexed_dataset_infos), ds_info["prefix"]), + ) + individual_chunk_db: np.ndarray = get_individual_chunk_db(project_dir, ds_idx, ds_info) + individual_doc_offsets: np.ndarray = ( + None + if n_docs_key is None + else get_individual_doc_offsets(project_dir, ds_idx, ds_info) + ) + + if db_type == "valid": + individual_chunk_db = individual_chunk_db[ds_info["n_chunks_train"] :] + if n_docs_key is None: + individual_doc_offsets = None + else: + train_doc_offset = individual_doc_offsets[ds_info["n_docs_train"] - 1, 2] + individual_doc_offsets = np.copy( + individual_doc_offsets[ds_info["n_docs_train"] :] + ) + individual_doc_offsets[:, 2] -= train_doc_offset + + log_retro_rank_0("~~~") + log_retro_rank_0(individual_doc_offsets) + log_retro_rank_0(train_doc_offset) + raise Exception("test me.") + else: + individual_chunk_db = individual_chunk_db[: ds_info[n_chunks_key]] + individual_doc_offsets = ( + None + if n_docs_key is None + else np.copy(individual_doc_offsets[: ds_info[n_docs_key]]) + ) + + merged_chunk_db[ + chunk_start_index : chunk_start_index + len(individual_chunk_db) + ] = individual_chunk_db + chunk_start_index += len(individual_chunk_db) + n_written[0] = chunk_start_index + if n_docs_key is not None: + individual_doc_offsets[:, 2] += doc_start_offset + doc_end_index = doc_start_index + individual_doc_offsets.shape[0] + merged_doc_offsets[doc_start_index:doc_end_index] = individual_doc_offsets + doc_start_index = doc_end_index + doc_start_offset = individual_doc_offsets[-1, 2].item() + + f.close() + + +def build_merged_dbs(project_dir: str, indexed_dataset_infos: List[Dict]) -> None: + """Merge individual dataset components into single database. + + This method merges databases for DB types: + - 'sampled': used for training the vector index. + - 'train': used for adding to the trained vector index. + - 'valid': can be used for validating/testing the vector index. + + Args: + project_dir (str): Retro project dir. + indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset (i.e., 'prefix', 'ratio', 'n_chunks', etc.). + """ + merge_dbs(project_dir, indexed_dataset_infos, "sampled") + merge_dbs(project_dir, indexed_dataset_infos, "train") + merge_dbs(project_dir, indexed_dataset_infos, "valid") + + +def build_db(config: RetroPreprocessingConfig) -> None: + """Extract token chunks from each indexed dataset. + + Iterate each document of each indexed dataset, extract that document's chunks, and save to a 'DB' (hdf5 file). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + project_dir = config.retro_project_dir + + # Indexed dataset info. + if config.retro_task_validate is None: + indexed_dataset_infos = init_indexed_dataset_infos(config) + else: + indexed_dataset_infos = get_indexed_dataset_infos(config.retro_project_dir) + # Build individual dbs. + build_individual_dbs(config, indexed_dataset_infos) + + # If validating, return here. + if config.retro_task_validate is not None: + return + + # Single-process going forward. + if torch.distributed.get_rank() != 0: + return + + # Update n_chunks & save indexed dataset infos. + if not os.path.exists(get_indexed_dataset_infos_path(project_dir)): + update_chunk_counts(config, indexed_dataset_infos) + save_indexed_dataset_infos(project_dir, indexed_dataset_infos) + indexed_dataset_infos = get_indexed_dataset_infos(project_dir) + + # Builded merged dbs. + build_merged_dbs(project_dir, indexed_dataset_infos) diff --git a/megatron/core/datasets/retro/db/dataset.py b/megatron/core/datasets/retro/db/dataset.py new file mode 100644 index 0000000..1de6e02 --- /dev/null +++ b/megatron/core/datasets/retro/db/dataset.py @@ -0,0 +1,108 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""A DBDataset is for iterating the chunks of the chunk database. + +This dataset is used for both training a vector index, and adding vectors to a +trained index. +""" + +from typing import List + +import numpy as np +import torch +from tqdm import tqdm + +from megatron.core.datasets.indexed_dataset import IndexedDataset + + +class DBDataset(torch.utils.data.Dataset): + """Dataset for iterating chunks. + + Args: + db_path (str): Path of HDF5-format chunk database. + indexed_datasets (List[IndexedDataset]): Indexed datasets used to build database. + chunks (np.ndarray): Array of chunk indexes, for indexing into indexed datasets. Format [dataset_idx, doc_id, start_idx, end_idx, bert_length]. + chunk_length (int): Max GPT chunk length (e.g., 64). + eod_token_id (int): EOD token ID. + """ + + def __init__( + self, + db_path: str, + indexed_datasets: List[IndexedDataset], + chunks: np.ndarray, + chunk_length: int, + eod_token_id: int, + ): + + assert chunks.shape[1] == 5, ( + "expected 5 columns (dataset_idx, " + "doc_idx, token_start_idx, token_end_idx, bert_chunk_length); " + "found %d columns." % chunks.shape[1] + ) + + self.db_path = db_path + self.indexed_datasets = indexed_datasets + self.chunks = chunks + self.doc_chunk_map = None + + self.max_chunk_length = chunk_length + self.eod_token_id = eod_token_id + + def __len__(self) -> int: + """Length of DB dataset. + + Returns: + Number of chunks contained in the dataset. + """ + return self.chunks.shape[0] + + def __getitem__(self, chunk_id: int) -> dict: + """DB dataset sample. + + Args: + chunk_id (int): Index of chunk within dataset. + + Returns: + A dict containing: + - 'doc_id': Document index within indexed dataset. + - 'text': GPT token IDs. + """ + + # Chunk start/end indexes. + indexed_dataset_id, doc_id, token_start_idx, token_end_idx, _ = [ + value.item() for value in self.chunks[chunk_id] + ] + chunk_length = token_end_idx - token_start_idx + indexed_dataset = self.indexed_datasets[indexed_dataset_id] + + # Chunk token ids. + token_ids = indexed_dataset.get(doc_id, offset=token_start_idx, length=chunk_length) + + # Extend chunks to max_chunk_length by padding with EOD tokens. + if chunk_length != self.max_chunk_length: + assert chunk_length < self.max_chunk_length, "invalid chunk len." + token_ids = token_ids.tolist() + token_ids += [self.eod_token_id] * (self.max_chunk_length - chunk_length) + + return { + "doc_id": doc_id, + "text": np.array(token_ids, dtype=np.int64), + } + + def load_doc_tuples(self) -> None: + """Load the dataset & document ids. + + Load the dataset id & document id of each chunk in the database, to + be used for causality filtering during querying. + """ + self.doc_tuples = np.zeros(shape=(len(self), 2), dtype="uint32") + block_size = int(1e6) + for start_idx in tqdm( + range(0, len(self), block_size), + "load doc tuples", + miniters=(len(self) // block_size) // 10, + disable=torch.distributed.get_rank() != 0, + ): + end_idx = min(len(self), start_idx + block_size) + self.doc_tuples[start_idx:end_idx] = self.chunks[start_idx:end_idx, :2] diff --git a/megatron/core/datasets/retro/db/utils.py b/megatron/core/datasets/retro/db/utils.py new file mode 100644 index 0000000..df13089 --- /dev/null +++ b/megatron/core/datasets/retro/db/utils.py @@ -0,0 +1,369 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for building a chunk database.""" + +import glob +import json +import os +from typing import Dict, List, Optional + +import numpy as np + +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.external_libs import h5py +from megatron.core.models.retro.utils import get_gpt_data_dir + +from .dataset import DBDataset + + +def get_db_dir(project_dir: str) -> str: + """Sub-directory for DB data. + + Args: + project_dir (str): Path to Retro project dir. + + Returns: + Path of the DB sub-directory within the project. + """ + return os.path.join(project_dir, "db") + + +def init_indexed_dataset_infos(config: RetroPreprocessingConfig) -> List[Dict]: + """Gather meta-info about each indexed dataset. + + The returned info array allows for easy access to the configuration, and + helps remove ambiguity. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + List of processing metadata for each dataset, including: + - ratio: Data split weight. + - prefix: Relative path to dataset under DB sub-directory. + """ + + data_dir = get_gpt_data_dir(config.retro_project_dir) + data_blend: List[str] = config.retro_gpt_data_path + assert len(data_blend) % 2 == 0, "currently, only blended dataset is supported." + + # Dataset infos. + infos = [] + for i in range(0, len(data_blend), 2): + ratio = float(data_blend[i]) + prefix = data_blend[i + 1] + path = os.path.join(data_dir, prefix + ".bin") + assert os.path.exists(path), "couldn't find '%s'." % path + infos.append( + {"ratio": ratio, "prefix": prefix,} + ) + + # Load indexed datasets. + load_indexed_datasets(config.retro_project_dir, infos) + + return infos + + +def get_indexed_dataset_infos_path(project_dir: str) -> str: + """Path to indexed dataset meta-infos. + + Args: + project_dir (str): Path to Retro project dir. + + Returns: + Path to the `indexed_dataset_infos.json` file. + """ + return os.path.join(get_db_dir(project_dir), "indexed_dataset_infos.json") + + +def save_indexed_dataset_infos(project_dir: str, indexed_dataset_infos: List[Dict]) -> None: + """Save dataset order & meta-info. + + Args: + project_dir (str): Path to Retro project dir. + indexed_dataset_infos (List[Dict]): List of metadata for each dataset, with each entry containing: + + - ratio: Data split weight. + - prefix: Relative path to dataset under DB sub-directory. + - n_docs: Number of documents. + - n_docs_train: Number of documents used for pretraining. + - n_chunks: Number of valid chunks. + - n_chunks_train: Number of valid chunks used for pretraining. + - n_chunks_invalid: Number of invalid chunks. + - n_chunks_sampled: Number of valid chunks used for vector index training. + """ + + # Remove 'dataset' field. + clean_infos = [] + for info in indexed_dataset_infos: + info = dict(info) + del info["dataset"] + clean_infos.append(info) + + # Save. + with open(get_indexed_dataset_infos_path(project_dir), "w") as f: + json.dump(clean_infos, f, indent=4) + + +def load_indexed_datasets(project_dir: str, indexed_dataset_infos: List[Dict]) -> None: + """Loaded indexed datasets into memory-mapped datasets. + + Args: + project_dir (str): Path to Retro project dir. + indexed_dataset_infos (List[Dict]): List of metadata for each dataset (see `save_indexed_dataset_infos()` for more details. + """ + data_dir = get_gpt_data_dir(project_dir) + for info in indexed_dataset_infos: + info["dataset"] = IndexedDataset(os.path.join(data_dir, info["prefix"]), mmap=True) + + +def get_indexed_dataset_infos(project_dir: str) -> List[Dict]: + """Load indexed dataset meta-infos. + + Args: + project_dir (str): Path to Retro project dir. + + Returns: + List of metadata for each dataset (see `save_indexed_dataset_infos()` for more details. + """ + + # Load json. + path = get_indexed_dataset_infos_path(project_dir) + with open(path) as f: + infos = json.load(f) + + # Load indexed datasets. + load_indexed_datasets(project_dir, infos) + + return infos + + +def get_individual_db_dir(project_dir: str, prefix: str) -> str: + """Individual DB's directory. + + Args: + project_dir (str): Path to Retro project dir. + prefix (str): Unique relative path to dataset within project dir. + + Returns: + Path to the given datasets's chunk database. + """ + return os.path.join(get_db_dir(project_dir), "individual", prefix) + + +def get_individual_db_paths(project_dir: str, prefix: str) -> List[str]: + """Get paths of all database blocks of an individual dataset. + + Args: + project_dir (str): Path to Retro project dir. + prefix (str): Unique relative path to dataset within project dir. + + Returns: + Paths to each HDF5 chunk database files that comprises this datasets full chunk database. + """ + return sorted(glob.glob(get_individual_db_dir(project_dir, prefix) + "/*hdf5")) + + +def get_individual_chunk_db(project_dir: str, ds_id: int, ds_info: dict) -> np.ndarray: + """Load individual dataset's chunk DB. + + Args: + project_dir (str): Path to Retro project dir. + ds_id (int): Index of dataset within blended dataset. + ds_info (dict): Preprocessing metadata for dataset (see `save_indexed_dataset_infos()` for more detail). + + Returns: + Array of chunk start/end indexes for this dataset, where the chunk indexes can be used for indexing into the corresponding indexed dataset. + """ + paths = get_individual_db_paths(project_dir, ds_info["prefix"]) + # *Note*: convert to dataset, rather than copying to memory. + db = np.zeros((ds_info["n_chunks"], 5), dtype="uint32") + db[:, 0] = ds_id + start_idx = 0 + for path in paths: + f = h5py.File(path, "r") + n_chunks_current = f["chunks_valid"].shape[0] + db[start_idx : (start_idx + n_chunks_current), 1:] = f["chunks_valid"] + start_idx += n_chunks_current + f.close() + + assert start_idx == ds_info["n_chunks"] + + return db + + +def get_individual_doc_offsets(project_dir: str, ds_id: int, ds_info: dict) -> np.ndarray: + """Load individual dataset's document offsets. + + Args: + project_dir (str): Path to Retro project dir. + ds_id (int): Index of dataset within blended dataset. + ds_info (dict): Preprocessing metadata for dataset (see `save_indexed_dataset_infos()` for more detail). + + Returns: + Array of document offsets by chunk index for this dataset. + """ + paths = get_individual_db_paths(project_dir, ds_info["prefix"]) + # *Note*: convert to dataset, rather than copying to memory. + doc_offsets = np.zeros((ds_info["n_docs"], 3), dtype="uint64") + doc_offsets[:, 0] = ds_id + start_idx = 0 + start_offset = 0 + for path in paths: + with h5py.File(path) as f: + current_doc_offsets = np.copy(f["doc_offsets"]) + current_doc_offsets[:, 1] += start_offset + current_ndocs = current_doc_offsets.shape[0] + doc_offsets[start_idx : (start_idx + current_ndocs), 1:] = current_doc_offsets + start_idx += current_ndocs + start_offset = current_doc_offsets[-1, 1].item() + + return doc_offsets + + +def get_merged_db_path_map(project_dir: str) -> dict: + """Paths to merged datasets. + + Args: + project_dir (str): Path to Retro project dir. + + Returns: + A dict of chunk databases, one for each of: + - sampled: Chunks used for training the vector index. + - train: Chunks used for pretraining 'train' dataset. + - valid: Chunks used for pretraining 'valid' dataset. + """ + base_dir = get_db_dir(project_dir) + return { + "sampled": os.path.join(base_dir, "merged", "sampled.hdf5"), + "train": os.path.join(base_dir, "merged", "train.hdf5"), + "valid": os.path.join(base_dir, "merged", "valid.hdf5"), + } + + +def get_merged_dataset( + project_dir: str, + chunk_length: int, + eod_token_id: int, + db_type: str, + indexed_dataset_infos: Optional[List[Dict]] = None, +) -> DBDataset: + """Get merged dataset. + + Args: + project_dir (str): Path to Retro project dir. + chunk_length (int): GPT chunk length (e.g., 64). + eod_token_id (int): EOD token ID. + db_type (str): DB type (e.g., 'sampled', 'train', or 'valid'). + indexed_dataset_infos (Optional[List[Dict]]): Optionally, pre-loaded list of dataset metadata (see `save_indexed_dataset_infos()` for more detail). If not provided, the indexed dataset infos will be loaded from disk. + + Returns: + A DBDataset, which is a dataset that wraps the HDF5 chunk index array. + """ + + if not indexed_dataset_infos: + indexed_dataset_infos = get_indexed_dataset_infos(project_dir) + + # Load chunks. + db_path = get_merged_db_path_map(project_dir)[db_type] + f = h5py.File(db_path, "r") + chunks = f["chunks"] + + # DB dataset. + indexed_datasets = [info["dataset"] for info in indexed_dataset_infos] + dataset = DBDataset( + db_path=db_path, + indexed_datasets=indexed_datasets, + chunks=chunks, + chunk_length=chunk_length, + eod_token_id=eod_token_id, + ) + + return dataset + + +def get_merged_sampled_dataset( + project_dir: str, + chunk_length: int, + eod_token_id: int, + indexed_dataset_infos: Optional[List[Dict]] = None, +) -> DBDataset: + """Get sampled dataset (for training the vector index). + + Args: + project_dir (str): Path to Retro project dir. + chunk_length (int): GPT chunk length (e.g., 64). + eod_token_id (int): EOD token ID. + indexed_dataset_infos (Optional[List[Dict]]): Optionally, pre-loaded list of dataset metadata (see `save_indexed_dataset_infos()` for more detail). If not provided, the indexed dataset infos will be loaded from disk. + + Returns: + A DBDataset, which is a dataset that wraps the HDF5 chunk index array. + """ + return get_merged_dataset( + project_dir, chunk_length, eod_token_id, "sampled", indexed_dataset_infos + ) + + +def get_merged_train_dataset( + project_dir: str, + chunk_length: int, + eod_token_id: int, + indexed_dataset_infos: Optional[List[Dict]] = None, +) -> DBDataset: + """Get training dataset (for adding to the vector index). + + Args: + project_dir (str): Path to Retro project dir. + chunk_length (int): GPT chunk length (e.g., 64). + eod_token_id (int): EOD token ID. + indexed_dataset_infos (Optional[List[Dict]]): Optionally, pre-loaded list of dataset metadata (see `save_indexed_dataset_infos()` for more detail). If not provided, the indexed dataset infos will be loaded from disk. + + Returns: + A DBDataset, which is a dataset that wraps the HDF5 chunk index array. + """ + return get_merged_dataset( + project_dir, chunk_length, eod_token_id, "train", indexed_dataset_infos + ) + + +def get_merged_valid_dataset( + project_dir: str, + chunk_length: int, + eod_token_id: int, + indexed_dataset_infos: Optional[List[Dict]] = None, +) -> DBDataset: + """Get validation dataset (for testing the vector index). + + Args: + project_dir (str): Path to Retro project dir. + chunk_length (int): GPT chunk length (e.g., 64). + eod_token_id (int): EOD token ID. + indexed_dataset_infos (Optional[List[Dict]]): Optionally, pre-loaded list of dataset metadata (see `save_indexed_dataset_infos()` for more detail). If not provided, the indexed dataset infos will be loaded from disk. + + Returns: + A DBDataset, which is a dataset that wraps the HDF5 chunk index array. + """ + return get_merged_dataset( + project_dir, chunk_length, eod_token_id, "valid", indexed_dataset_infos + ) + + +def get_merged_datasets(project_dir: str, chunk_length: int, eod_token_id: int) -> dict: + """Get all merged datasets. + + Args: + project_dir (str): Path to Retro project dir. + chunk_length (int): GPT chunk length (e.g., 64). + eod_token_id (int): EOD token ID. + + Returns: + A dict mapping DB type ('sampled', 'train', or 'valid') to the corresponding DBDataset, which is a dataset that wraps the HDF5 chunk index array. + """ + fns = { + "sampled": get_merged_sampled_dataset, + "train": get_merged_train_dataset, + "valid": get_merged_valid_dataset, + } + datasets = {key: fn(project_dir, chunk_length, eod_token_id) for key, fn in fns.items()} + return datasets diff --git a/megatron/core/datasets/retro/external_libs.py b/megatron/core/datasets/retro/external_libs.py new file mode 100644 index 0000000..98b2872 --- /dev/null +++ b/megatron/core/datasets/retro/external_libs.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Required external libraries for Retro preprocessing.""" + +import importlib + +required_libs = [ + "faiss", + "h5py", + "transformers", # for huggingface bert +] + +for lib in required_libs: + try: + globals()[lib] = importlib.import_module(lib) + except ImportError as e: + raise Exception( + f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'." + ) diff --git a/megatron/core/datasets/retro/index/__init__.py b/megatron/core/datasets/retro/index/__init__.py new file mode 100644 index 0000000..d069f55 --- /dev/null +++ b/megatron/core/datasets/retro/index/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Exports: + + - train_index: Train an index on representative vectors. + - add_to_index: Add vectors to a trained index. + - build_index: Wrapper function that calls above two functions. +""" + +from .build import add_to_index, build_index, train_index diff --git a/megatron/core/datasets/retro/index/build.py b/megatron/core/datasets/retro/index/build.py new file mode 100644 index 0000000..a5659e9 --- /dev/null +++ b/megatron/core/datasets/retro/index/build.py @@ -0,0 +1,313 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Construct an index. + +Constructing an index generally happens in two phases: + + - index.train(): Train an index on a representative set of vectors. + - index.add(): Add vectors to an index, to be available for retrieval. +""" + +import os +import shutil + +import numpy as np +import torch +from tqdm import tqdm + +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.db.utils import ( + get_merged_sampled_dataset, + get_merged_train_dataset, +) +from megatron.core.datasets.retro.external_libs import h5py +from megatron.core.datasets.retro.utils import GPTToTextDataset + +from .factory import IndexFactory +from .utils import ( + get_training_data_block_dir, + get_training_data_block_paths, + get_training_data_merged_path, + get_training_data_root_dir, +) + +################################################## +# Train index. +################################################## + + +def get_empty_index_path(config: RetroPreprocessingConfig) -> str: + """Path of empty index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to the empty (trained, but without added samples) vector index. + """ + index = IndexFactory.get_index(config.retro_index_type) + empty_index_path = index.get_empty_index_path(config) + return empty_index_path + + +def get_block_nload(block_path: str, load_fraction: float) -> int: + """Compute number of blocks to load. + + This is computed by multiplying the total number of available blocks with the + fraction of blocks to load. + + Args: + block_path (str): Path to HDF5 file containing block of data. File must contain key 'data'. + load_fraction (float): Fraction (0 < load_fraction <= 1) of block samples to load. + + Returns: + Number of block samples to load. + """ + with h5py.File(block_path) as fi: + return int(load_fraction * fi["data"].shape[0]) + + +def merge_embedding_blocks(config: RetroPreprocessingConfig) -> None: + """Merge individual embedding blocks into a single binary mmap file. + + The embeddings are initially stored in block-sized (e.g., ~100k embeddings per + block) HDF5 files. These individual block files must be merged into a single + file before training, to be based as a numpy mmap array to the index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + if torch.distributed.get_rank() != 0: + return + + # Get block, merged paths. + load_fraction = config.retro_index_train_load_fraction + block_paths = get_training_data_block_paths(config) + bin_path = get_training_data_merged_path(config) + + # Skip, if already built. + if os.path.exists(bin_path): + return + + # Merge blocks. + with open(bin_path, "wb") as fo: + byte_offset = 0 + for block_idx, block_path in enumerate( + tqdm( + block_paths, + "merge train embeddings", + miniters=len(block_paths) // 10, + disable=torch.distributed.get_rank() != 0, + ) + ): + with h5py.File(block_path) as fi: + + nload = get_block_nload(block_path, load_fraction) + block = np.array(fi["data"][:nload], copy=False) + + fo.write(block.tobytes()) + + byte_offset += block.size * block.itemsize + fo.seek(byte_offset) + + +def get_text_dataset_for_training(config: RetroPreprocessingConfig) -> GPTToTextDataset: + """Convert GPT token chunk dataset to a text dataset for passing to the + embedder. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + The text dataset consisting of tokens converted from sampled chunk database. + """ + gpt_dataset = get_merged_sampled_dataset( + project_dir=config.retro_project_dir, + chunk_length=config.retro_gpt_chunk_length, + eod_token_id=config.retro_tokenizers.gpt.eod, + ) + text_dataset = GPTToTextDataset(gpt_dataset, config.retro_tokenizers.gpt) + return text_dataset + + +def embed_training_chunks(config: RetroPreprocessingConfig) -> None: + """Embed DB chunks. + + Store chunks in blocks on disk. These blocks will later be merged into + a single dataset for training the index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + merged_train_data_path = get_training_data_merged_path(config) + if os.path.exists(merged_train_data_path): + return + + # Get training text dataset. + text_dataset = get_text_dataset_for_training(config) + + # Embed dataset. + embedder = config.retro_bert_embedders.disk + embedder.embed_text_dataset("index", get_training_data_block_dir(config), text_dataset) + + # Merge embeddings. + merge_embedding_blocks(config) + + +def train_on_embeddings(config: RetroPreprocessingConfig) -> None: + """Train index on embedded DB chunks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + index = IndexFactory.get_index(config.retro_index_type) + index.train(config) + + +def remove_embeddings(config: RetroPreprocessingConfig) -> None: + """Remove embeddings after training. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + torch.distributed.barrier() + if torch.distributed.get_rank() != 0: + return + empty_index_path = get_empty_index_path(config) + assert os.path.isfile(empty_index_path) + shutil.rmtree(get_training_data_root_dir(config), ignore_errors=True) + + +def _train_index(config: RetroPreprocessingConfig) -> None: + """Train index on DB chunks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Check if trained index already exists. + if not os.path.isfile(get_empty_index_path(config)): + + # Embed training chunks. + embed_training_chunks(config) + + # Train index on embeddings. + train_on_embeddings(config) + + # Wait for (single-process) training to complete. + torch.distributed.barrier() + + # Remove embeddings. + if config.retro_index_delete_training_embeddings: + remove_embeddings(config) + + +def train_index(config: RetroPreprocessingConfig) -> None: + """Entry point for training the index. + + We select whether to train a new index, or validate an existing index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Train new index. + if config.retro_task_validate is None: + _train_index(config) + + # Validate existing trained index. + else: + from .validate import validate_training_embeddings + + validate_training_embeddings(config) + + +################################################## +# Add to index. +################################################## + + +def get_text_dataset_for_adding(config: RetroPreprocessingConfig) -> GPTToTextDataset: + """Convert GPT token chunk dataset to a text dataset for passing to the + embedder. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + The text dataset that consists of tokens converted from the 'train' chunk database. These are the chunks used for retrieval by the pretraining 'train' dataset. + """ + gpt_dataset = get_merged_train_dataset( + project_dir=config.retro_project_dir, + chunk_length=config.retro_gpt_chunk_length, + eod_token_id=config.retro_tokenizers.gpt.eod, + ) + text_dataset = GPTToTextDataset(gpt_dataset, config.retro_tokenizers.gpt) + return text_dataset + + +def _add_to_index(config: RetroPreprocessingConfig) -> str: + """Add DB chunks to index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to the populated index. + """ + + # Get index. + index = IndexFactory.get_index(config.retro_index_type) + + # Get text dataset. + text_dataset = get_text_dataset_for_adding(config) + + # Add to index. + output_index_path = index.add(config, text_dataset) + + return output_index_path + + +def add_to_index(config: RetroPreprocessingConfig) -> None: + """Entry point for adding to the index. + + We select whether to add to a new index, or validate an existing index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Add to new index. + if config.retro_task_validate is None: + _add_to_index(config) + + # Validate existing encodings. + else: + from .validate import validate_added_encodings + + validate_added_encodings(config) + + +################################################## +# Build index (train + add). +################################################## + + +def build_index(config: RetroPreprocessingConfig) -> None: + """Build index. + + Building index involves sequentially running stages above: + - Train index (on sampled training chunks). + - Add to index (on all training chunks). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Train index. + train_index(config) + + # Add to index. + add_to_index(config) diff --git a/megatron/core/datasets/retro/index/factory.py b/megatron/core/datasets/retro/index/factory.py new file mode 100644 index 0000000..293d58c --- /dev/null +++ b/megatron/core/datasets/retro/index/factory.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""The IndexFactory constructs an index from an index type string.""" + +from megatron.core.datasets.retro.index.index import Index + +from .indexes import FaissBaseIndex, FaissParallelAddIndex + + +class IndexFactory: + """Get index. + + Index type generally read from argument '--retro-index-ty'. + """ + + @classmethod + def get_index_class(cls, index_type: str) -> type: + """Get an index class, given a type string. + + Args: + index_type (str): One of 'faiss-base' (naive Faiss index wrapper) or 'faiss-par-add' (Faiss index wrapper with near embarrassingly parallel index.add(). + + Returns: + An `Index` sub-type corresponding to the `index_type`. + """ + return {"faiss-base": FaissBaseIndex, "faiss-par-add": FaissParallelAddIndex,}[index_type] + + @classmethod + def get_index(cls, index_type: str) -> Index: + """Construct an index from an index type string. + + Args: + index_type (str): One of 'faiss-base' (naive Faiss index wrapper) or 'faiss-par-add' (Faiss index wrapper with near embarrassingly parallel index.add(). + + Returns: + An `Index` instance corresponding to the `index_type`. + """ + index_class = cls.get_index_class(index_type) + index = index_class() + return index diff --git a/megatron/core/datasets/retro/index/index.py b/megatron/core/datasets/retro/index/index.py new file mode 100644 index 0000000..a8c086f --- /dev/null +++ b/megatron/core/datasets/retro/index/index.py @@ -0,0 +1,134 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Base class for all vector indexes. + +A vector index is a type of retrieval database that is queried using vectors, +and returns vectors that are 'similar' (e.g., by cosine distance) to the query +vector. The construction and usage of an index generally has the following +pattern: + + - Train the index on representative vectors. + - Add vectors to the index (i.e., vectors available for retrieval) + - Query index with new vector, to retrieve similar vector indexes. +""" + +import abc +import os +from typing import List, Tuple + +import numpy as np +import torch + +from megatron.core.datasets.retro.config import Embedder, RetroPreprocessingConfig +from megatron.core.datasets.retro.external_libs import faiss +from megatron.core.datasets.retro.utils import GPTToTextDataset + +from .utils import get_index_dir + + +class Index(abc.ABC): + + """Abstract base class for indexes. + + *Note* : While currently only Faiss-based classes are implemented, in the + future, this class will be extended with other types of indexes that have + different performance-accuracy trade-offs. + + The primary methods to override are: + - train() : Train index on the sampled training chunks. + - add() : Add all training chunks to index. + """ + + @classmethod + def make_object_verbose(cls, index: faiss.Index, verbose: bool) -> None: + """Make index object verbose. + + Args: + index (faiss.Index): Faiss object to set verbose. + verbose (bool): Sets whether index should log status updates during training and adding. + """ + assert isinstance(verbose, bool) + faiss.ParameterSpace().set_index_parameter(index, "verbose", verbose) + + def get_empty_index_path(self, config: RetroPreprocessingConfig) -> str: + """Get file path to empty index (i.e., trained, but unpopulated). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + File path to empty index (i.e., this index has had index.train() called, but not yet index.add()). + """ + return os.path.join( + get_index_dir(config), "empty_%.3f.faissindex" % config.retro_index_train_load_fraction, + ) + + def get_empty_index(self, config: RetroPreprocessingConfig) -> faiss.Index: + """Get empty index (i.e., trained, but unpopulated). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Empty Faiss index, loaded from storage. + """ + return faiss.read_index(self.get_empty_index_path(config)) + + def get_added_index_path(self, config: RetroPreprocessingConfig) -> str: + """Get file path to index that has been populated with vectors. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + File path to added index (i.e., this index has had both index.train() and index.add() called). + """ + return os.path.join( + get_index_dir(config), + "added_%.3f_%.3f.faissindex" + % (config.retro_index_train_load_fraction, config.retro_index_add_load_fraction,), + ) + + def get_added_index(self, config: RetroPreprocessingConfig) -> faiss.Index: + """Get index that has been populated with vectors. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + 'Added' (i.e., populated) Faiss index, loaded from storage. + """ + return faiss.read_index(self.get_added_index_path(config)) + + @abc.abstractmethod + def train(self, config: RetroPreprocessingConfig) -> None: + """Train index on a representative set of vectors. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + @abc.abstractmethod + def add(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> None: + """Add vectors to index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + text_dataset (GPTToTextDataset): Text dataset that will be embedded and added to the index. + """ + + def embed_text_dataset_block( + self, embedder: Embedder, text_dataset: GPTToTextDataset, _range: Tuple[int, int] + ) -> np.ndarray: + """Embed a range of a text dataset. + + Args: + embedder (Embedder): Embedder used for embedding a text dataset. + text_dataset (GPTToTextDataset): Text dataset that will be embedded. + _range (Tuple[int, int]): Start/end sample indices within text dataset used for embedding. + + Returns: + An array of embeddings, with shape (len(text_dataset), dimension(embedder)). + """ + sub_dataset = torch.utils.data.Subset(text_dataset, range(*_range)) + return embedder.embed_text_dataset(sub_dataset) diff --git a/megatron/core/datasets/retro/index/indexes/__init__.py b/megatron/core/datasets/retro/index/indexes/__init__.py new file mode 100644 index 0000000..c445909 --- /dev/null +++ b/megatron/core/datasets/retro/index/indexes/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Exports: +- FaissBaseIndex: Unoptimized Faiss index wrapper +- FaissParallelAddIndex: Optimized index.add() for Faiss index. +""" + +from .faiss_base import FaissBaseIndex +from .faiss_par_add import FaissParallelAddIndex diff --git a/megatron/core/datasets/retro/index/indexes/faiss_base.py b/megatron/core/datasets/retro/index/indexes/faiss_base.py new file mode 100644 index 0000000..1ffc725 --- /dev/null +++ b/megatron/core/datasets/retro/index/indexes/faiss_base.py @@ -0,0 +1,150 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +This class implements a simple, un-optimized wrapper around a Faiss index, that +implements the Index interface (see ..index.py). While this class is +instantiable, it is meant to be extended with optimizations in classes that +inherit from this class (see FaissParAddIndex, for an example). +""" + +import os + +import numpy as np +import torch +from tqdm import tqdm + +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.external_libs import faiss +from megatron.core.datasets.retro.index.index import Index +from megatron.core.datasets.retro.index.utils import ( + get_training_data_merged_path, + num_samples_to_block_ranges, +) +from megatron.core.datasets.retro.utils import GPTToTextDataset, log_retro_rank_0 + + +class FaissBaseIndex(Index): + """Base class for Faiss-base indexes. + + This class wraps a Faiss index, and adds additional functionality for training + and adding codes. This base class performs a naive sequential code adding, + while the optimized FaissParallelAddIndex class performs a parallel + index.add(). + """ + + def _train(self, config: RetroPreprocessingConfig) -> None: + """Train index (rank 0's method). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + assert torch.distributed.get_rank() == 0 + + # Set num threads (torch.distributed reset it to 1). + faiss.omp_set_num_threads(64) + + empty_index_path = self.get_empty_index_path(config) + + # Index already exists? -> return. + if os.path.isfile(empty_index_path): + return + + # Load data. + merged_path = get_training_data_merged_path(config) + inp = np.memmap(merged_path, dtype="f4", mode="r",).reshape((-1, config.hidden_size)) + + # Init index. + index = faiss.index_factory(config.hidden_size, config.retro_index_str) + + # Move to GPU. + log_retro_rank_0("> move faiss index to gpu.") + index_ivf = faiss.extract_index_ivf(index) + clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(index_ivf.d)) + index_ivf.clustering_index = clustering_index + log_retro_rank_0("> finished moving to gpu.") + self.make_object_verbose(index, True) + self.make_object_verbose(index_ivf, True) + self.make_object_verbose(index_ivf.quantizer, True) + self.make_object_verbose(index_ivf.clustering_index, True) + + # Train index. + index.train(inp) + + # Save index. + faiss.write_index(index, empty_index_path) + + def train(self, config: RetroPreprocessingConfig) -> None: + """Train index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Single process only. + if torch.distributed.get_rank() == 0: + self._train(config) + + torch.distributed.barrier() + + def _add(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> None: + """Add to index (rank 0's method). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + text_dataset (GPTToTextDataset): Text dataset that will be embedded and added to the index. + """ + + assert torch.distributed.get_rank() == 0 + + dataset_sample_ranges = num_samples_to_block_ranges(len(text_dataset)) + + # Set num threads (torch.distributed reset it to 1). + faiss.omp_set_num_threads(64) + + # Bert embedder. + embedder = config.bert_embedders.mem + + # Empty/added index paths. + empty_index_path = self.get_empty_index_path() + added_index_path = self.get_added_index_path() + + # Skip adding, if index exists. + if os.path.isfile(added_index_path): + return + + # Read trained index. + index = faiss.read_index(empty_index_path) + + # Iterate data blocks & add. + for sample_range in tqdm(dataset_sample_ranges, "faiss_base.add"): + + # Embed text. + embeds = self.embed_text_dataset_block(embedder, text_dataset, sample_range) + + # Add to index. + index.add(embeds) + + # Write index. + faiss.write_index(index, added_index_path) + + def add(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> str: + """Add to index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + text_dataset (GPTToTextDataset): Text dataset that will be embedded and added to the index. + + Returns: + File path to the populated index. + """ + + # Single process only. + if torch.distributed.get_rank() == 0: + self._add(config, text_dataset) + + # Wait for rank 0. + torch.distributed.barrier() + + # Get output index path, for return. + return self.get_added_index_path(config) diff --git a/megatron/core/datasets/retro/index/indexes/faiss_par_add.py b/megatron/core/datasets/retro/index/indexes/faiss_par_add.py new file mode 100644 index 0000000..6d9d68f --- /dev/null +++ b/megatron/core/datasets/retro/index/indexes/faiss_par_add.py @@ -0,0 +1,208 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Multi-process & multi-node version of Faiss's index.add(). + +This class inherits from FaissBaseIndex, and optimizes the 'add()' method by +making it multi-node and multi-process, with bit-wise equivalence to +FaissBaseIndex. This allows 'add()' to scale out to very large datasets, since +the vast majority of the computational effort is embarrassingly parallel. +""" + +import os +import shutil +from typing import Tuple + +import numpy as np +import psutil +import torch +from tqdm import tqdm + +from megatron.core.datasets.retro.config import Embedder, RetroPreprocessingConfig +from megatron.core.datasets.retro.external_libs import faiss, h5py +from megatron.core.datasets.retro.index.utils import get_added_code_paths, get_added_codes_dir +from megatron.core.datasets.retro.utils import ( + GPTToTextDataset, + get_blocks_by_rank, + log_retro_rank_0, + retro_makedir, +) + +from .faiss_base import FaissBaseIndex + + +class FaissParallelAddIndex(FaissBaseIndex): + """ + This class parallelizes both 1) encoding vectors, and 2) adding codes to the + index. This class is more performant than naive use of Faiss, because most + of the computational work is in encoding the vectors, which is an + embarassingly parallel operation. + """ + + def encode_block( + self, index: faiss.Index, embedder: Embedder, text_dataset: GPTToTextDataset, block: dict + ) -> Tuple[np.ndarray, np.ndarray]: + """Encode sub-dataset block, to be later added to index. + + Encode the data subset, generally in blocks of 1M vectors each. For + each block, the empty/trained index is loaded, codes are computed + via index.sa_encode(), and the resulting codes are saved to disk. + + Args: + index (faiss.Index): Faiss index object. + embedder (Embedder): Embedder used to embed text dataset. + text_dataset (GPTToTextDataset): Text dataset to be embedded and encoded. + block (dict): Range information specifying start/end indices within text dataset. + + Returns: + A tuple of (embeddings, encodings) for the given block subset of the text dataset. + """ + + # Embed block. + embeddings = self.embed_text_dataset_block(embedder, text_dataset, block["range"],) + + # Encode block. + log_retro_rank_0("encode.") + codes = index.sa_encode(embeddings) + + # Return embeddings for validation purposes. + return embeddings, codes + + def save_block(self, config: RetroPreprocessingConfig, block: dict, codes: np.ndarray) -> None: + """Save block of codes to disk. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + block (dict): Range information specifying the start/end indices within the encoded text dataset. Here, the 'path' item is used for writing the encodings to storage. + codes (np.ndarray): Block of encodings to be saved to storage. + """ + # Save neighbors. + log_retro_rank_0("save codes.") + retro_makedir(config, os.path.dirname(block["path"])) + with h5py.File(block["path"], "w") as f: + f.create_dataset("data", data=codes) + + def encode(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> None: + """Encode text dataset, to be later added to index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + text_dataset (GPTToTextDataset): Text dataset to be encoded by the index. + """ + + codes_dir = get_added_codes_dir(config) + retro_makedir(config, codes_dir) + + # Index. + index = self.get_empty_index(config) + + # Bert embedder. + embedder = config.retro_bert_embedders.mem + + # Missing code blocks. + def validate(f: h5py.File) -> None: + """Validation method for validating loaded encodings. + + Args: + f (h5py.File): File that contains encodings. + """ + assert len(f["data"].shape) == 2 + + blocks = get_blocks_by_rank( + codes_dir, len(text_dataset), config.retro_block_size, validate=validate, + ) + + # Encode each block. + for block_index, block in enumerate(blocks.missing): + + if block is not None: + + # Progress. + log_retro_rank_0( + "encode block %d / %d ... %s." + % (block_index, len(blocks.missing), block["path"],) + ) + + # Encode and save. + _, codes = self.encode_block(index, embedder, text_dataset, block) + self.save_block(config, block, codes) + + # Synchronize progress across all ranks. (for easier observation) + log_retro_rank_0(" > waiting for other ranks to finish block.") + torch.distributed.barrier() + + def add_codes(self, config: RetroPreprocessingConfig) -> None: + """Read codes from disk, and add them to the index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + if torch.distributed.get_rank() != 0: + return + + added_index_path = self.get_added_index_path(config) + if os.path.exists(added_index_path): + return + + # Index. + log_retro_rank_0("read empty index.") + index = self.get_empty_index(config) + index_ivf = faiss.extract_index_ivf(index) + + # Add codes. + log_retro_rank_0("add codes.") + code_paths = get_added_code_paths(config) + pbar = tqdm(code_paths) + for code_path in pbar: + pbar.set_description( + "add codes, mem %.3f gb, %.1f%%" + % (psutil.virtual_memory()[3] / 1024 ** 3, psutil.virtual_memory()[2],) + ) + with h5py.File(code_path) as f: + + nload = int(config.retro_index_add_load_fraction * f["data"].shape[0]) + offset = int(os.path.basename(code_path).split("-")[0]) + xids = np.arange(offset, offset + nload) + codes = np.copy(f["data"][:nload]) + index_ivf.add_sa_codes(codes, xids) + + # Update index's ntotal. + index.ntotal = index_ivf.ntotal + + # Write index. + log_retro_rank_0("write added index.") + faiss.write_index(index, added_index_path) + + def remove_codes(self, config: RetroPreprocessingConfig) -> None: + """Remove added codes after adding to index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + if torch.distributed.get_rank() != 0: + return + assert os.path.isfile(self.get_added_index_path(config)) + + if config.retro_index_delete_added_codes: + raise Exception("remove?") + shutil.rmtree(get_added_codes_dir(config), ignore_errors=True) + + def add(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> None: + """Add vectors to index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + text_dataset (GPTToTextDataset): Text dataset that will be embedded and added to the index. + """ + + # Encode chunks. + self.encode(config, text_dataset) + + # Add codes to index. + self.add_codes(config) + + # Wait for (single-process) adding to complete. + torch.distributed.barrier() + + # Remove codes. + self.remove_codes(config) diff --git a/megatron/core/datasets/retro/index/utils.py b/megatron/core/datasets/retro/index/utils.py new file mode 100644 index 0000000..321cd65 --- /dev/null +++ b/megatron/core/datasets/retro/index/utils.py @@ -0,0 +1,126 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for building an index.""" + +import glob +import os +from typing import List, Tuple + +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.utils import retro_makedir + + +def get_index_dir(config: RetroPreprocessingConfig) -> str: + """Create sub-directory for this index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to index sub-directory within Retro project. + """ + + # Directory path. + index_dir_path = os.path.join( + config.retro_project_dir, "index", config.retro_index_type, config.retro_index_str, + ) + + # Make directory. + retro_makedir(config, index_dir_path) + + return index_dir_path + + +def num_samples_to_block_ranges( + config: RetroPreprocessingConfig, num_samples: int +) -> List[Tuple[int, int]]: + """Split a range (length num_samples) into sequence of block ranges + of size block_size. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + num_samples (int): Split `num_samples` into consecutive block ranges, where each block is size `config.retro_block_size`. + + Returns: + A list of tuples where each item is the (start, end) index for a given block. + """ + block_size = config.retro_block_size + start_idxs = list(range(0, num_samples, block_size)) + end_idxs = [min(num_samples, s + block_size) for s in start_idxs] + ranges = list(zip(start_idxs, end_idxs)) + return ranges + + +def get_training_data_root_dir(config: RetroPreprocessingConfig) -> str: + """Get root directory for embeddings (blocks and merged data). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to the training data directory, which contains both training embedding blocks and the final merged training embeddings. + """ + return os.path.join(config.retro_project_dir, "index", "train_emb") + + +def get_training_data_block_dir(config: RetroPreprocessingConfig) -> str: + """Get directory for of saved embedding blocks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to the directory containing the training embedding blocks, which will be later merged into a single embedding array. + """ + return os.path.join(get_training_data_root_dir(config), "blocks") + + +def get_training_data_block_paths(config: RetroPreprocessingConfig) -> List[str]: + """Get paths to saved embedding blocks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Paths of all training embedding blocks. + """ + return sorted(glob.glob(get_training_data_block_dir(config) + "/*.hdf5")) + + +def get_training_data_merged_path(config: RetroPreprocessingConfig) -> str: + """Get path to merged training embeddings. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to the merged training embedding binary file. + """ + return os.path.join( + get_training_data_root_dir(config), + "train_%.3f.bin" % config.retro_index_train_load_fraction, + ) + + +def get_added_codes_dir(config: RetroPreprocessingConfig) -> str: + """Get directory of saved encodings. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to the directory containing the vector encodings for adding to the index. + """ + return os.path.join(get_index_dir(config), "add_codes") + + +def get_added_code_paths(config: RetroPreprocessingConfig) -> List[str]: + """Get paths to all saved encodings. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Paths of all vector encoding blocks, for adding to the index. + """ + return sorted(glob.glob(get_added_codes_dir(config) + "/*.hdf5")) diff --git a/megatron/core/datasets/retro/index/validate.py b/megatron/core/datasets/retro/index/validate.py new file mode 100644 index 0000000..6783df6 --- /dev/null +++ b/megatron/core/datasets/retro/index/validate.py @@ -0,0 +1,191 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Validate an index's data. + +This module contains functionality for checking for bitwise equality across code +changes. The training and adding steps of index construction can be validated +separately. The following high-level checks are supported: + + - Training: Validate that saved training embeddings are bitwise equal with a + sample set of freshly computed embeddings. (*Note*: + `--no-retro-index-delete-training-embeddings` must be used.) + - Adding: Validate that the saved encodings are bitwise equal with a sample of + sample set of freshly computed encodings. (*Note*: + `--no-retro-index-delete-added-codes` must be used.) +""" + +import typing + +import numpy as np +import torch +from torch.utils.data import Subset + +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.external_libs import h5py +from megatron.core.datasets.retro.utils import ( + GPTToTextDataset, + get_blocks_by_rank, + log_retro_rank_0, +) + +from .build import get_text_dataset_for_adding, get_text_dataset_for_training +from .factory import IndexFactory +from .utils import get_added_codes_dir, get_training_data_block_dir + +################################################## +# Validate trained index. +################################################## + + +def validate_training_embeddings(config: RetroPreprocessingConfig) -> None: + """Validate training embeddings. + + Steps: + - Randomly sample subset of text dataset blocks. + - Embed each block. + - Compare against saved embeddings. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Training text dataset. + text_dataset = get_text_dataset_for_training(config) + + # Sample existing blocks. + blocks = get_blocks_by_rank( + dirname=get_training_data_block_dir(config), + n_samples=len(text_dataset), + block_size=config.retro_block_size, + validate=None, + sample=config.retro_task_validate, + ) + + assert blocks.n_missing_world == 0 + + # Embed & validate blocks. + embedder = config.retro_bert_embedders.mem + for block_idx, block in enumerate(blocks.existing): + + # Missing block lists are extended with None to have equal-length + # lists. Skip the Nones. + if block is not None: + + # Progress. (*note*: move world progress to here.) + log_retro_rank_0( + "embed training block %d / %d ... %s." + % (block_idx, len(blocks.existing), block["path"],) + ) + + # Load existing block embeddings. + with h5py.File(block["path"]) as f: + existing_embeddings = np.copy(f["data"]) + + # Embed block. + sub_dataset = Subset(text_dataset, range(*block["range"])) + embeddings = embedder.embed_text_dataset(sub_dataset, "train") + + # Check equality. + log_retro_rank_0(" > validate.") + assert np.array_equal(existing_embeddings, embeddings) + + # Synchronize progress across all ranks. (for easier observation) + log_retro_rank_0(" > waiting for other ranks to finish block.") + torch.distributed.barrier() + + log_retro_rank_0(" > finished validating training embeddings.") + + +################################################## +# Validate filled index. +################################################## + + +def validate_added_encodings(config: RetroPreprocessingConfig) -> None: + """Validate added encodings. + + Steps: + - Randomly sample subset of text dataset blocks. + - Encode each block. + - Compare against saved encodings. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Index. + index = IndexFactory.get_index(config.retro_index_type) + inner_index = index.get_empty_index(config) + + # Text dataset. + text_dataset = get_text_dataset_for_adding(config) + + # Sample existing blocks. + def validate(f: h5py.File) -> None: + """Validation method for validating encoding blocks. + + Args: + f (h5py.File): File with block of encodings. + """ + assert len(f["data"].shape) == 2 + + blocks = get_blocks_by_rank( + dirname=get_added_codes_dir(config), + n_samples=len(text_dataset), + block_size=config.retro_block_size, + validate=validate, + sample=config.retro_task_validate, + ) + + assert blocks.n_missing_world == 0 + + # Encode and validate blocks. + embedder = config.retro_bert_embedders.mem + for block_idx, block in enumerate(blocks.existing): + + if block is not None: + + # Progress. + log_retro_rank_0( + "encode block %d / %d ... %s." % (block_idx, len(blocks.existing), block["path"],) + ) + + # Load existing codes. + with h5py.File(block["path"]) as f: + existing_codes = np.copy(f["data"]) + + # Encode block. + embeddings, codes = index.encode_block(inner_index, embedder, text_dataset, block) + + # Check equality. + log_retro_rank_0(" > validate.") + assert np.array_equal(existing_codes, codes) + + # Synchronize progress across all ranks. (for easier observation) + log_retro_rank_0(" > waiting for other ranks to finish block.") + torch.distributed.barrier() + + log_retro_rank_0(" > finished validating added encodings.") + + +################################################## +# Validate index (trained + filled). +################################################## + + +def validate_index(config: RetroPreprocessingConfig) -> None: + """Validate index. + + Validating index involves sequentially running stages above: + - Validate trained index. + - Validate filled index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Validate training embeddings. + validate_training_embeddings(config) + + # Validate added codes. + validate_added_encodings(config) diff --git a/megatron/core/datasets/retro/query/__init__.py b/megatron/core/datasets/retro/query/__init__.py new file mode 100644 index 0000000..ac94833 --- /dev/null +++ b/megatron/core/datasets/retro/query/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/datasets/retro/query/gpt_chunk_dataset.py b/megatron/core/datasets/retro/query/gpt_chunk_dataset.py new file mode 100644 index 0000000..34a2ee6 --- /dev/null +++ b/megatron/core/datasets/retro/query/gpt_chunk_dataset.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +A GPTChunkDataset is a wrapper around a regular GPTDataset, that sequentially +chunks the sample tokens into `retro_chunk_length` sized smaller samples. + +For example, if the GPTDataset has 100 samples and a sequence length of 2048, and +retro_chunk_length is 64, then the GPTChunkDataset will contain 100*(2048/64) = +3200 samples, each with length 64. +""" + +import torch + +from megatron.core.datasets.gpt_dataset import GPTDataset +from megatron.core.datasets.retro.utils import get_num_chunks_per_sample + +from .utils import get_neighbor_dir + + +class GPTChunkDataset(torch.utils.data.Dataset): + """Pretraining chunk dataset wraps a standard GPT dataset. + + This dataset conceptually divides each sample (e.g., length 2048) + into chunks (e.g., length 64) and restructures them into a list of + chunks (e.g., length num_samples * num_chunks_per_sample). + + Args: + sample_dataset (GPTDataset): Original GPT dataset, with `sequence_length` size samples. + sample_length (int): Alias for `sequence_length`. + chunk_length (int): Retro chunk length (e.g., 64). + """ + + def __init__(self, sample_dataset: GPTDataset, sample_length: int, chunk_length: int): + + super().__init__() + + self.sample_dataset = sample_dataset + self.chunk_length = chunk_length + self.n_chunks_per_sample = get_num_chunks_per_sample(sample_length, chunk_length) + self.n_samples = len(sample_dataset) + self.n_chunks = self.n_samples * self.n_chunks_per_sample + + def __len__(self) -> int: + """Get dataset length. + + Returns: + Dataset length. + """ + return self.n_chunks + + def __getitem__(self, idx: int) -> dict: + """Get sample, including represented document IDs. + + Args: + idx (int): Sample index. + + Returns: + A sample, which contains both the chunk-length token sample ('text') along with all document_ids ('doc_ids') contained withing the full `sequence_length` sample. + """ + + # Convert global chunk index to global sample index & local chunk index. + sample_idx = idx // self.n_chunks_per_sample + chunk_idx = idx % self.n_chunks_per_sample + + # Extract sample data. + sample = self.sample_dataset[sample_idx] + sample_token_ids = sample["text"] + sample_doc_ids = sample["document_ids"] + + # Chunk start/end token idxs. + token_start_idx = chunk_idx * self.chunk_length + token_end_idx = token_start_idx + self.chunk_length + chunk_token_ids = sample_token_ids[token_start_idx:token_end_idx] + + # Sample. + return { + "doc_ids": sample_doc_ids, + "text": chunk_token_ids, + } + + +def build_gpt_chunk_datasets_from_gpt_datasets( + project_dir: str, gpt_datasets: dict, sample_length: int, chunk_length: int, +) -> dict: + """Get train, valid, test GPT chunk datasets. + + Args: + project_dir (str): Retro project dir. + gpt_datasets (dict): Mapping of 'train', 'valid', and 'test' GPT datasets (original, unchunked datasets). + sample_length (int): Alias of `sequence_length`. + chunk_length (int): Retro chunk length (e.g., 64). + + Returns: + A ? + """ + + # GPT chunk datasets. + chunk_datasets = { + key: { + "dataset": GPTChunkDataset(sample_ds, sample_length, chunk_length), + "neighbor_dir": get_neighbor_dir(project_dir, key, sample_ds), + "num_active_chunks": num_active_samples + * get_num_chunks_per_sample(sample_length, chunk_length), + } + if sample_ds + else None + for key, (sample_ds, num_active_samples) in gpt_datasets.items() + } + + return chunk_datasets diff --git a/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py b/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py new file mode 100644 index 0000000..97a891f --- /dev/null +++ b/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""A MultiSplitGPTDataset can handle multiple intersecting split strings, as well +as returning all of the document IDs of a sample.""" + +import logging +from dataclasses import dataclass +from typing import Dict, List + +import numpy + +from megatron.core.datasets.blended_megatron_dataset_config import ( + convert_split_vector_to_split_matrix, + parse_and_normalize_split, +) +from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.utils import Split +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + + +@dataclass +class MultiSplitGPTDatasetConfig(GPTDatasetConfig): + """Configuration object for Megatron Core blended and Retro datasets. + + Args: + return_document_ids (bool): Whether to return the document ids when querying the dataset. Turn this option on during preprocessing. + split_preprocessing (str): The Retro preprocessing split string. It follows the same pattern convention as 'split'. Not to be used with 'blend_per_split'. + """ + + return_document_ids: bool = None + + split_preprocessing: str = None + + def __post_init__(self) -> None: + """Validate config attributes.""" + super().__post_init__() + assert self.split is not None, "the Retro data pipeline does not support 'blend_per_split'" + assert self.return_document_ids is not None, "this attribute must be user defined" + assert self.split_preprocessing is not None, "this attribute must be user defined" + split_vector = parse_and_normalize_split(self.split) + split_preprocessing_vector = parse_and_normalize_split(self.split_preprocessing) + if not numpy.allclose(split_vector, split_preprocessing_vector): + self.split_matrix = convert_split_vector_to_split_matrix( + split_vector, split_preprocessing_vector + ) + log_single_rank( + logger, + logging.WARNING, + f"split =/= split_preprocessing. Let split_matrix = {self.split_matrix}", + ) + + +class MultiSplitGPTDataset(GPTDataset): + """Retro's customized GPT dataset. + + Args: + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset. + dataset_path (str): The real path on disk to the dataset, for bookkeeping. + indexed_indices (numpy.ndarray): The set of the documents indices to expose. + num_samples (int): The number of samples to draw from the indexed dataset. + index_split (Split): The indexed_indices Split. + config (MultiSplitGPTDatasetConfig): The Retro-specific container for all config sourced parameters. + """ + + def __init__( + self, + indexed_dataset: IndexedDataset, + dataset_path: str, + indexed_indices: numpy.ndarray, + num_samples: int, + index_split: Split, + config: MultiSplitGPTDatasetConfig, + ) -> None: + super().__init__( + indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config + ) + + def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]: + """Get dataset sample. + + Args: + idx (int): The index into the dataset. + + Returns: + Dict[str, numpy.ndarray]: The text ids and (optionally) the document ids wrapped in a dictionary. + """ + text, document_ids = self._query_document_sample_shuffle_indices(idx) + if self.config.return_document_ids: + return {"text": text, "document_ids": document_ids} + else: + return {"text": text} + + @staticmethod + def _key_config_attributes() -> List[str]: + """Add custom attributes for building unique dataset hash. + + The preprocessing split used for preprocessing will constrain the samples available for pretraining. + + Returns: + List[str]: The key config attributes. + """ + return super(MultiSplitGPTDataset, MultiSplitGPTDataset)._key_config_attributes() + [ + "split_preprocessing" + ] diff --git a/megatron/core/datasets/retro/query/query.py b/megatron/core/datasets/retro/query/query.py new file mode 100644 index 0000000..165792f --- /dev/null +++ b/megatron/core/datasets/retro/query/query.py @@ -0,0 +1,394 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Entry point for querying an index using a GPTChunkDataset. + +Querying involves: + + - Iterate all chunks in the GPTChunkDataset. + - Query index for neighbor chunk IDs (i.e., chunks from the chunk database). + - Save neighbor chunk IDs to disk, for use in building a RetroDataset sample + during pretraining. +""" + +import os +import time +import typing + +import numpy as np +import psutil +import torch +from tqdm import tqdm + +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.db.dataset import DBDataset +from megatron.core.datasets.retro.db.utils import ( + get_merged_train_dataset as get_db_merged_train_dataset, +) +from megatron.core.datasets.retro.external_libs import faiss, h5py +from megatron.core.datasets.retro.index.factory import IndexFactory +from megatron.core.datasets.retro.index.index import Index +from megatron.core.datasets.retro.index.utils import get_index_dir +from megatron.core.datasets.retro.query.gpt_chunk_dataset import GPTChunkDataset +from megatron.core.datasets.retro.utils import ( + GPTToTextDataset, + get_blocks_by_rank, + log_retro_rank_0, + retro_makedir, +) + +from .gpt_chunk_dataset import build_gpt_chunk_datasets_from_gpt_datasets + + +def get_index(config: RetroPreprocessingConfig, ondisk: bool = False,) -> faiss.Index: + """Read index from disk. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + ondisk (bool): If `ondisk = True`, memory map the index. (For debugging purposes only; very non-performant.) + + Returns: + A Faiss index, loaded from storage. + """ + + # Load index. + index_wrapper = IndexFactory.get_index(config.retro_index_type) + index_dir = get_index_dir(config) + added_index_path = index_wrapper.get_added_index_path(config) + if ondisk: + index = faiss.read_index(added_index_path, faiss.IO_FLAG_MMAP) + else: + index = faiss.read_index(added_index_path) + + # Search parameters. + faiss.ParameterSpace().set_index_parameter(index, "efSearch", config.retro_query_ef_search) + faiss.ParameterSpace().set_index_parameter(index, "nprobe", config.retro_query_nprobe) + + return index + + +def embed_block( + config: RetroPreprocessingConfig, gpt_dataset: GPTChunkDataset, block: dict, +) -> np.ndarray: + """Embed block of chunks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + gpt_dataset (GPTChunkDataset): Chunk dataset to be embedded. + block (dict): Range information containing start/end indices of subset of chunk dataset. + + Returns: + Embeddings array, with shape (len(block["range"]), dimension(embedder)). + """ + text_block_dataset = torch.utils.data.Subset( + GPTToTextDataset(gpt_dataset, config.retro_tokenizers.gpt), range(*block["range"]), + ) + return config.retro_bert_embedders.mem.embed_text_dataset(text_block_dataset) + + +def query_embeddings( + config: RetroPreprocessingConfig, + db_dataset: DBDataset, + index: Index, + embeddings: np.ndarray, + chunk_id_range: range, + sample_map: dict, + n_chunks_per_sample: int, + verbose: bool = True, +) -> typing.Tuple[np.ndarray, np.ndarray]: + """Query neighbors of a block of embeddings. + + Querying includes: + - Query index for neighbor chunk IDs. + - Filter chunk IDs that have the same document ID as the queried embedding. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + db_dataset (DBDataset): Dataset containing chunk database entries. + index (Index): Vector index populated with chunk database indices. + embeddings (np.ndarray): Embeddings from GPT chunk dataset. + chunk_id_range (range): Chunk ID range from GPT chunk dataset. + sample_map (dict): Mapping of sample_idx to dataset_idx and document_ids. Used for document filtering. + n_chunks_per_sample (int): Number of chunks per sample (e.g., sequence_length / chunk_length). + verbose (bool): Log querying progress. + + Returns: + A tuple of original (unfiltered) neighbor IDs, and filtered (by document ID) neighbor IDs. + """ + + # Query neighbor ids. + if verbose: + log_retro_rank_0("search.") + t = time.time() + assert index.ntotal > 0, "check we don't accidentally have an empty index." + _, query_neighbor_ids = index.search(embeddings, config.retro_query_num_neighbors_query) + if verbose: + log_retro_rank_0(" time : %.3f sec." % (time.time() - t)) + + # Filter banned neighbor ids. + if verbose: + log_retro_rank_0("filter banned neighbor ids.") + filtered_neighbor_ids = np.full( + shape=(len(query_neighbor_ids), config.retro_query_num_neighbors_save), + fill_value=-1, + dtype="int64", + ) + min_chunk_id, max_chunk_id = chunk_id_range + for chunk_id in range(min_chunk_id, max_chunk_id): + + sample_id = chunk_id // n_chunks_per_sample + sample = sample_map[sample_id] + sample_dataset_idx = sample["dataset_idx"].item() + sample_doc_ids = sample["doc_ids"].tolist() + sample_doc_tuples = [(sample_dataset_idx, d) for d in sample_doc_ids] + + # Get valid neighbors (!= -1). + query_row = [i for i in query_neighbor_ids[chunk_id - min_chunk_id] if i >= 0] + + # Filter row. + filtered_row = [ + i + for i in query_row + if tuple(db_dataset.doc_tuples[i].tolist()) not in sample_doc_tuples + ] + filtered_row = filtered_row[: config.retro_query_num_neighbors_save] + filtered_row += [-1] * (config.retro_query_num_neighbors_save - len(filtered_row)) + filtered_neighbor_ids[chunk_id - min_chunk_id] = filtered_row + + return query_neighbor_ids, filtered_neighbor_ids + + +def query_embedding_block( + config: RetroPreprocessingConfig, + db_dataset: DBDataset, + index: Index, + embeddings: np.ndarray, + chunk_id_range: range, + sample_map: dict, + n_chunks_per_sample: int, +) -> typing.Tuple[np.ndarray, np.ndarray]: + """Query a block of embeddings. + + The block is broken into smaller sub-blocks, for easier tracking of progress. + Both the raw neighbor IDs and the filtered neighbor IDs (i.e., chunks with the + same document ID are removed) are collected. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + db_dataset (DBDataset): Dataset containing chunk database entries. + index (Index): Vector index populated with chunk database indices. + embeddings (np.ndarray): Embeddings from GPT chunk dataset. + chunk_id_range (range): Chunk ID range from GPT chunk dataset. + sample_map (dict): Mapping of sample_idx to dataset_idx and document_ids. Used for document filtering. + n_chunks_per_sample (int): Number of chunks per sample (e.g., sequence_length / chunk_length). + + Returns: + A tuple of original (unfiltered) neighbor IDs, and filtered (by document ID) neighbor IDs. + """ + + query_neighbor_ids = [] + filtered_neighbor_ids = [] + + # Query in sub-blocks. + partial_block_size = 1000 + for partial_start_idx in tqdm( + range(0, len(embeddings), partial_block_size), + " search", + miniters=(len(embeddings) // partial_block_size) // 10, + disable=torch.distributed.get_rank() != 0, + ): + partial_end_idx = min(len(embeddings), partial_start_idx + partial_block_size) + partial_embeddings = embeddings[partial_start_idx:partial_end_idx] + partial_chunk_id_range = ( + chunk_id_range[0] + partial_start_idx, + chunk_id_range[0] + partial_end_idx, + ) + partial_query_neighbor_ids, partial_filtered_neighbor_ids = query_embeddings( + config, + db_dataset, + index, + partial_embeddings, + partial_chunk_id_range, + sample_map, + n_chunks_per_sample, + verbose=False, + ) + query_neighbor_ids.append(partial_query_neighbor_ids) + filtered_neighbor_ids.append(partial_filtered_neighbor_ids) + + # Concatenate. + query_neighbor_ids = np.concatenate(query_neighbor_ids, axis=0) + filtered_neighbor_ids = np.concatenate(filtered_neighbor_ids, axis=0) + + return query_neighbor_ids, filtered_neighbor_ids + + +def query_block_neighbors( + config: RetroPreprocessingConfig, + db_dataset: DBDataset, + query_dataset: GPTChunkDataset, + index: Index, + block: dict, +) -> None: + """Query neighbors of a dataset block (i.e., range). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + db_dataset (DBDataset): Dataset containing chunk database entries. + query_dataset (GPTChunkDataset): GPT chunk dataset to be queried. + index (Index): Vector index populated with chunk database indices. + block (dict): Range information containing start/end indices for querying GPT chunk dataset. + """ + + n_chunks_per_sample = query_dataset.n_chunks_per_sample + + # Sample map. + sample_ids = sorted( + list(set(chunk_id // n_chunks_per_sample for chunk_id in range(*block["range"]))) + ) + sample_map = {} + for i in sample_ids: + sample = query_dataset.sample_dataset[i] + sample_map[i] = { + "dataset_idx": sample["dataset_id"], + "doc_ids": sample["document_ids"], + } + + # Embed block. + embeddings = embed_block(config, query_dataset, block) + + # Query embeddings. + _, filtered_neighbor_ids = query_embedding_block( + config, db_dataset, index, embeddings, block["range"], sample_map, n_chunks_per_sample, + ) + + if config.retro_task_validate is None: + # Save neighbors. + log_retro_rank_0("save neighbors.") + retro_makedir(config, os.path.dirname(block["path"])) + f = h5py.File(block["path"], "w") + f.create_dataset("neighbors", data=filtered_neighbor_ids) + f.close() + + else: + # Validate neighbors. + with h5py.File(block["path"]) as f: + existing_neighbor_ids = np.copy(f["neighbors"]) + assert np.array_equal(existing_neighbor_ids, filtered_neighbor_ids) + + +def query_dataset_neighbors( + config: RetroPreprocessingConfig, + db_dataset: DBDataset, + query_dataset: GPTChunkDataset, + num_active_chunks: int, + prefix: str, + neighbor_dir: str, + index: Index, +) -> None: + """Query neighbors of each chunk within a dataset. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + db_dataset (DBDataset): Dataset containing chunk database entries. + query_dataset (GPTChunkDataset): GPT chunk dataset to be queried. + num_active_chunks (int): The 'active' chunks are the subset of the GPT chunk dataset that aren't being queried. This argument is used when validating the correctness of a subset of the GPT chunk dataset. + prefix (str): Extra string for logging progress. + neighbor_dir (str): File path to directory for saving neighbor IDs. + index (Index): Vector index populated with chunk database indices. + """ + + def validate(f: h5py.File) -> None: + """Validation method for validating saved neighbor IDs. + + Args: + f (h5py.File): File containing save neighbor IDs. + """ + assert f["neighbors"].shape[1] == config.retro_query_num_neighbors_save, ( + "neighbors.shape == %s; num_neighbors_target == %d." + % (str(f["neighbors"].shape), config.retro_num_neighbors_target,) + ) + + if config.retro_task_validate is None: + retro_makedir(config, neighbor_dir) + blocks = get_blocks_by_rank( + neighbor_dir, num_active_chunks, config.retro_block_size, validate=validate, + ) + active_blocks = blocks.missing + else: + blocks = get_blocks_by_rank( + neighbor_dir, + num_active_chunks, + config.retro_block_size, + validate=validate, + sample=config.retro_task_validate, + ) + assert blocks.n_missing_world == 0 + active_blocks = blocks.existing + + # Query each block. + for block_index, block in enumerate(active_blocks): + + if block is not None: + + # Progress. + log_retro_rank_0( + "%squery '%s' block %d / %d ... %s ... mem %.3f gb, %.1f%%." + % ( + "" if config.retro_task_validate is None else "[validate] ", + prefix, + block_index, + len(active_blocks), + os.path.basename(block["path"]), + psutil.virtual_memory()[3] / 1024 ** 3, + psutil.virtual_memory()[2], + ) + ) + + # Query block neighbors. + query_block_neighbors(config, db_dataset, query_dataset, index, block) + + # Synchronize progress across all ranks. (for easier observation) + log_retro_rank_0(" > waiting for other ranks to finish block.") + torch.distributed.barrier() + + +def query_neighbors(config: RetroPreprocessingConfig) -> None: + """Query pretraining datasets (train & valid). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Num threads. + faiss.omp_set_num_threads(64) + + # Load chunk db dataset. + log_retro_rank_0("load chunk db dataset.") + db_dataset = get_db_merged_train_dataset( + project_dir=config.retro_project_dir, + chunk_length=config.retro_gpt_chunk_length, + eod_token_id=config.retro_tokenizers.gpt.eod, + ) + db_dataset.load_doc_tuples() + + # Load index. + log_retro_rank_0(" > get index.") + index = get_index(config) + + # Query each (i.e., train, valid, test) dataset. + log_retro_rank_0(" > query.") + for prefix, info in vars(config.retro_gpt_chunk_datasets).items(): + if info is None: + continue + log_retro_rank_0( + " > query '%s' dataset ... %d samples." % (prefix, info["num_active_chunks"]) + ) + query_dataset_neighbors( + config, + db_dataset, + info["dataset"], + info["num_active_chunks"], + prefix, + info["neighbor_dir"], + index, + ) diff --git a/megatron/core/datasets/retro/query/retro_dataset.py b/megatron/core/datasets/retro/query/retro_dataset.py new file mode 100644 index 0000000..07af161 --- /dev/null +++ b/megatron/core/datasets/retro/query/retro_dataset.py @@ -0,0 +1,242 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +A RetroDataset wraps both: + + - A GPTDataset (which is nested as GPTChunkDataset -> MultiSplitGPTDataset -> + GPTDataset). + - Neighbor IDs of chunks in the chunk database, that were saved during + preprocessing. + +Both the GPT sample data and the neighbor IDs are returned within a sample from +this dataset. +""" + +import os +from typing import Any, Dict, Optional, Tuple + +import numpy as np +import torch + +from megatron.core.datasets.retro.db.dataset import DBDataset +from megatron.core.datasets.retro.db.utils import get_merged_train_dataset as get_db_dataset +from megatron.core.datasets.retro.external_libs import h5py +from megatron.core.datasets.retro.utils import BlockPathMap, log_retro_rank_0 +from megatron.core.models.retro import RetroConfig + +from .gpt_chunk_dataset import GPTChunkDataset, build_gpt_chunk_datasets_from_gpt_datasets +from .utils import get_query_dir + + +class RetroDataset(torch.utils.data.Dataset): + """Dataset of retro samples. + + Each sample contains the original GPT sample, along with the token IDs + of each neighbor of each chunk within the sequence. Neighbor array has + shape (num_chunks_per_sample, num_neighbors, num_retrieved_tokens). + + ** Note: chunk dataset wraps original GPT dataset (see gpt_chunk_dataset.py). + + Args: + num_queried_samples (int): Total number of queried samples. + num_neighbors (int): Total number of saved neighbors. + num_retrieved_chunks (int): Number of retrieved chunks (e.g., 2 for neighbor + continuation). + block_size (int): Number of neighbor entries per file. + db_dataset (DBDataset): Chunk database used for retrieval. + chunk_dataset (GPTChunkDataset): GPT chunk dataset, which is a wrapper around a standard GPT dataset that breaks each sample into chunks. + neighbor_path_map (BlockPathMap): Mapping of neighbor ID to file path. + """ + + def __init__( + self, + num_queried_samples: int, + num_neighbors: int, + num_retrieved_chunks: int, + block_size: int, + db_dataset: DBDataset, + chunk_dataset: GPTChunkDataset, + neighbor_path_map: BlockPathMap, + ): + super().__init__() + + self.num_queried_samples = num_queried_samples + self.num_neighbors = num_neighbors + self.num_retrieved_chunks = num_retrieved_chunks + self.block_size = block_size + self.db_dataset = db_dataset + self.chunk_dataset = chunk_dataset + self.neighbor_path_map = neighbor_path_map + + def __len__(self) -> int: + """Dataset length. + + Returns: + Number of samples in dataset. + """ + return len(self.chunk_dataset.sample_dataset) + + def __getitem__(self, sample_idx: int) -> dict: + """Get dataset sample. + + Args: + sample_idx (int): Index of sample in dataset. + + Returns: + A dict consisting of GPT sample (attribute 'text') and corresponding neighbor chunk IDs ('neighbor_chunks', for indexing chunk database) and neighbor token IDs (corresponding chunk database GPT tokens). + """ + n_chunks_per_sample = self.chunk_dataset.n_chunks_per_sample + + # Wrap sample idx around number of queried samples. + sample_idx = sample_idx % self.num_queried_samples + + # Get standard sample. + sample = self.chunk_dataset.sample_dataset[sample_idx] + + # Sample idx to chunk idxs. + chunk_idxs = list( + range(sample_idx * n_chunks_per_sample, (sample_idx + 1) * n_chunks_per_sample,) + ) + + # Collect retrieved tokens. + all_retrieved_chunk_ids = [] + all_retrieved_token_ids = [] + for chunk_idx in chunk_idxs: + + # Neighbor chunk ids. + neighbor_path = self.neighbor_path_map[chunk_idx] + with h5py.File(neighbor_path, "r") as f: + neighbor_chunk_ids = f["neighbors"][ + chunk_idx % self.block_size, : self.num_neighbors + ].tolist() + + # Retrieved (neighbor + continuation) token ids. + retrieved_chunk_ids = [] + retrieved_token_ids = [] + for neighbor_chunk_id in neighbor_chunk_ids: + current_chunk_ids = [ + i % len(self.db_dataset) + for i in range(neighbor_chunk_id, neighbor_chunk_id + self.num_retrieved_chunks) + ] + current_token_ids = [self.db_dataset[ci]["text"] for ci in current_chunk_ids] + retrieved_chunk_ids.append(current_chunk_ids) + retrieved_token_ids.append(current_token_ids) + + # Collect retrieved tokens. + all_retrieved_chunk_ids.append(retrieved_chunk_ids) + all_retrieved_token_ids.append(retrieved_token_ids) + + # Reshape retrieved tokens. + all_retrieved_chunk_ids = np.array(all_retrieved_chunk_ids).reshape( + (n_chunks_per_sample, self.num_neighbors, -1) + ) + all_retrieved_token_ids = np.array(all_retrieved_token_ids).reshape( + (n_chunks_per_sample, self.num_neighbors, -1) + ) + + # Sample. + sample: Dict[str, np.ndarray] = { + **sample, + "neighbor_chunks": all_retrieved_chunk_ids, + "neighbor_tokens": all_retrieved_token_ids, + } + + return sample + + +def get_retro_datasets( + config: RetroConfig, gpt_datasets: dict, sample_length: int, eod_token_id: int, +) -> Tuple[Optional[RetroDataset], Optional[RetroDataset], Optional[RetroDataset]]: + """Get train, valid, test retro datasets. + + Args: + config (RetroConfig): Retro preprocessing config. + gpt_datasets (dict): Mapping of data split key ('train', 'valid', or 'test') to the original sequence-length GPT dataset (i.e., not the chunk dataset). + sample_length (int): Alias to `sequence_length`. + eod_token_id (int): GPT EOD token ID. + + Returns: + A tuple of 'train', 'valid', and 'test' `RetroDataset`s. + """ + + # DB dataset. + db_dataset = get_db_dataset( + project_dir=config.retro_project_dir, + chunk_length=config.retro_chunk_length, + eod_token_id=eod_token_id, + ) + + # GPT chunk datasets. + chunk_ds_info_map = build_gpt_chunk_datasets_from_gpt_datasets( + project_dir=config.retro_project_dir, + gpt_datasets=gpt_datasets, + sample_length=sample_length, + chunk_length=config.retro_chunk_length, + ) + + # Retro datasets. + retro_dataset_map: Dict[str, Optional[RetroDataset]] = {} + query_dir = get_query_dir(config.retro_project_dir) + for data_key, chunk_ds_info in chunk_ds_info_map.items(): + + # Skip unused datasets. + if chunk_ds_info is None: + retro_dataset_map[data_key] = None + continue + + # For consistency with preprocessing, the neighbor_dir is overwritten + # (from its setting in `build_gpt_chunk_datasets_from_gpt_datasets()` + # above). This is one piece -- along with setting data_path and + # train_samples from config.json -- of ensuring consistency between + # preprocessing and pretraining. + chunk_dataset = chunk_ds_info["dataset"] + chunk_ds_info["neighbor_dir"] = os.path.join( + query_dir, config.retro_neighbor_dirs[data_key], + ) + neighbor_dir = chunk_ds_info["neighbor_dir"] + neighbor_path_map = BlockPathMap.from_dir( + dir=neighbor_dir, block_size=config.retro_block_size + ) + + # Verify num chunks. + n_active_chunks = chunk_ds_info["num_active_chunks"] + n_neighbor_chunks = neighbor_path_map.max_idx + + if not os.path.isdir(neighbor_dir): + if torch.distributed.get_rank() == 0: + raise Exception( + "neighbor directory '%s' not found; please " + "compare --train-samples, --seq-length, --seed, " + "--eval-iters, and --eval-interval, with " + "retro preprocessing args." % neighbor_dir + ) + torch.distributed.barrier() + exit() + + if config.retro_verify_neighbor_count and n_active_chunks != n_neighbor_chunks: + if torch.distributed.get_rank() == 0: + log_retro_rank_0("neighbor_dir : %s" % neighbor_dir) + log_retro_rank_0("neighbor_path_map : %s" % neighbor_path_map) + raise Exception( + "num sampled chunks (%d) != num neighbor chunks " + "(%d); did you complete querying the entire " + "pretraining dataset?" % (n_active_chunks, n_neighbor_chunks) + ) + torch.distributed.barrier() + exit() + + # Retro dataset. + retro_dataset_map[data_key] = RetroDataset( + num_queried_samples=gpt_datasets[data_key][1], + num_neighbors=config.retro_num_neighbors, + num_retrieved_chunks=config.retro_num_retrieved_chunks, + block_size=config.retro_block_size, + db_dataset=db_dataset, + chunk_dataset=chunk_dataset, + neighbor_path_map=neighbor_path_map, + ) + + return ( + retro_dataset_map["train"], + retro_dataset_map["valid"], + retro_dataset_map["test"], + ) diff --git a/megatron/core/datasets/retro/query/utils.py b/megatron/core/datasets/retro/query/utils.py new file mode 100644 index 0000000..f07920d --- /dev/null +++ b/megatron/core/datasets/retro/query/utils.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for querying the pretraining dataset.""" + +import os + +from megatron.core.datasets.megatron_dataset import MegatronDataset + + +def get_query_dir(project_dir: str) -> str: + """Get root directory of all saved query data. + + Args: + project_dir (str): Retro project dir. + + Returns: + Path to query sub-directory in Retro project. + """ + return os.path.join(project_dir, "query") + + +def get_neighbor_dir(project_dir: str, key: str, dataset: MegatronDataset) -> str: + """Get directory containing neighbor IDs for a dataset (i.e., train, valid, or test). + + Args: + project_dir (str): Retro project dir. + key (str): Dataset split key; 'train', 'valid', or 'test'. + dataset (MegatronDataset): Dataset containing unique hash for finding corresponding neighbors. + + Returns: + Path to directory containing this dataset's neighbors within Retro project. + """ + return os.path.join( + get_query_dir(project_dir), os.path.basename(f"{key}_{dataset.unique_description_hash}"), + ) diff --git a/megatron/core/datasets/retro/utils.py b/megatron/core/datasets/retro/utils.py new file mode 100644 index 0000000..2d87630 --- /dev/null +++ b/megatron/core/datasets/retro/utils.py @@ -0,0 +1,349 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for Retro preprocessing.""" + +import glob +import logging +import os +from collections import defaultdict +from types import SimpleNamespace +from typing import Any, Callable, Dict, List, Optional + +import numpy as np +import torch +from tqdm import tqdm + +from megatron.core import parallel_state +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.query.multi_split_gpt_dataset import ( + MultiSplitGPTDataset, + MultiSplitGPTDatasetConfig, +) +from megatron.core.utils import log_single_rank + +from .external_libs import h5py + +logger = logging.getLogger(__name__) + + +def log_retro_rank_0(message: str) -> None: + """Log on rank 0. + + Args: + message (str): Message to log. + """ + log_single_rank(logger, logging.INFO, "[RETRO] " + message) + + +def retro_makedir(config: RetroPreprocessingConfig, path: str) -> None: + """Make a directory, conditional on not being in validation mode. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + path (str): Path to directory. + """ + if config.retro_task_validate is None: + os.makedirs(path, exist_ok=True) + + +def extract_data_config(config: RetroPreprocessingConfig) -> MultiSplitGPTDatasetConfig: + """Extract data config from dataset. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + The config object used to build the dataset. + """ + return config.retro_gpt_chunk_datasets.train["dataset"].sample_dataset.config + + +def get_num_chunks_per_sample(sample_length: int, chunk_length: int) -> int: + """Compute seq_length // chunk_length. + + Args: + sample_length (int): Alias of `sequence_length`. + chunk_length (int): Retro chunk length (e.g., 64). + + Returns: + Number of chunks per sample (i.e., `sequence_length` / `chunk_length`). + """ + assert sample_length % chunk_length == 0 + return sample_length // chunk_length + + +class GPTToTextDataset(torch.utils.data.Dataset): + """Dataset to convert GPT tokens to text. + + Args: + gpt_dataset (MultiSplitGPTDataset): GPT dataset, which outputs GPT token samples. + gpt_tokenizer (Any): GPT tokenizer. + """ + + def __init__(self, gpt_dataset: MultiSplitGPTDataset, gpt_tokenizer: Any): + + super().__init__() + + self.gpt_dataset = gpt_dataset + self.gpt_tokenizer = gpt_tokenizer + + def __len__(self) -> int: + """Dataset length. + + Returns: + Number of samples in the dataset. + """ + return len(self.gpt_dataset) + + def __getitem__(self, idx: int) -> dict: + """Get dataset sample. + + Args: + idx (int): Index of sample. + + Returns: + A dict containing attribute 'text' of type string. + """ + gpt_token_ids = self.gpt_dataset[idx]["text"].tolist() + text = self.gpt_tokenizer.detokenize(gpt_token_ids) + return {"text": text} + + +def get_blocks( + dirname: str, n_samples: int, block_size: int, validate: Callable = None, +) -> SimpleNamespace: + """Divide range [0, num_samples) to sequence of block ranges. + + This is a core method within the concept of block processing. The idea + is to divide a range (size n_samples) into a sequence of blocks. Each + block corresponds to a file within 'dirname' with name + '{start_idx}-{end_idx}.hdf5'. This method checks for the existence of + these files, and returns two lists, one for existing blocks and one for + missing blocks. + + Args: + dirname (str): Path to directory containing block files. + n_samples (int): Ideal number of samples. The total number of saved block data is <=n_samples. + block_size (int): Max number of samples per block file (e.g., 100000). + validate (Callable): Method for validating each block file during load. + + Returns: + A namespace consisting of 2 lists: existing blocks, and missing blocks. The total number of samples between the existing and missing blocks should equal n_samples above. + """ + + assert os.path.isdir(dirname), "missing directory '%s.'" % dirname + + # Block ranges. + block_start_idxs = list(range(0, n_samples, block_size)) + block_end_idxs = [min(n_samples, i + block_size) for i in block_start_idxs] + block_ranges = list(zip(block_start_idxs, block_end_idxs)) + + # All block files (existing + missing). + n_digits = int(np.ceil(np.log(n_samples) / np.log(10)) + 1) + all_blocks = [ + { + "range": r, + "path": os.path.join( + dirname, "%s-%s.hdf5" % tuple([str(i).zfill(n_digits) for i in r]), + ), + } + for r in block_ranges + ] + all_block_path_set = set(block["path"] for block in all_blocks) + + # Validate function. + validate = (lambda f: None) if validate is None else validate + + # Delete corrupt files. + if torch.distributed.get_rank() == 0: + existing_block_paths = [ + block["path"] for block in all_blocks if os.path.exists(block["path"]) + ] + for index, path in enumerate(tqdm(existing_block_paths, "validating block.")): + + assert path in all_block_path_set, "unexpected filename, '%s'." % path + + try: + f = h5py.File(path, "r") + except: + os.remove(path) + continue + + try: + validate(f) + except: + os.remove(path) + finally: + f.close() + + # Wait for files to be deleted. + torch.distributed.barrier() + + # Collect blocks. + blocks = SimpleNamespace( + existing=[b for b in all_blocks if os.path.exists(b["path"])], + missing=[b for b in all_blocks if not os.path.exists(b["path"])], + ) + + return blocks + + +def get_blocks_by_rank( + dirname: str, + n_samples: int, + block_size: int, + validate: Callable = None, + sample: Optional[float] = None, +) -> SimpleNamespace: + """Divide existing and missing blocks evenly across all ranks. + + See 'get_blocks()' above for description. The returned lists of existing and + missing blocks are split evenly across ranks via interleaving. This way, + each rank has a roughly equal number of blocks to process for a + downstream operation. + + Args: + dirname (str): Path to directory containing block files. + n_samples (int): Ideal number of samples. The total number of saved block data is <=n_samples. + block_size (int): Max number of samples per block file (e.g., 100000). + validate (Callable): Method for validating each block file during load. + sample (Optional[float]): If provided, sample a random subset of the blocks. Used for validating preprocessing correctness. + + Returns: + A namespace consisting of 2 lists: existing blocks, and missing blocks. Each of these two lists is potentially a sub-sample of the total set of existing and missing blocks, depending on whether sampling is used. Additionally, the attributes n_existing_world and n_missing_world are the total number of existing and missing blocks, independent of samples. Therefore, (n_existing_world + n_missing_world) * block_size == n_samples. + """ + + # Get world blocks. + blocks = get_blocks(dirname, n_samples, block_size, validate) + + # This rank's existing and missing files. + data_parallel_rank = parallel_state.get_data_parallel_rank() + data_parallel_world_size = parallel_state.get_data_parallel_world_size() + rank_existing_blocks = blocks.existing[ + data_parallel_rank : len(blocks.existing) : data_parallel_world_size + ] + rank_missing_blocks = blocks.missing[ + data_parallel_rank : len(blocks.missing) : data_parallel_world_size + ] + + # Extend rank's existing and missing blocks (with None) such that all ranks + # have equal length lists. This allows for easier tracking of global progress. + def get_world_max(n: int) -> int: + """Get max value across ranks. + + Args: + n (int): Value on this rank. + + Returns: + Max value across all ranks. + """ + n_tensor = torch.cuda.LongTensor([n]) + torch.distributed.all_reduce(n_tensor, op=torch.distributed.ReduceOp.MAX) + return n_tensor.item() + + max_n_existing = get_world_max(len(rank_existing_blocks)) + max_n_missing = get_world_max(len(rank_missing_blocks)) + + rank_existing_blocks += [None] * (max_n_existing - len(rank_existing_blocks)) + rank_missing_blocks += [None] * (max_n_missing - len(rank_missing_blocks)) + + # Collect blocks. + blocks = SimpleNamespace( + n_existing_world=len(blocks.existing), + n_missing_world=len(blocks.missing), + existing=rank_existing_blocks, + missing=rank_missing_blocks, + ) + + if sample is not None: + # Sample existing and missing blocks evenly across all ranks. The + # returned lists of blocks are randomly sampled (without replacement) + # to yield `sample * len(blocks)` number of blocks. + + # Randomly sample blocks. + def sample_blocks(_blocks: List[Optional[Dict]]) -> List[Optional[Dict]]: + """Sample a random subset of all blocks. + + Args: + _blocks (List[Optional[Dict]]): List of all blocks. + + Returns: + A random subset of the blocks. + """ + n_blocks_sample = int(np.ceil(sample * len(_blocks))) + sampled_blocks: List[Optional[Dict]] = [b for b in _blocks if b is not None] + + np.random.seed(None) + np.random.shuffle(sampled_blocks) + + sampled_blocks = sampled_blocks[:n_blocks_sample] + sampled_blocks += [None] * (n_blocks_sample - len(sampled_blocks)) + + return sampled_blocks + + blocks.existing = sample_blocks(blocks.existing) + blocks.missing = sample_blocks(blocks.missing) + + return blocks + + +class BlockPathMap: + """Map an index to its containing block path. + + The common use for this class is to have a directory of files containing + blocks of processed data, of uniform block size (e.g., 100k samples per + file). Each file must follow a naming convention of 'startIdx-endIdx.[ext]', + where 'endIdx' minus 'startIdx' must equal the block size, with the possible + exception of the final block. Given an input index, this class maps the + index to the containing block file. + + Args: + block_paths (List[str]): List of paths to saved block files. + block_size (int): Max number of samples per block file (e.g., 100000). + """ + + @classmethod + def from_dir(cls, dir: str, block_size: int, ext: str = "hdf5") -> Any: + """Get list of block files, and create map. + + Args: + dir (str): Path to directory containing saved block files. + block_size (int): Max number of samples per block file (e.g., 100000). + ext (str): Block file extension (e.g., 'hdf5'). + + Returns: + A mapping of sample index to block file path. + """ + assert os.path.isdir(dir), f"directory not found, '{dir}'." + return cls(sorted(glob.glob(dir + f"/*.{ext}")), block_size) + + def __init__(self, block_paths: List[str], block_size: int): + self.max_idx = 0 + self.block_path_map = {} + for block_path in block_paths: + name = os.path.splitext(os.path.basename(block_path))[0] + start_idx, end_idx = [int(i) for i in name.split("-")] + self.block_path_map[start_idx] = block_path + self.max_idx = max(self.max_idx, end_idx) + self.block_size = block_size + + def __str__(self) -> str: + """Stringify the mapping. + + Returns: + A string representation of this block path map. + """ + return "%d paths" % len(self.block_path_map) + + def __getitem__(self, idx: int) -> str: + """Get block path from index. + + Args: + idx (int): Index of sample. + + Returns: + The path to the block file containing the sample index. + """ + block_start_idx = self.block_size * (idx // self.block_size) + block_path = self.block_path_map[block_start_idx] + return block_path diff --git a/megatron/core/datasets/t5_dataset.py b/megatron/core/datasets/t5_dataset.py new file mode 100644 index 0000000..33792c8 --- /dev/null +++ b/megatron/core/datasets/t5_dataset.py @@ -0,0 +1,231 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from collections import deque +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Union + +import numpy + +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.masked_dataset import ( + MaskedWordPieceDataset, + MaskedWordPieceDatasetConfig, +) +from megatron.core.datasets.utils import Split + + +@dataclass +class T5MaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig): + """Configuration object for Megatron Core T5 WordPiece datasets + + NB: As a temporary holdover from Megatron-LM. The T5 tokenizer has an attribute which defines + a number of special sentinel tokens used during sampling. The assert in __post_init__ serves to + preserve compatibility with Megatron-LM until the T5 tokenizer is in Megatron Core. + """ + + sequence_length_encoder: Optional[int] = field(init=False, default=None) + """A sequence_length alias and the sequence length for the encoder""" + + sequence_length_decoder: int = None + """The sequence length for the decoder""" + + def __post_init__(self) -> None: + """Do asserts and set fields post init + """ + super().__post_init__() + + self.sequence_length_encoder = self.sequence_length + + assert self.sequence_length_encoder is not None + assert self.sequence_length_decoder is not None + + assert len(self.tokenizer.additional_special_tokens_ids) > 0 + + +class T5MaskedWordPieceDataset(MaskedWordPieceDataset): + """The T5 dataset that assumes WordPiece tokenization + + Args: + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset + + dataset_path (str): The real path on disk to the dataset, for bookkeeping + + indexed_indices (numpy.ndarray): The set of the documents indices to expose + + num_samples (Optional[int]): The number of samples to draw from the indexed dataset. When None, build as many samples as correspond to one epoch. + + index_split (Split): The indexed_indices Split + + config (T5MaskedWordPieceDatasetConfig): The config + """ + + def __init__( + self, + indexed_dataset: IndexedDataset, + dataset_path: str, + indexed_indices: numpy.ndarray, + num_samples: Optional[int], + index_split: Split, + config: T5MaskedWordPieceDatasetConfig, + ) -> None: + super().__init__( + indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config + ) + + self.token_lookup = list(self.config.tokenizer.inv_vocab.keys()) + # Account for the single and single token ids + self.sample_index = self._build_sample_index(self.config.sequence_length - 2, 1) + + @staticmethod + def _key_config_attributes() -> List[str]: + """Inherited method implementation + + Returns: + List[str]: The key config attributes + """ + return super( + T5MaskedWordPieceDataset, T5MaskedWordPieceDataset + )._key_config_attributes() + ["sequence_length_decoder",] + + def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: + """Abstract method implementation + + Args: + idx (int): The index into the dataset + + Returns: + Dict[str, Union[int, numpy.ndarray]]: The + """ + idx_beg, idx_end, target_sequence_length = self.sample_index[idx] + sample = [self.dataset[i] for i in range(idx_beg, idx_end)] + + numpy_random_state = numpy.random.RandomState( + seed=(self.config.random_seed + idx) % 2 ** 32 + ) + + assert target_sequence_length <= self.config.sequence_length + + # Flatten the sample into a list of tokens + tokens = [token for sentence in sample for token in sentence] + + # Truncate the list of tokens to a desired length + truncated = len(tokens) > target_sequence_length + tokens = tokens[:target_sequence_length] + + # Masking + (tokens, _, _, _, masked_spans,) = self._create_masked_lm_predictions( + tokens, target_sequence_length, numpy_random_state + ) + + # Prepare the encoder input and decoder input and output + sentinels = deque(self.config.tokenizer.additional_special_tokens_ids) + encoder_input = [] + decoder_input = [self.config.tokenizer.bos] + decoder_output = [] + idx_beg = 0 + for indices, labels in masked_spans: + sentinel = sentinels.popleft() + + # set the end index + idx_end = indices[0] + + encoder_input.extend(tokens[idx_beg:idx_end]) + encoder_input.append(sentinel) + + decoder_input.append(sentinel) + decoder_input.extend(labels) + + decoder_output.append(sentinel) + decoder_output.extend(labels) + + # set the start index + idx_beg = indices[-1] + 1 + + encoder_input.extend(tokens[idx_beg:]) + decoder_output.append(self.config.tokenizer.eos) + + # Pad the sequences and convert to NumPy + length_toks_encoder = len(encoder_input) + length_toks_decoder = len(decoder_input) + length_pads_encoder = self.config.sequence_length_encoder - length_toks_encoder + length_pads_decoder = self.config.sequence_length_decoder - length_toks_decoder + assert length_pads_encoder >= 0 + assert length_pads_decoder >= 0 + + encoder_input = numpy.array(encoder_input, dtype=numpy.int64) + encoder_input = numpy.pad( + encoder_input, (0, length_pads_encoder), constant_values=self.config.tokenizer.pad + ) + + decoder_input = numpy.array(decoder_input, dtype=numpy.int64) + decoder_input = numpy.pad( + decoder_input, (0, length_pads_decoder), constant_values=self.config.tokenizer.pad + ) + + # Create attention and history masks + mask_encoder = self._make_attention_mask(encoder_input, encoder_input) + mask_encoder_decoder = self._make_attention_mask(decoder_input, encoder_input) + mask_decoder = self._make_attention_mask(decoder_input, decoder_input) + mask_decoder = mask_decoder * self._make_history_mask(decoder_input) + + # Mask the labels + decoder_output = numpy.array(decoder_output, dtype=numpy.int64) + decoder_output = numpy.pad(decoder_output, (0, length_pads_decoder), constant_values=-1) + + # Get the loss mask + loss_mask = numpy.zeros(self.config.sequence_length_decoder, dtype=numpy.int64) + loss_mask[:length_toks_decoder] = 1 + + return { + "text_enc": encoder_input, + "text_dec": decoder_input, + "labels": decoder_output, + "loss_mask": loss_mask, + "truncated": int(truncated), + "enc_mask": mask_encoder, + "dec_mask": mask_decoder, + "enc_dec_mask": mask_encoder_decoder, + } + + @staticmethod + def _make_attention_mask( + source_block: numpy.ndarray, target_block: numpy.ndarray + ) -> numpy.ndarray: + """Return a 2-D attention mask + + Args: + source_block (numpy.ndarray): A 1-D array + target_block (numpy.ndarray): A 1-D array + + Returns: + numpy.ndarray: The 2-D attention mask + """ + mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1) + return mask.astype(numpy.int64) + + @staticmethod + def _make_history_mask(block: numpy.ndarray) -> numpy.ndarray: + """Return a 2-D history (lower-left-triangular) mask + + Args: + block (numpy.ndarray): A 1-D array + + Returns: + numpy.ndarray: The 2-D history (lower-left-triangular) mask + """ + arange = numpy.arange(block.shape[0]) + mask = arange[None,] <= arange[:, None] + return mask.astype(numpy.int64) + + def _get_token_mask(self, numpy_random_state: numpy.random.RandomState) -> int: + """Abstract method implementation + + 100% of the time, replace the token id with mask token id. + + Args: + numpy_random_state (RandomState): The NumPy random state + + Returns: + int: The mask token id + """ + return self.config.tokenizer.mask diff --git a/megatron/core/datasets/utils.py b/megatron/core/datasets/utils.py new file mode 100644 index 0000000..45203c2 --- /dev/null +++ b/megatron/core/datasets/utils.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import logging +from enum import Enum +from typing import List, Optional, Tuple + +import numpy +import torch + +from ..utils import log_single_rank + +logger = logging.getLogger(__name__) + + +class Split(Enum): + train = 0 + valid = 1 + test = 2 + + +def compile_helpers(): + """Compile C++ helper functions at runtime. Make sure this is invoked on a single process. + """ + import os + import subprocess + + command = ["make", "-C", os.path.abspath(os.path.dirname(__file__))] + if subprocess.run(command).returncode != 0: + import sys + + log_single_rank(logger, logging.ERROR, "Failed to compile the C++ dataset helper functions") + sys.exit(1) + + +def normalize(weights: List[float]) -> List[float]: + """Do non-exponentiated normalization + + Args: + weights (List[float]): The weights + + Returns: + List[float]: The normalized weights + """ + w = numpy.array(weights, dtype=numpy.float64) + w_sum = numpy.sum(w) + w = (w / w_sum).tolist() + return w + + +def get_blend_from_list( + blend: Optional[List[str]], +) -> Optional[Tuple[List[str], Optional[List[float]]]]: + """Get the megatron.core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig blend from the blend list + + Args: + blend (Optional[List[str]]): The blend list, which can be either (1) a list of prefixes, e.g. ["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], or (2) a flattened, zipped list of weights and prefixes, e.g. ["30", "path/to/dataset_1_prefix", "70", "path/to/dataset_2_prefix"] + + Returns: + Optional[Tuple[List[str], Optional[List[float]]]]: The blend, consisting of a list of dataset prefixes and optionally a list of dataset weights, e.g. [["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], [30.0, 70.0]]. + """ + if blend is None: + return None + + if len(blend) % 2 == 1: + weight_per_dataset = None + raw_prefix_per_dataset = blend + else: + raw_weight_per_dataset, raw_prefix_per_dataset = zip( + *[(blend[i], blend[i + 1]) for i in range(0, len(blend), 2)] + ) + + weight_per_dataset = [] + for rwpd in raw_weight_per_dataset: + try: + weight = float(rwpd) + except ValueError: + weight = None + weight_per_dataset.append(weight) + + is_none = map(lambda _: _ is None, weight_per_dataset) + if any(is_none): + assert all(is_none) + weight_per_dataset = None + raw_prefix_per_dataset = blend + + prefix_per_dataset = [rppd.strip() for rppd in raw_prefix_per_dataset] + + return prefix_per_dataset, weight_per_dataset diff --git a/megatron/core/datasets/utils_s3.py b/megatron/core/datasets/utils_s3.py new file mode 100644 index 0000000..f0a1f03 --- /dev/null +++ b/megatron/core/datasets/utils_s3.py @@ -0,0 +1,163 @@ +import os +from typing import Any, Dict, NamedTuple, Protocol, Tuple + +import torch + +try: + import boto3 + import botocore.exceptions as exceptions +except ModuleNotFoundError: + pass + +S3_PREFIX = "s3://" + + +class S3Config(NamedTuple): + """Config when the data (.bin) file and the index (.idx) file are in S3 + + TODO: These parameters are few and can be consolidated with parameters specific to bin reader + classes - @jkamalu + + Attributes: + + path_to_idx_cache (str): The local directory where we will store the index (.idx) file + + bin_chunk_nbytes (int): If the number of bytes is too small, then we send a request to S3 at each call of the `read` method in _S3BinReader, which is slow, because each request has a fixed cost independent of the size of the byte range requested. If the number of bytes is too large, then we only rarely have to send requests to S3, but it takes a lot of time to complete the request when we do, which can block training. We've found that 256 * 1024 * 1024 (i.e., 256 MiB) has worked well (though we have not put that much effort into tuning it), so we default to it. + """ + + path_to_idx_cache: str + + bin_chunk_nbytes: int = 256 * 1024 * 1024 + + +class S3Client(Protocol): + """The protocol which all s3 clients should abide by""" + + def download_file(self, Bucket: str, Key: str, Filename: str) -> None: ... + + def upload_file(self, Filename: str, Bucket: str, Key: str) -> None: ... + + def head_object(self, Bucket: str, Key: str) -> Dict[str, Any]: ... + + def get_object(self, Bucket: str, Key: str, Range: str) -> Dict[str, Any]: ... + + def close(self) -> None: ... + + +def is_s3_path(path: str) -> bool: + """Ascertain whether a path is in S3 + + Args: + path (str): The path + + Returns: + bool: True if the path is in S3, False otherwise + """ + return path.startswith(S3_PREFIX) + + +def parse_s3_path(path: str) -> Tuple[str, str]: + """Parses the given S3 path returning correspsonding bucket and key. + + Args: + path (str): The S3 path + + Returns: + Tuple[str, str]: A (bucket, key) tuple + """ + assert is_s3_path(path) + parts = path.replace(S3_PREFIX, "").split("/") + bucket = parts[0] + if len(parts) > 1: + key = "/".join(parts[1:]) + assert S3_PREFIX + bucket + "/" + key == path + else: + key = "" + return bucket, key + + +def object_exists(client: S3Client, path: str) -> bool: + """Ascertain whether the object at the given S3 path exists in S3 + + Args: + client (S3Client): The S3 client + + path (str): The S3 path + + Raises: + botocore.exceptions.ClientError: The error code is 404 + + Returns: + bool: True if the object exists in S3, False otherwise + """ + parsed_s3_path = parse_s3_path(path) + try: + response = client.head_object(bucket=parsed_s3_path[0], key=parsed_s3_path[1]) + except exceptions.ClientError as e: + if e.response["Error"]["Code"] != "404": + raise e + return True + + +def _download_file(client: S3Client, s3_path: str, local_path: str) -> None: + """Download the object at the given S3 path to the given local file system path + + Args: + client (S3Client): The S3 client + + s3_path (str): The S3 source path + + local_path (str): The local destination path + """ + dirname = os.path.dirname(local_path) + os.makedirs(dirname, exist_ok=True) + parsed_s3_path = parse_s3_path(s3_path) + client.download_file(parsed_s3_path[0], parsed_s3_path[1], local_path) + + +def maybe_download_file(s3_path: str, local_path: str) -> None: + """Download the object at the given S3 path to the given local file system path + + In a distributed setting, downloading the S3 object proceeds in stages in order + to try to have the minimum number of processes download the object in order for + all the ranks to have access to the downloaded object. + + Args: + s3_path (str): The S3 source path + + local_path (str): The local destination path + """ + + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + local_rank = rank % torch.cuda.device_count() + else: + rank = 0 + local_rank = 0 + + s3_client = boto3.client("s3") + + if (not os.path.exists(local_path)) and (rank == 0): + _download_file(s3_client, s3_path, local_path) + + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + # If the `local_path` is in a file system that is not + # shared across all the ranks, then we assume it's in the + # host file system and each host needs to download the file. + if (not os.path.exists(local_path)) and (local_rank == 0): + _download_file(s3_client, s3_path, local_path) + + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + # If the `local_path` still does not exist, then we assume + # each rank is saving to a separate location. + if not os.path.exists(local_path): + _download_file(s3_client, s3_path, local_path) + + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + assert os.path.exists(local_path) diff --git a/megatron/core/dist_checkpointing/__init__.py b/megatron/core/dist_checkpointing/__init__.py new file mode 100644 index 0000000..a065b5f --- /dev/null +++ b/megatron/core/dist_checkpointing/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +from .core import check_is_distributed_checkpoint +from .mapping import LocalNonpersistentObject, LocalNonpersitentObject, ShardedTensor +from .serialization import ( + load, + load_common_state_dict, + load_plain_tensors, + load_tensors_metadata, + save, +) diff --git a/megatron/core/dist_checkpointing/core.py b/megatron/core/dist_checkpointing/core.py new file mode 100644 index 0000000..50384e6 --- /dev/null +++ b/megatron/core/dist_checkpointing/core.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Module for managing distributed checkpoints metadata. """ + +import json +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Optional + +CONFIG_FNAME = 'metadata.json' + + +class CheckpointingException(Exception): + """ Base checkpointing related exception """ + + pass + + +@dataclass +class CheckpointingConfig: + """ Documents backends used in the checkpoint. + + Checkpoint config keeps track of formats used for storing the sharded tensors + (sharded_backend) and other objects (common_backend). + + Note that versioning is not for the checkpoint content (which is application specific), + but for the checkpoint format itself. + """ + + sharded_backend: str + sharded_backend_version: int = 1 + common_backend: str = 'torch' + common_backend_version: int = 1 + + +def check_is_distributed_checkpoint(checkpoint_dir): + """ Checks if `metadata.json` exists in the checkpoint and is a valid config. + + Args: + checkpoint_dir: checkpoint directory + + Returns: + bool: True if `metadata.json` exists in the checkpoint and is a valid config. + """ + return maybe_load_config(checkpoint_dir) is not None + + +def maybe_load_config(checkpoint_dir: str) -> Optional[CheckpointingConfig]: + """ Returns checkpoint config if `checkpoint_dir` is a distributed checkpoint and None otherwise + + Args: + checkpoint_dir: checkpoint directory + + Returns: + CheckpointingConfig (optional): None if checkpoint is not a valid distributed checkpoint + """ + config_path = Path(checkpoint_dir, CONFIG_FNAME) + if not config_path.exists(): + return None + with config_path.open() as f: + config_dict = json.load(f) + return CheckpointingConfig(**config_dict) + + +def save_config(config: CheckpointingConfig, checkpoint_dir: str): + """ Save given config to checkpoint directory. + + Args: + config: checkpoint config + checkpoint_dir: checkpoint directory + + Returns: + None + """ + config_path = Path(checkpoint_dir, CONFIG_FNAME) + with config_path.open('w') as f: + json.dump(asdict(config), f) diff --git a/megatron/core/dist_checkpointing/dict_utils.py b/megatron/core/dist_checkpointing/dict_utils.py new file mode 100644 index 0000000..95591cd --- /dev/null +++ b/megatron/core/dist_checkpointing/dict_utils.py @@ -0,0 +1,232 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Utilities for operating with dicts and lists. + +All functions in this module handle nesting of dicts and lists. +Other objects (e.g. tuples) are treated as atomic leaf types that cannot be traversed. +""" + +from collections import defaultdict +from typing import Any, Callable, Iterable, Optional, Tuple, Union + +import torch + + +def extract_matching_values( + x: Union[dict, list], predicate: Callable[[Any], bool], return_lists_as_dicts: bool = False +) -> Tuple[Union[dict, list], Union[dict, list]]: + """ Return matching and nonmatching values. Keeps hierarchy. + + Args: + x (Union[dict, list]) : state dict to process. Top-level argument must be a dict or list + predicate (object -> bool): determines matching values + return_lists_as_dicts (bool): if True, matching lists will be turned + into dicts, with keys indicating the indices of original elements. + Useful for reconstructing the original hierarchy. + """ + + def _set_elem(target, k, v): + if return_lists_as_dicts: + target[k] = v + else: + target.append(v) + + if isinstance(x, dict): + matching_vals = {} + nonmatching_vals = {} + for k, v in x.items(): + if isinstance(v, (list, dict)): + match, nonmatch = extract_matching_values(v, predicate, return_lists_as_dicts) + if match: + matching_vals[k] = match + if nonmatch or not v: + nonmatching_vals[k] = nonmatch + elif predicate(v): + matching_vals[k] = v + else: + nonmatching_vals[k] = v + elif isinstance(x, list): + matching_vals = {} if return_lists_as_dicts else [] + nonmatching_vals = {} if return_lists_as_dicts else [] + for ind, v in enumerate(x): + if isinstance(v, (list, dict)) and v: + match, nonmatch = extract_matching_values(v, predicate, return_lists_as_dicts) + if match: + _set_elem(matching_vals, ind, match) + if nonmatch or not v: + _set_elem(nonmatching_vals, ind, nonmatch) + else: + target = matching_vals if predicate(v) else nonmatching_vals + _set_elem(target, ind, v) + else: + raise ValueError(f'Unexpected top-level object type: {type(x)}') + return matching_vals, nonmatching_vals + + +def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]: + """ Recursive diff of dicts. + + Args: + x1 (object): left dict + x2 (object): right dict + prefix (tuple): tracks recursive calls. Used for reporting differing keys. + + Returns: + Tuple[list, list, list]: tuple of: + - only_left: Prefixes present only in left dict + - only_right: Prefixes present only in right dict + - mismatch: values present in both dicts but not equal across dicts. + For tensors equality of all elems is checked. + Each element is a tuple (prefix, type of left value, type of right value). + """ + mismatch = [] + if isinstance(x1, dict) and isinstance(x2, dict): + only_left = [prefix + (k,) for k in x1.keys() - x2.keys()] + only_right = [prefix + (k,) for k in x2.keys() - x1.keys()] + for k in x2.keys() & x1.keys(): + _left, _right, _mismatch = diff(x1[k], x2[k], prefix + (k,)) + only_left.extend(_left) + only_right.extend(_right) + mismatch.extend(_mismatch) + elif isinstance(x1, list) and isinstance(x2, list): + only_left = list(range(len(x1) - 1, len(x2) - 1, -1)) + only_right = list(range(len(x1) - 1, len(x2) - 1, -1)) + for i, (v1, v2) in enumerate(zip(x1, x2)): + _left, _right, _mismatch = diff(v1, v2, prefix + (i,)) + only_left.extend(_left) + only_right.extend(_right) + mismatch.extend(_mismatch) + else: + only_left = [] + only_right = [] + if isinstance(x1, torch.Tensor) and isinstance(x2, torch.Tensor): + _is_mismatch = not torch.all(x1 == x2) + else: + try: + _is_mismatch = bool(x1 != x2) + except RuntimeError: + _is_mismatch = True + + if _is_mismatch: + mismatch.append((prefix, type(x1), type(x2))) + + return only_left, only_right, mismatch + + +def inspect_types(x: Any, prefix: Tuple = (), indent: int = 4): + """ Helper to print types of (nested) dict values. """ + print_indent = lambda: print(' ' * indent * len(prefix), end='') + if isinstance(x, dict): + print() + for k, v in x.items(): + print_indent() + print(f'> {k}: ', end='') + inspect_types(v, prefix + (k,), indent) + elif isinstance(x, list): + print() + for i, v in enumerate(x): + print_indent() + print(f'- {i}: ', end='') + inspect_types(v, prefix + (i,), indent) + else: + if isinstance(x, torch.Tensor): + print(f'Tensor of shape {x.shape}') + else: + try: + x_str = str(x) + except: + x_str = '' + if len(x_str) > 30: + x_str = x_str[:30] + '... (truncated)' + print(f'[{type(x)}]: {x_str}') + + +def nested_values(x: Union[dict, list]): + """ Returns iterator over (nested) values of a given dict or list. """ + x_iter = x.values() if isinstance(x, dict) else x + for v in x_iter: + if isinstance(v, (dict, list)): + yield from nested_values(v) + else: + yield v + + +def nested_items_iter(x: Union[dict, list]): + """ Returns iterator over (nested) tuples (container, key, value) of a given dict or list. """ + x_iter = x.items() if isinstance(x, dict) else enumerate(x) + for k, v in x_iter: + if isinstance(v, (dict, list)): + yield from nested_items_iter(v) + else: + yield x, k, v + + +def dict_map(f: Callable, d: dict): + """ `map` equivalent for dicts. """ + for sub_d, k, v in nested_items_iter(d): + sub_d[k] = f(v) + + +def dict_map_with_key(f: Callable, d: dict): + """ `map` equivalent for dicts with a function that accepts tuple (key, value). """ + for sub_d, k, v in nested_items_iter(d): + sub_d[k] = f(k, v) + + +def dict_list_map_inplace(f: Callable, x: Union[dict, list]): + """ Maps dicts and lists *in-place* with a given function. """ + if isinstance(x, dict): + for k, v in x.items(): + x[k] = dict_list_map_inplace(f, v) + elif isinstance(x, list): + x[:] = (dict_list_map_inplace(f, v) for v in x) + else: + return f(x) + return x + + +def dict_list_map_outplace(f: Callable, x: Union[dict, list]): + """ Maps dicts and lists *out-of-place* with a given function. """ + if isinstance(x, dict): + return {k: dict_list_map_outplace(f, v) for k, v in x.items()} + elif isinstance(x, list): + return [dict_list_map_outplace(f, v) for v in x] + else: + return f(x) + + +def merge(x1: dict, x2: dict, key: Tuple[str, ...] = ()): + """ Merges dicts and lists recursively. """ + if isinstance(x1, dict) and isinstance(x2, dict): + for k, v2 in x2.items(): + if k not in x1: + x1[k] = v2 + else: + x1[k] = merge(x1[k], v2, key=key + (k,)) + elif isinstance(x1, list) and isinstance(x2, list): + if len(x1) != len(x2): + raise ValueError( + f'Cannot merge two lists with different lengths ({len(x1)} and {len(x2)}, encountered at level {key})' + ) + for i, v2 in enumerate(x2): + x1[i] = merge(x1[i], v2, key=key + (i,)) + else: + raise ValueError( + f'Duplicate non-dict and non-list values encountered: `{x1}` and `{x2}` (at level {key})' + ) + return x1 + + +def map_reduce( + xs: Iterable, + key_fn: Callable = lambda x: x, + value_fn: Callable = lambda x: x, + reduce_fn: Callable = lambda x: x, +) -> dict: + """ Simple map-reduce implementation following `more_itertools.map_reduce` interface. """ + res = defaultdict(list) + for x in xs: + res[key_fn(x)].append(value_fn(x)) + for k in res: + res[k] = reduce_fn(res[k]) + return dict(res) diff --git a/megatron/core/dist_checkpointing/mapping.py b/megatron/core/dist_checkpointing/mapping.py new file mode 100644 index 0000000..3393c3e --- /dev/null +++ b/megatron/core/dist_checkpointing/mapping.py @@ -0,0 +1,523 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Core library classes for representing sharding of tensors and objects. + +The main expected usage is wrapping torch.Tensors in state dicts with +ShardedTensor class (mostly with the ShardedTensor.from_rank_offsets classmethod). +""" + +import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass, field, replace +from itertools import chain +from typing import Any, Callable, Dict, Optional, Tuple, Union + +import numpy as np +import torch + +from .core import CheckpointingException +from .dict_utils import dict_list_map_inplace, dict_list_map_outplace + +logger = logging.getLogger(__name__) + +# These type definitions are just hints to differentiate a plain model state +# dict (StateDict) from a state dict with tensors replaced with ShardedTensors +# (ShardedStateDict). +StateDict = Dict[str, Any] +ShardedStateDict = Dict[str, Any] +ReplicaId = Union[int, Tuple[int, ...]] + + +class ShardedBase(ABC): + key: str + data: object + replica_id: ReplicaId + + @abstractmethod + def validate_metadata_integrity(self): + """Codifies the constraints on metadata attributes.""" + + @abstractmethod + def without_data(self) -> 'ShardedBase': + raise NotImplementedError + + +@dataclass +class ShardedTensor(ShardedBase): + """Represents a mapping between a local tensor and a global tensor. + + Global tensor is assumed to consist of many local tensors distributed + between different processes. + + Args: + key: unique identifier of a global tensor + data: local tensor data. Can be None only for consistency validation + dtype: tensor dtype + local_shape: local tensor shape + global_shape: global tensor shape + global_offset: offset of a local tensor in a global tensor, specified in number of tensor elements + axis_fragmentations: global tensor fragmentation of each axis + replica_id: indicates given local tensor's replication wrt. local tensors in different processes + prepend_axis_num: number of axes prepended to the local tensor to reflect global tensor shape. The behavior is similar to unsqueezing the local tensor. + allow_shape_mismatch: if True, during loading, the global shape of a stored tensor does not have to match the expected global shape. Useful for representing tensors with flexible shape, e.g. padded. + flattened_range: specifies a slice that should be applied to a flattened tensor with `local_shape` in order to get the tensor stored as `data` + """ + + key: str + data: Optional[torch.Tensor] = field(repr=False) + dtype: torch.dtype + local_shape: Tuple[int, ...] + global_shape: Tuple[int, ...] + global_offset: Tuple[int, ...] + axis_fragmentations: Optional[Tuple[int, ...]] + replica_id: ReplicaId = 0 + prepend_axis_num: int = 0 + allow_shape_mismatch: bool = False + flattened_range: Optional[slice] = None + + def __post_init__(self): + self.validate_metadata_integrity() + + def validate_metadata_integrity(self) -> None: + """Codifies the constraints on metadata attributes. + + Meeting those constraints is guaranteed when instantiating a ShardedTensor + class with `from_rank_offsets` or `from_rank_offsets_flat` constructors. + + Returns: + None + """ + has_flattened_range = self.flattened_range is not None + if self.data is not None: + if self.data.dtype != self.dtype: + raise CheckpointingException( + f'Data dtype should match `dtype` attribute for {self}' + ) + if not has_flattened_range and self.data.shape != self.local_shape: + raise CheckpointingException( + f'Data shape should match `local_shape` attribute for {self}' + ) + if has_flattened_range: + if self.data.ndim != 1: + raise CheckpointingException(f'Data should be 1D for a flattened {self}') + real_data = self.data + try: + self.data = None + self.init_data(device='meta') + if self.data.shape != real_data.shape: + raise CheckpointingException( + f'Data shape doesnt match expected {self.data.shape} for {self}' + ) + finally: + self.data = real_data + + if len(self.global_shape) != len(self.global_offset): + raise CheckpointingException( + f'Global offset dimensions should be equal to global shape dimensions for {self}' + ) + if len(self.local_shape) + self.prepend_axis_num != len(self.global_shape): + raise CheckpointingException( + f'Local shape together with `prepend_axis_num` dimensions should be equal to global shape dimensions for {self}' + ) + + for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape): + if off % sh != 0: + raise CheckpointingException( + f'Global offset ({off}) must be divisible by local shape ({sh}) for {self}.' + ) + + if has_flattened_range and self.flattened_range.step is not None: + raise CheckpointingException( + f'`step` argument in the flattened range of a ShardedTensor is not supported.' + ) + + def global_slice(self) -> Tuple[Union[int, slice], ...]: + assert len(self.global_offset) == len(self.local_shape) + self.prepend_axis_num + return tuple( + chain( + (off for off in self.global_offset[: self.prepend_axis_num]), + ( + slice(off, off + sh) + for off, sh in zip( + self.global_offset[self.prepend_axis_num :], self.local_shape + ) + ), + ) + ) + + def global_coordinates(self) -> Tuple[np.ndarray, ...]: + if self.flattened_range is None: + raise CheckpointingException( + f'`global_coordinates` is undefined for' + f' {self.__class__.__name__} without `flattened_range`' + ) + + local_coords = self.local_coordinates() + assert len(local_coords) + self.prepend_axis_num == len(self.global_offset), ( + len(local_coords), + self, + ) + global_coords = tuple( + c + off + for c, off in zip((0,) * self.prepend_axis_num + local_coords, self.global_offset) + ) + return global_coords + + def local_coordinates(self) -> Tuple[np.ndarray, ...]: + if self.flattened_range is None: + raise CheckpointingException( + f'`local_coordinates` is undefined for' + f' {self.__class__.__name__} without `flattened_range`' + ) + + # TODO: np.unravel_index? + mask = np.zeros(np.product(self.local_shape), dtype=bool) + mask[self.flattened_range] = True + return np.nonzero(mask.reshape(self.local_shape)) + + def local_chunk_offset_in_global(self) -> Tuple[int, ...]: + """Offset of a local chunk in a global array of chunks. + + Returns: + Tuple[int, ...]: the offset of the whole local chunk in a global array of chunks. + """ + assert len(self.global_offset) == len(self.local_shape) + self.prepend_axis_num + chunk_offset = list(self.global_offset[: self.prepend_axis_num]) + for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape): + assert off % sh == 0, str(self) + chunk_offset.append(off // sh) + return tuple(chunk_offset) + + def max_allowed_chunks(self) -> Tuple[int, ...]: + chunks = [] + for axis_sh, axis_fragm in zip(self.global_shape, self.axis_fragmentations): + if not self.allow_shape_mismatch and axis_sh % axis_fragm != 0: + raise CheckpointingException( + f'Axis shape ({axis_sh}) not divisible by axis fragmentation ({axis_fragm}' + ) + axis_chunk_size = axis_sh // axis_fragm + chunks.append(axis_chunk_size) + return tuple(chunks) + + def without_data(self): + return replace(self, data=None) + + @classmethod + def from_rank_offsets( + cls, + key: str, + data: torch.Tensor, + *rank_offsets: Tuple[int, int, int], + replica_id: ReplicaId = 0, + prepend_axis_num: int = 0, + flattened_range: None = None, + **init_kwargs, + ): + """Allows to construct the ShardedTensor given offset specified in process ranks. + + Args: + key (str): unique key + data (torch.Tensor): local tensor data + rank_offsets (Tuple[int, int, int]): each tuple (axis, axis_rank_offset, axis_fragm) says that if global tensor is divided into `axis_fragm` fragment along `axis` axis, then local tensor data corresponds to the `axis_rank_offset` chunk. + replica_id (ReplicaId): see ShardedTensor + prepend_axis_num (int): see ShardedTensor + flattened_range (None): must be None when using this constructor + init_kwargs: passed to ShardedTensor.__init__ + """ + if flattened_range is not None: + raise ValueError( + 'Cannot instantiate a flat ShardedTensor with `from_rank_offsets` method.' + ' Use `from_rank_offsets_flat` instead' + ) + global_offset = [0] * (data.ndim + prepend_axis_num) + global_shape = ([1] * prepend_axis_num) + list(data.shape) + axis_fragmentations = [1] * (data.ndim + prepend_axis_num) + _seen_axis = set() + for axis, axis_rank_offset, axis_fragm in rank_offsets: + assert axis >= 0 and axis_rank_offset >= 0 and axis_fragm >= 0, ( + axis, + axis_rank_offset, + axis_fragm, + ) + assert ( + axis_rank_offset < axis_fragm + ), 'Rank offset must be lower than axis fragmentation' + if axis in _seen_axis: + raise CheckpointingException('Duplicated axis specified') + _seen_axis.add(axis) + + local_axis_shape = 1 if axis < prepend_axis_num else data.shape[axis - prepend_axis_num] + global_shape[axis] = axis_fragm * local_axis_shape + global_offset[axis] = axis_rank_offset * local_axis_shape + axis_fragmentations[axis] = axis_fragm + + return cls( + key, + data, + data.dtype, + tuple(data.shape), + tuple(global_shape), + tuple(global_offset), + tuple(axis_fragmentations), + replica_id, + prepend_axis_num, + flattened_range=flattened_range, + **init_kwargs, + ) + + @classmethod + def from_rank_offsets_flat( + cls, + key: str, + data: torch.Tensor, + non_flat_local_shape: Tuple[int, ...], + *args, + flattened_range: Optional[slice] = None, + **kwargs, + ): + """Allows to construct a *flattened* ShardedTensor given offset specified in process ranks. + + Args: + key (str): + data (torch.Tensor): this should be a flattened data tensor + non_flat_local_shape (Tuple[int, ...]): expected local shape of a non-flat chunk + *args: passed unchanged to the `from_rank_offsets` constructor + flattened_range (slice): see ShardedTensor. Defaults to None, but must be set to + a non-None slice. + **kwargs: + + Returns: + ShardedTensor: constructed ShardedTensor instance + """ + if flattened_range is None: + raise CheckpointingException( + 'Cannot instantiate a non-flat ShardedTensor with `from_rank_offsets_flat` method.' + ' Use `from_rank_offsets` instead' + ) + if data.ndim != 1: + raise CheckpointingException( + f'Flattened ShardedTensor requires 1D data, got shape: {data.shape}' + ) + if flattened_range.stop - flattened_range.start != data.numel(): + raise CheckpointingException( + f'Flattened ShardedTensor data length ({data.numel()}) must meet the slice length: {flattened_range.stop - flattened_range.start}' + ) + + non_flat_data_meta = torch.empty(*non_flat_local_shape, dtype=data.dtype, device='meta') + sh_ten = cls.from_rank_offsets(key, non_flat_data_meta, *args, **kwargs) + instance = replace(sh_ten, data=data, flattened_range=flattened_range) + instance.validate_metadata_integrity() + return instance + + def init_data(self, device: Union[str, torch.device], init_fn=torch.empty): + if self.data is not None: + return + self.data = init_fn(self.local_shape, dtype=self.dtype, device=device) + if self.flattened_range is not None: + self.data = self.data.flatten()[self.flattened_range.start : self.flattened_range.stop] + + +def is_main_replica(replica_id: ReplicaId): + """Checks if given `replica_id` is considered as main. + + "Main" replica is: + - integer 0 + - or an iterable with all 0 elements + + It is the application responsibility to set correct replicas for sharded tensors. + + Args: + replica_id (Union[int, Tuple[int, ...]]): replica id + + Returns: + (bool): True for a "main" replica + """ + if isinstance(replica_id, int): + return replica_id == 0 + return all(r == 0 for r in replica_id) + + +class LocalNonpersistentObject: + """Object that should not be stored in a checkpoint, but restored locally. + + Wrapping any object inside the state dict with LocalNonpersistentObject + will result in: + - during saving, this object will *not* be stored in the checkpoint + - during loading, a local version of this object will be placed in a state dict + """ + + def __init__(self, obj): + self.obj = obj + + def unwrap(self): + return self.obj + + +# TODO: Delete once NeMo fixes typo. +LocalNonpersitentObject = LocalNonpersistentObject + + +@dataclass +class ShardedObject(ShardedBase): + """Represents a mapping between a local object and a global object. + + Global object is assumed to consist of many local objects distributed + between different processes. + + NOTE: Contrary to ShardedTensor, it's impossible to change global object + sharding. Conceptually, ShardedObject is a fully-sharded ShardedTensor + with atomic arbitrary typed elements. + + Args: + key: unique identifier of a global tensor + data: local object data. Can be None only for consistency validation + global_shape: global object shape + global_offset: offset of a local object in a global object, specified in number of shards + replica_id: indicates local object replication wrt. local objects in different processes + """ + + key: str + data: object + global_shape: Tuple[int, ...] + global_offset: Tuple[int, ...] + replica_id: ReplicaId = 0 + + def __post_init__(self): + self.validate_metadata_integrity() + + def validate_metadata_integrity(self): + if len(self.global_shape) != len(self.global_offset): + raise CheckpointingException( + f'Global offset dimensions should be equal to global shape dimensions for {self}' + ) + + def without_data(self): + return replace(self, data=None) + + @property + def unique_key(self): + return f'{self.key}/shard_{".".join(map(str, self.global_offset))}_{".".join(map(str, self.global_shape))}' + + def __str__(self): + return f'{self.__class__.__name__}(key=\'{self.key}\')' + + @classmethod + def empty_from_unique_key(cls, unique_key, replica_id: ReplicaId = 0) -> 'ShardedObject': + key, shard_key = unique_key.split('/') + shard_str, offset, shape = shard_key.split('_') + assert shard_str == 'shard' + offset = tuple(map(int, offset.split('.'))) + shape = tuple(map(int, shape.split('.'))) + if len(shape) + 1 == len(offset): + # This is a backward-compatible fix. We don't know the last element of global shape so set it to -1. + shape += (-1,) + return cls(key, None, shape, offset, replica_id) + + +@dataclass +class ShardedTensorFactory(ShardedBase): + """Allows to apply transformations to tensors before/after serialization. + + The essence of those transformations is that they can be applied to + optimizer states the same way they are applied to the model params. + The ultimate state dict with sharded tensors must depend functionally on + `build_fn` arguments (key, data, replica_id, flattened_range), + which will be provided by the optimizer. + + Builder creates a sub-state-dict out of a tensor before saving, and merger + merges the corresponding state dict after loading. + + Args: + key (str): unique identifier of the factory + data (torch.Tensor): original model parameter that will be further transformed by this factory + build_fn (callable): function that transforms the original tensor to a sharded state dict + merge_fn (callable): function that transforms loaded subtree back into a single tensor (inverse of `build_fn`) + replica_id (ReplicaId): indicates factory replication wrt. factories in different processes + flattened_range (slice, optional): indicates additional flattening applied to the ShardedTensors produced by the factory + """ + + key: str + data: torch.Tensor + build_fn: Callable[[str, torch.Tensor, ReplicaId, Optional[slice]], ShardedStateDict] + merge_fn: Callable[[StateDict], torch.Tensor] + replica_id: ReplicaId = 0 + flattened_range: Optional[slice] = None + + def build(self): + return self.build_fn(self.key, self.data, self.replica_id, self.flattened_range) + + def validate_metadata_integrity(self): + """No reasonable checks can be applied""" + pass + + def without_data(self): + return replace(self, data=None) + + +def apply_factories(sharded_state_dict: ShardedStateDict): + """Turn ShardedTensorFactories into ShardedTensors *in-place*. + + Args: + sharded_state_dict (ShardedStateDict): state dict possibly containing ShardedTensorFactory objects + + Returns: + None: state dict is modified in place + """ + + def apply(x): + if isinstance(x, ShardedTensorFactory): + x = x.build() + return x + + dict_list_map_inplace(apply, sharded_state_dict) + + +def apply_factory_merges( + x1: StateDict, x2: ShardedStateDict, key: Tuple[str, ...] = () +) -> StateDict: + """Apply merges defined by ShardedTensorFactories *in-place*. + + Args: + x1 (StateDict): state dict loaded from the checkpoint + x2 (ShardedStateDict): subset of `x1` (in terms of dict keys) with ShardedTensorFactory + as (possibly nested) values that define how to merge objects from the `x1` state dict + key (Tuple[str, ...]): current key in a recursive call. Used only for reporting meaningful errors + + Returns: + StateDict: `x1` modified in-place + """ + if isinstance(x2, ShardedTensorFactory): + return x2.merge_fn(x1) + + # There rest is almost the same as the `merge` function from `dict_utils` + if isinstance(x1, dict) and isinstance(x2, dict): + for k, v2 in x2.items(): + if k not in x1: + raise ValueError( + f'Different dict keys encountered in `apply_factory_merges` ({x1.keys()} vs {x2.keys()})' + ) + else: + x1[k] = apply_factory_merges(x1[k], v2, key=key + (k,)) + elif isinstance(x1, list) and isinstance(x2, list): + if len(x1) != len(x2): + err_msg = f'Cannot merge two lists with different lengths ({len(x1)} and {len(x2)}, encountered at key {key})' + logger.error(err_msg + f'\nx1: {x1}\nx2: {x2}') + raise ValueError(err_msg) + for i, v2 in enumerate(x2): + x1[i] = apply_factory_merges(x1[i], v2, key=key + (i,)) + elif isinstance(x1, list) and isinstance(x2, dict): + for k, v2 in x2.items(): + if not isinstance(k, int): + raise ValueError( + f'Invalid dict key {k} non-integer type encountered in a list-dict merge at level {key}' + ) + if k >= len(x1): + raise ValueError( + f'Dict key {k} out of bound for list of length {len(x1)} (encountered at level {key})' + ) + x1[k] = apply_factory_merges(x1[k], v2, key=key + (k,)) + else: + raise ValueError( + f'Duplicate non-dict and non-list values encountered: `{x1}` and `{x2} (at key {key})`' + ) + return x1 diff --git a/megatron/core/dist_checkpointing/optimizer.py b/megatron/core/dist_checkpointing/optimizer.py new file mode 100644 index 0000000..2d231a2 --- /dev/null +++ b/megatron/core/dist_checkpointing/optimizer.py @@ -0,0 +1,129 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Helpers for defining sharding for optimizer states based on existing sharding for model parameters. """ + +import logging +from copy import deepcopy +from dataclasses import replace +from itertools import chain +from typing import Dict, Iterable, List, Tuple, Union + +logger = logging.getLogger(__name__) + +import torch + +from .dict_utils import nested_values +from .mapping import ( + LocalNonpersistentObject, + ShardedStateDict, + ShardedTensor, + ShardedTensorFactory, + StateDict, +) +from .utils import extract_sharded_tensors_and_factories + + +def get_optim_param_to_id_map(optim_params_iter: Iterable[torch.nn.Parameter]) -> Dict[int, int]: + param_mappings = {} + for i, param in enumerate(optim_params_iter): + if id(param) not in param_mappings: + param_mappings[id(param)] = i + return param_mappings + + +def get_param_id_to_sharded_param_map( + model_sharded_state_dict: ShardedStateDict, optim_params_iter: Iterable[torch.nn.Parameter] +) -> Dict[int, Union[ShardedTensor, ShardedTensorFactory]]: + """Generate mapping from optimizer state ids to model sharded parameters. + + Args: + model_sharded_state_dict: sharded state dict with all model sharded tensors (can have any structure) + optim_params_iter: iterable which iterates over model parameters tracked by the optimizer. + The iteration must be in the same order as in the optimizer parameters. + + Returns: + Dict[int, Union[ShardedTensor, ShardedTensorFactory]]: mapping from optimizer state ids + to model sharded parameters. + """ + model_sharded_state_dict, _ = extract_sharded_tensors_and_factories(model_sharded_state_dict) + id_to_sharded_param_map = {} + param_to_id_map = get_optim_param_to_id_map(optim_params_iter) + for ten in nested_values(model_sharded_state_dict): + if id(ten.data) in param_to_id_map: + id_to_sharded_param_map[param_to_id_map[id(ten.data)]] = ten + else: + logger.debug(f'{ten} is not tracked by the optimizer') + + if not id_to_sharded_param_map: + logger.warning( + "Sharded parameters mapping is empty. It means tensors in model state dict" + " do not correspond to tensors in optimizer parameters map." + " Make sure to call state_dict with `keep_vars=True`." + ) + return id_to_sharded_param_map + + +def make_sharded_optimizer_tensor( + model_param: Union[ShardedTensor, ShardedTensorFactory], optim_param: torch.Tensor, prefix: str +) -> Union[ShardedTensor, ShardedTensorFactory]: + """Build a ShardedTensor or ShardedTensorFactory for optimizer param based on model param + + Args: + model_param (Union[ShardedTensor, ShardedTensorFactory]): model param + optim_param (torch.Tensor): corresponding optimizer param + prefix (str): optimizer prefix for the ShardedTensor or ShardedTensorFactory + + Returns: + Union[ShardedTensor, ShardedTensorFactory]: wrapped optimizer parameter + """ + if isinstance(model_param, ShardedTensorFactory): + return replace(model_param, key=f'{prefix}.{model_param.key}', data=optim_param) + + assert ( + tuple(optim_param.shape) == model_param.local_shape + ), f'Optimizer shape ({tuple(optim_param.shape)} does not match model shape ({model_param.local_shape})' + sh_ten = replace( + model_param, key=f'{prefix}.{model_param.key}', data=optim_param, dtype=optim_param.dtype + ) + sh_ten.validate_metadata_integrity() + return sh_ten + + +def optim_state_to_sharding_state( + optim_state_dict: StateDict, + id_to_sharded_param_map: Dict[int, ShardedTensor], + exclude_keys: Tuple[str] = (), +): + """Turn optimizer state dict to sharded state dict based on model state dict *in-place*. + + Can be used to add sharding information to most common optimizer state dict. + Creates separate ShardedTensors for each key in `optim_state_dict['state']` + (e.g. for torch.optim.Adam there will be separate tensors for `exp_avg` and `exp_avg_sq`) + + Args: + optim_state_dict (StateDict): optimizer state dict with + state parameters under `state` key and group hyperparameters under `param_groups` -> `params` key. + id_to_sharded_param_map (Dict[int, ShardedTensor]): mapping from optimizer param ids to model sharded tensors. + Can be generated with `get_param_id_to_sharded_param_map` function + exclude_keys (Tuple[str]): optimizer state keys to exclude from the final state dict. + + Returns: + None: state dict is modified in place + """ + sharded_state = {} + for param_id, param_state in optim_state_dict['state'].items(): + sharded_state[param_id] = {} + for state_key, param in param_state.items(): + if state_key in exclude_keys: + continue + if param_id in id_to_sharded_param_map: + sharded_state[param_id][state_key] = make_sharded_optimizer_tensor( + id_to_sharded_param_map[param_id], param, prefix=f'optimizer.state.{state_key}' + ) + else: + raise ValueError(f'Param id {param_id} does not match any model sharded param') + + optim_state_dict['param_groups'] = deepcopy(optim_state_dict['param_groups']) + for group in optim_state_dict['param_groups']: + group['params'] = LocalNonpersistentObject(group['params']) + optim_state_dict['state'] = sharded_state diff --git a/megatron/core/dist_checkpointing/serialization.py b/megatron/core/dist_checkpointing/serialization.py new file mode 100644 index 0000000..f37aadc --- /dev/null +++ b/megatron/core/dist_checkpointing/serialization.py @@ -0,0 +1,420 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Entrypoints for saving and loading the distributed checkpoints. + +Functions `load` and `save` are equivalents of `torch.load` and `torch.save` +but expect torch.Tensors to be wrapped with classes from the `mapping module`. +Additionally, `load` expects the sharded state dict argument as a guidance for loading the sharded tensors. +""" + +import logging +from pathlib import Path +from typing import Dict, Optional, Set, Tuple, Union + +import torch + +from . import ShardedTensor +from .core import CheckpointingConfig, save_config +from .dict_utils import dict_list_map_inplace, extract_matching_values, merge +from .mapping import ( + CheckpointingException, + ShardedObject, + ShardedStateDict, + ShardedTensorFactory, + StateDict, + apply_factories, + apply_factory_merges, +) +from .strategies.async_utils import AsyncRequest +from .strategies.base import ( + AsyncSaveShardedStrategy, + LoadCommonStrategy, + LoadShardedStrategy, + SaveCommonStrategy, + SaveShardedStrategy, + StrategyAction, + get_default_strategy, +) +from .utils import extract_nonpersistent, extract_sharded_base +from .validation import ( + StrictHandling, + determine_global_metadata, + parse_strict_flag, + validate_integrity_and_strict_load, + validate_sharded_objects_handling, + validate_sharding_integrity, + verify_checkpoint_and_load_strategy, +) + +logger = logging.getLogger(__name__) + + +# flat state dict with sharded objects without any data +CkptShardedMetadata = Dict[str, Union[ShardedTensor, ShardedObject]] + + +def load( + sharded_state_dict: ShardedStateDict, + checkpoint_dir: str, + sharded_strategy: Union[LoadShardedStrategy, Tuple[str, int], None] = None, + common_strategy: Union[LoadCommonStrategy, Tuple[str, int], None] = None, + validate_access_integrity: bool = True, + strict: Union[str, StrictHandling] = StrictHandling.ASSUME_OK_UNEXPECTED, +) -> Union[StateDict, Tuple[StateDict, Set[str], Set[str]]]: + """Loading entrypoint. + + In the steps below, the following verbs refer to corresponding objects: + - load = load from checkpoint + - extract = extract from sharded_state_dict + - add = add to the final state dict + Steps: + 1. Load common state dict and form the base of the result state dict + 2. Apply factories to sharded_state_dict + 3. Extract LocalNonPersistentObject and add + 4. (optional) Extract ShardedObjects, load and add + 5. Extract ShardedBase, load, apply factory merges and add + + Args: + sharded_state_dict (ShardedStateDict): state dict of the existing model + populated with ShardedTensors. Used as a mapping to determine which + parts of global tensors stored in the checkpoint should be loaded. + checkpoint_dir (str): directory with the checkpoint + sharded_strategy (LoadShardedStrategy, Tuple[str, int], optional): configures loading behavior for sharded tensors + common_strategy (LoadCommonStrategy, Tuple[str, int], optional): configures loading behavior for common data + validate_access_integrity (bool default = True): checks if each tensor shard is accessed + exactly once (as main replica) by some process + strict (StrictHandling, str, optional): determines the behavior in case of a mismatch + between the requested sharded state dict and the checkpoint. See `StrictHandling` docs + for more details. Some values affect the return value of this function + (missing and unexpected keys are returned). + Defaults to `True` (StrictHandling.ASSUME_OK_UNEXPECTED) which doesn't + incur any performance overhead. Other recommended values + are: `False` (StrictHandling.LOG_UNEXPECTED) which logs only unexpected keys + or `StrictHandling.RETURN_ALL` which returns all mismatch keys. + + Returns: + StateDict or Tuple[StateDict, Set[str], Set[str]]: in most cases only + the loaded state dict is returned. If `strict` flag was set to + """ + sharded_strategy, common_strategy = verify_checkpoint_and_load_strategy( + checkpoint_dir, sharded_strategy, common_strategy + ) + + checkpoint_dir = Path(checkpoint_dir) + common_state_dict = common_strategy.load_common(checkpoint_dir) + if not sharded_state_dict: + return common_state_dict + + # Create a copy of sharded_state_dict as the passed in state dict may have + # references that prevent tensors from being deallocated + sharded_state_dict, _ = extract_matching_values(sharded_state_dict, lambda x: True) + + sh_ten_factories, _ = extract_matching_values( + sharded_state_dict, + lambda x: isinstance(x, ShardedTensorFactory), + return_lists_as_dicts=True, + ) + apply_factories(sharded_state_dict) + + # Data inside sh_ten_factories no longer needed so delete them to reduce memory usage + dict_list_map_inplace(ShardedTensorFactory.without_data, sh_ten_factories) + # Non-persistent objects + nonpersistent_state_dict, sharded_state_dict = extract_nonpersistent(sharded_state_dict) + dict_list_map_inplace(lambda o: o.unwrap(), nonpersistent_state_dict) + merge(common_state_dict, nonpersistent_state_dict) + + # At this point we are only dealing with ShardedBase objects + sharded_state_dict, _ = extract_sharded_base(sharded_state_dict) + + # Validation + ckpt_sharded_metadata = None + local_metadata, global_metadata = None, None + strict = parse_strict_flag(strict) + if StrictHandling.requires_explicit_ckpt_mismatch_check(strict): + ckpt_sharded_metadata = load_sharded_metadata( + str(checkpoint_dir), sharded_strategy, common_strategy + ) + if validate_access_integrity or StrictHandling.requires_global_app_metadata(strict): + local_metadata, global_metadata = determine_global_metadata(sharded_state_dict) + + sharded_state_dict, missing_keys, unexpected_keys = validate_integrity_and_strict_load( + sharded_state_dict, + strict, + validate_access_integrity, + local_metadata, + global_metadata, + ckpt_sharded_metadata, + ) + + # ShardedBase loading + if not sharded_strategy.can_handle_sharded_objects: + validate_sharded_objects_handling(sharded_strategy, common_strategy) + sharded_objects_state_dict, sharded_state_dict = extract_matching_values( + sharded_state_dict, lambda v: isinstance(v, ShardedObject) + ) + sharded_objects = common_strategy.load_sharded_objects( + sharded_objects_state_dict, checkpoint_dir + ) + merge(common_state_dict, sharded_objects) + + loaded_state_dict = sharded_strategy.load(sharded_state_dict, checkpoint_dir) + + loaded_state_dict = apply_factory_merges(loaded_state_dict, sh_ten_factories) + + merge(common_state_dict, loaded_state_dict) + if StrictHandling.requires_returning_mismatch_keys(strict): + return common_state_dict, missing_keys, unexpected_keys + else: + return common_state_dict + + +def load_common_state_dict(checkpoint_dir: Path) -> StateDict: + """Load common (non-sharded) objects state dict from the checkpoint. + + Args: + checkpoint_dir (Path): checkpoint directory + + Returns: + StateDict: state dict with non-sharded objects from the checkpoint + """ + sharded_strategy, common_strategy = verify_checkpoint_and_load_strategy(str(checkpoint_dir)) + return common_strategy.load_common(checkpoint_dir) + + +def load_tensors_metadata( + checkpoint_dir: str, + sharded_strategy: Union[LoadShardedStrategy, None] = None, +) -> CkptShardedMetadata: + """Load tensors metadata from the checkpoint. + + Returns a dictionary similar to a sharded state dict, but note that + the dictionary keys are simply ShardedTensor keys (contrary to the + actual sharded state dicts where keys correspond to state dict keys). + + Dict values are ShardedTensors without any sharding (so, the only useful + information is tensors global shape and dtype). + + Concrete implementation depends on the loading strategy. If no strategy is + given, a default for a given backend is used. + + Args: + checkpoint_dir (str): checkpoint directory to load from + sharded_strategy (LoadShardedStrategy, optional): sharded strategy to load metadata. + Defaults to None - in this case a default load strategy for a given checkpoint type is used. + + Returns: + CkptShardedMetadata: flat state dict without data describing ShardedTensors in the checkpoint + """ + sharded_strategy, common_strategy = verify_checkpoint_and_load_strategy( + checkpoint_dir, sharded_strategy + ) + return sharded_strategy.load_tensors_metadata(Path(checkpoint_dir)) + + +def load_sharded_metadata( + checkpoint_dir: str, + sharded_strategy: Union[LoadShardedStrategy, None] = None, + common_strategy: Union[LoadCommonStrategy, None] = None, +) -> CkptShardedMetadata: + """Load sharded metadata from the checkpoint. + + Similar to `load_tensors_metadata`, but includes also ShardedObjects. + + Returns a dictionary similar to a sharded state dict, but note that + the dictionary keys are simply ShardedTensor keys (contrary to the + actual sharded state dicts where keys correspond to state dict keys). + + Dict values are ShardedTensors without any sharding (so, the only useful + information is tensors global shape and dtype). + + Concrete implementation depends on the loading strategy. If no strategy is + given, a default for a given backend is used. + + Args: + checkpoint_dir (str): checkpoint directory to load from + sharded_strategy (LoadShardedStrategy, optional): sharded strategy to load metadata. + Defaults to None - in this case a default load strategy for a given checkpoint type is used. + common_strategy (LoadCommonStrategy, optional): common strategy to load metadata. + Defaults to None - in this case a default load strategy for a given checkpoint type is used. + This strategy won't be used unless `sharded_strategy` can't handle ShardedObjects + + Returns: + CkptShardedMetadata: flat state dict without data describing ShardedTensors + and ShardedObjects in the checkpoint + """ + sharded_strategy, common_strategy = verify_checkpoint_and_load_strategy( + checkpoint_dir, sharded_strategy, common_strategy + ) + sharded_metadata = sharded_strategy.load_sharded_metadata(Path(checkpoint_dir)) + if not sharded_strategy.can_handle_sharded_objects: + validate_sharded_objects_handling(sharded_strategy, common_strategy) + common_metadata = common_strategy.load_sharded_metadata(Path(checkpoint_dir)) + sharded_metadata = merge(sharded_metadata, common_metadata) + return sharded_metadata + + +def load_plain_tensors(checkpoint_dir: str) -> StateDict: + """Load checkpoint tensors without any sharding and plain structure. + + NOTE: common state dict is NOT included. + + Args: + checkpoint_dir (str): checkpoint directory to load the tensors from. + + Returns: + StateDict: checkpoint state dict containing only torch.Tensors. + """ + sharded_state_dict = load_tensors_metadata(checkpoint_dir) + # Don't validate integrity because shards will be overlapped + # if world_size > 1 (all processes load whole tensors) + return load(sharded_state_dict, checkpoint_dir, validate_access_integrity=False) + + +# +# def load_plain_tensors_and_objects(checkpoint_dir: str) -> StateDict: +# """Load checkpoint tensors and objects without any sharding and plain structure. +# +# NOTE: state dict structure might be different than the one used for checkpoint saving. +# NOTE: common state dict is NOT included. +# +# Args: +# checkpoint_dir (str): checkpoint directory to load the state dict from. +# +# Returns: +# StateDict: complete checkpoint state dict without any sharding. +# """ +# sharded_state_dict = load_tensors_metadata(checkpoint_dir) +# # Don't validate integrity because shards will be overlapped +# # if world_size > 1 (all processes load whole tensors) +# return load(sharded_state_dict, checkpoint_dir, validate_access_integrity=False) + + +def save( + sharded_state_dict: ShardedStateDict, + checkpoint_dir: str, + sharded_strategy: Union[SaveShardedStrategy, Tuple[str, int], None] = None, + common_strategy: Union[SaveCommonStrategy, Tuple[str, int], None] = None, + validate_access_integrity: bool = True, + async_sharded_save: bool = False, +) -> Optional[AsyncRequest]: + """Saving entrypoint. + + Extracts ShardedTensors from the given state dict. Rank 0 saves the + "regular" part of the checkpoint to common torch file. + The ShardedTensors are saved according to a strategy specified by the + config. + + Steps: + 1. Apply factories + 2. Extract and discard LocalNonPersistentObject + 3. Extract all ShardedBase object + 4. Save all other objects to common.pt + 5. (optional) Extract and save ShardedObjects + 6. Save all ShardedBase objects + 7. Write metadata.json file with backend and version metadata. + + Step (6) can be performed asynchronously (see `async_sharded_save`), in this + case the actual save is embodied in the returned async request and can be + scheduled by the external caller. For async request, step (7) is added as + one of the finalization functions, so that metadata.json is written only + if the checkpoint is complete. + + Args: + sharded_state_dict (ShardedStateDict): state dict of the populated with + ShardedTensors. Used as a mapping to determine how local tensors + should be saved as global tensors in the checkpoint. + checkpoint_dir (str): directory to save the checkpoint to + sharded_strategy (SaveShardedStrategy, Tuple[str, int], optional): configures sharded tensors saving behavior and backend + common_strategy (SaveCommonStrategy, Tuple[str, int], optional): configures common data saving behavior and backend + validate_access_integrity (bool default = True): checks if each tensor shard is accessed + exactly once (as main replica) by some process + async_sharded_save (bool, optional): if True, for the sharded state dict part + an async save implementation will be called, with the AsyncRequest + being returned to the caller. Note that it is the caller responsibility to + actually schedule the async save. Defaults to False. + + Returns: + AsyncRequest (optional): if `async_sharded_save` is True, returns + async request that should be scheduled by the caller of this function. + None otherwise. + """ + checkpoint_dir = Path(checkpoint_dir) + + if torch.distributed.get_rank() == 0: + if not checkpoint_dir.exists(): + raise CheckpointingException( + f'Checkpoint destination directory does not exist: {checkpoint_dir}' + ) + + if next(checkpoint_dir.iterdir(), None) is not None: + raise CheckpointingException( + f'Checkpoint destination directory ({checkpoint_dir}) is not empty' + ) + + if common_strategy is not None: + raise NotImplementedError('The only supported common strategy is torch') + + if sharded_strategy is None: + sharded_strategy = get_default_save_sharded_strategy() + if not isinstance(sharded_strategy, SaveShardedStrategy): + assert isinstance(sharded_strategy, tuple), type(sharded_strategy) + sharded_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, *sharded_strategy) + + if common_strategy is None: + common_strategy = get_default_save_common_strategy() + if not isinstance(common_strategy, SaveCommonStrategy): + assert isinstance(common_strategy, tuple), type(common_strategy) + common_strategy = get_default_strategy(StrategyAction.SAVE_COMMON, *common_strategy) + + apply_factories(sharded_state_dict) + _, sharded_state_dict = extract_nonpersistent(sharded_state_dict) + sharded_state_dict, state_dict = extract_sharded_base(sharded_state_dict) + + common_strategy.save_common(state_dict, checkpoint_dir) + + if validate_access_integrity: + validate_sharding_integrity(determine_global_metadata(sharded_state_dict)[1]) + + if not sharded_strategy.can_handle_sharded_objects: + validate_sharded_objects_handling(sharded_strategy, common_strategy) + sharded_objects_state_dict, sharded_state_dict = extract_matching_values( + sharded_state_dict, lambda v: isinstance(v, ShardedObject) + ) + common_strategy.save_sharded_objects(sharded_objects_state_dict, checkpoint_dir) + + def metadata_finalize_fn(): + if torch.distributed.get_rank() == 0: + save_config( + CheckpointingConfig(sharded_strategy.backend, sharded_strategy.version), + checkpoint_dir, + ) + torch.distributed.barrier() + + if not async_sharded_save: + sharded_strategy.save(sharded_state_dict, checkpoint_dir) + metadata_finalize_fn() + return + + if not isinstance(sharded_strategy, AsyncSaveShardedStrategy): + raise CheckpointingException( + f'Cannot apply async_save to non-async strategy {sharded_strategy}' + ) + async_request = sharded_strategy.async_save(sharded_state_dict, checkpoint_dir) + async_request.finalize_fns.append(metadata_finalize_fn) + return async_request + + +def get_default_save_sharded_strategy( + backend: str = 'torch_dist', version: int = 1 +) -> SaveShardedStrategy: + return get_default_strategy(StrategyAction.SAVE_SHARDED, backend, version) + + +def get_default_save_common_strategy( + backend: str = 'torch', version: int = 1 +) -> SaveCommonStrategy: + return get_default_strategy(StrategyAction.SAVE_COMMON, backend, version) + + +def get_default_load_sharded_strategy(checkpoint_dir: str) -> LoadShardedStrategy: + return verify_checkpoint_and_load_strategy(checkpoint_dir)[0] diff --git a/megatron/core/dist_checkpointing/strategies/__init__.py b/megatron/core/dist_checkpointing/strategies/__init__.py new file mode 100644 index 0000000..db8093f --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Various loading and saving strategies """ + +from .common import _import_trigger diff --git a/megatron/core/dist_checkpointing/strategies/async_utils.py b/megatron/core/dist_checkpointing/strategies/async_utils.py new file mode 100644 index 0000000..24ee43d --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/async_utils.py @@ -0,0 +1,231 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +This module provides an async utilities which allow to start +a checkpoint save process in the background. +""" +import logging +from collections import deque +from time import time +from typing import Callable, List, NamedTuple, Optional, Tuple + +import torch +from torch import multiprocessing as mp + +logger = logging.getLogger(__name__) + + +class AsyncRequest(NamedTuple): + """Represents an async request that needs to be scheduled for execution. + + Args: + async_fn (Callable, optional): async function to call. None represents noop. + async_fn_args (Tuple): args to pass to `async_fn`. + finalize_fns (List[Callable]): list of functions to call to finalize the request. + These functions will be called synchronously after `async_fn` is done + *on all ranks*. + """ + + async_fn: Optional[Callable] + async_fn_args: Tuple + finalize_fns: List[Callable] + is_frozen: bool = False + + def add_finalize_fn(self, fn: Callable) -> None: + """Adds a new finalize function to the request. + + Args: + fn (Callable): function to add to the async request. This function + will be called *after* existing finalization functions. + + Returns: + None + """ + if self.is_frozen: + raise RuntimeError('Cannot add finalization functions to a frozen AsyncRequest') + self.finalize_fns.append(fn) + + def execute_sync(self) -> None: + """Helper to synchronously execute the request. + + This logic is equivalent to what should happen in case of the async call. + """ + if self.async_fn is not None: + self.async_fn(*self.async_fn_args) + torch.distributed.barrier() + for finalize_fn in self.finalize_fns: + finalize_fn() + + def freeze(self) -> 'AsyncRequest': + """Freezes the async request, disallowing adding new finalization functions. + + Returns: + AsyncRequest: new async request with all same fields except for the + `is_frozen` flag. + """ + return self._replace(is_frozen=True) + + +class DistributedAsyncCaller: + """Wrapper around mp.Process that ensures correct semantic of distributed finalization. + + Starts process asynchronously and allows checking if all processes on all ranks are done. + """ + + def __init__(self): + self.process: Optional[mp.Process] = None + self.start_time: Optional[float] = None + + def schedule_async_call( + self, + async_fn: Optional[Callable], + save_args: Tuple, + ) -> None: + """Spawn a process with `async_fn` as the target. + + This method must be called on all ranks. + + Args: + async_fn (Callable, optional): async function to call. If None, + no process will be started. + save_args (Tuple): async function args. + """ + if async_fn is None: + return # nothing to do + start_sync = time() + torch.cuda.synchronize() + end_sync = time() + logger.debug( + f"rank: {torch.distributed.get_rank()}, takes {end_sync - start_sync} to finish D2H " + ) + + ctx = mp.get_context('fork') + self.start_time = time() + self.process = ctx.Process( + target=async_fn, + args=save_args, + ) + self.process.start() + init_time = time() + logger.debug( + f"rank: {torch.distributed.get_rank()}, takes {init_time - self.start_time} to schedule async ckpt " + ) + + def is_current_async_call_done(self, blocking=False) -> bool: + """Check if async save is finished on all ranks. + + For semantic correctness, requires rank synchronization in each check. + This method must be called on all ranks. + + Args: + blocking (bool, optional): if True, will wait until the call is done + on all ranks. Otherwise, returns immediately if at least one rank + is still active. Defaults to False. + + Returns: + bool: True if all ranks are done (immediately of after active wait + if `blocking` is True), False if at least one rank is still active. + """ + # The following takes the same overhead as torch.distributed.barrier (single integer all-reduce) + is_alive = int(self.process.is_alive()) if self.process is not None else 0 + ten = torch.tensor([is_alive], dtype=torch.int, device=torch.cuda.current_device()) + logger.debug( + f"rank: {torch.distributed.get_rank()}, DistributedAsyncCaller is_alive: {is_alive}" + ) + torch.distributed.all_reduce(ten) + if ten[0] > 0 and not blocking: + return False + else: + if self.process is not None: + logger.debug(f"rank: {torch.distributed.get_rank()}, joining self.process") + self.process.join() + self.process = None + + logger.debug( + f"DistributedAsyncCaller: Async process join finished after {time() - self.start_time:.2f}s from forking" + ) + self.start_time = None + return True + + +class _ActiveAsyncRequest(NamedTuple): + """Helper to represent an active async call. + + Args: + idx (int): index of the call (starting from 0) + async_caller (DistributedAsyncCaller): async caller instance that represents + the async process handling the async request + async_request (AsyncRequest): async request that is being called + """ + + idx: int + async_caller: DistributedAsyncCaller + async_request: AsyncRequest + + +class AsyncCallsQueue: + """Manages a queue of async calls. + + Allows adding a new async call with `schedule_async_request` and finalizing + active calls with `maybe_finalize_async_calls`. + """ + + def __init__(self): + self.async_calls: deque[_ActiveAsyncRequest] = deque([]) + self.call_idx: int = -1 + + def schedule_async_request(self, async_request: AsyncRequest) -> int: + """Start a new async call and add it to a queue of active async calls. + + This method must be called on all ranks. + + Args: + async_request (AsyncRequest): async request to start. + + Returns: + int: index of the async call that was started. + This can help the user keep track of the async calls. + """ + self.call_idx += 1 + async_caller = DistributedAsyncCaller() + async_request = async_request.freeze() + async_caller.schedule_async_call(async_request.async_fn, async_request.async_fn_args) + self.async_calls.append(_ActiveAsyncRequest(self.call_idx, async_caller, async_request)) + return self.call_idx + + def maybe_finalize_async_calls(self, blocking=False) -> List[int]: + """Finalizes all available calls. + + This method must be called on all ranks. + + Args: + blocking (bool, optional): if True, will wait until all active requests + are done. Otherwise, finalizes only the async request that already + finished. Defaults to False. + Returns: + List[int]: list of indices (as returned by `schedule_async_request`) + of async calls that have been successfully finalized. + """ + call_idx_finalized = [] + while self.async_calls: + next_async_done = self.async_calls[0].async_caller.is_current_async_call_done(blocking) + if not next_async_done: + break + call_idx, _, async_request = self.async_calls.popleft() + for finalize_fn in async_request.finalize_fns: + finalize_fn() + ten = torch.tensor([call_idx], dtype=torch.int, device=torch.cuda.current_device()) + torch.distributed.all_reduce(ten, op=torch.distributed.ReduceOp.MAX) + assert ( + ten.item() == call_idx + ), 'Unmatched async calls. That probably means not all ranks are participating in async finalization' + call_idx_finalized.append(call_idx) + return call_idx_finalized + + def get_num_unfinalized_calls(self): + """Get the number of active async calls.""" + return len(self.async_calls) + + def close(self): + """Finalize all calls upon closing.""" + self.maybe_finalize_async_calls(blocking=True) diff --git a/megatron/core/dist_checkpointing/strategies/base.py b/megatron/core/dist_checkpointing/strategies/base.py new file mode 100644 index 0000000..daa37fe --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/base.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Strategies base interfaces. """ + +from abc import ABC, abstractmethod +from collections import defaultdict +from enum import Enum +from pathlib import Path + +from ..mapping import CheckpointingException, ShardedStateDict, StateDict +from .async_utils import AsyncCallsQueue, AsyncRequest + + +class StrategyAction(Enum): + LOAD_COMMON = 'load_common' + LOAD_SHARDED = 'load_sharded' + SAVE_COMMON = 'save_common' + SAVE_SHARDED = 'save_sharded' + + +default_strategies = defaultdict(dict) + +async_calls = AsyncCallsQueue() + + +def get_default_strategy(action: StrategyAction, backend: str, version: int): + """Retrieves a default strategy for a given action, backend and version.""" + try: + if backend == 'zarr': + error_hint = ' Please install `zarr` and `tensorstore<=0.1.45` packages' + from .tensorstore import _import_trigger + from .zarr import _import_trigger + elif backend == 'torch_dist': + error_hint = ' Please use PyTorch version >=2.1' + from .torch import _import_trigger + except ImportError as e: + raise CheckpointingException( + f'Cannot import a default strategy for: {(action.value, backend, version)}. Error: {e}. Hint: {error_hint}' + ) from e + try: + return default_strategies[action.value][(backend, version)] + except KeyError as e: + raise CheckpointingException( + f'Cannot find a default strategy for: {(action.value, backend, version)}' + ) from e + + +class LoadStrategyBase(ABC): + """Base class for a load strategy. Requires implementing checks for compatibility with a given checkpoint version.""" + + @abstractmethod + def check_backend_compatibility(self, loaded_version): + raise NotImplementedError + + @abstractmethod + def check_version_compatibility(self, loaded_version): + raise NotImplementedError + + @property + def can_handle_sharded_objects(self): + """Returns whether or not this strategy can handle loading ShardedObjects.""" + return False + + +class SaveStrategyBase(ABC): + """Base class for a save strategy. Requires defining a backend type and version of the saved format.""" + + def __init__(self, backend: str, version: int): + self.backend = backend + self.version = version + + @property + def can_handle_sharded_objects(self): + """Returns whether or not this strategy can handle saving ShardedObjects.""" + return False + + def __str__(self): + return f'{self.__class__.__name__}({self.backend}, {self.version})' + + +class LoadCommonStrategy(LoadStrategyBase): + """Load strategy for common (non-sharded) objects""" + + @abstractmethod + def load_common(self, checkpoint_dir: Path): + raise NotImplementedError + + @abstractmethod + def load_sharded_objects( + self, sharded_objects_state_dict: ShardedStateDict, checkpoint_dir: Path + ): + raise NotImplementedError + + def load_sharded_metadata(self, checkpoint_dir: Path) -> ShardedStateDict: + if not self.can_handle_sharded_objects: + return {} + raise NotImplementedError + + +class LoadShardedStrategy(LoadStrategyBase): + """Load strategy for sharded tensors""" + + @abstractmethod + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + raise NotImplementedError + + @abstractmethod + def load_tensors_metadata(self, checkpoint_dir: Path): + """Load tensors metadata from the checkpoint for ShardedTensors. + + Returns a dictionary similar to a sharded state dict, but note that + the dictionary keys are simply ShardedTensor keys (contrary to the + actual sharded state dicts where keys correspond to state dict keys). + + Dict values are ShardedTensors without any data and sharding (so, the + only useful information is tensors global shape and dtype). + """ + raise NotImplementedError( + f'Loading only tensors metadata not implemented for {self.__class__.__name__}' + ) + + def load_sharded_metadata(self, checkpoint_dir: Path): + """Load sharded metadata from the checkpoint for ShardedTensors and ShardedObjects. + + Returns a dictionary similar to a sharded state dict, but note that + the dictionary keys are simply sharded keys (contrary to the + actual sharded state dicts where keys correspond to state dict keys). + + Dict values are ShardedTensors or ShardedObjects without any data and sharding. + """ + if not self.can_handle_sharded_objects: + return self.load_tensors_metadata(checkpoint_dir) + raise NotImplementedError( + f'Loading only sharded metadata not implemented for {self.__class__.__name__}' + ) + + +class SaveCommonStrategy(SaveStrategyBase): + """Save strategy for common (non-sharded) objects""" + + @abstractmethod + def save_common(self, common_state_dict: StateDict, checkpoint_dir: Path): + raise NotImplementedError + + def save_sharded_objects( + self, sharded_objects_state_dict: ShardedStateDict, checkpoint_dir: Path + ): + raise NotImplementedError + + +class SaveShardedStrategy(SaveStrategyBase): + """Save strategy for sharded tensors""" + + @abstractmethod + def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + raise NotImplementedError + + +class AsyncSaveShardedStrategy(SaveShardedStrategy): + """Save strategy suitable for async save.""" + + @abstractmethod + def async_save( + self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path + ) -> AsyncRequest: + """Perform preparation and return an AsyncRequest to the external caller. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to save + checkpoint_dir (Path): checkpoint target directory + + Returns: + AsyncRequest: represents the async save function and finalization function. + It is the caller responsibility to actually schedule the async save. + """ + raise NotImplementedError + + def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + """Each async strategy can be trivially used as a sync strategy.""" + async_request = self.async_save(sharded_state_dict, checkpoint_dir) + # multiprocessing routines may cause issue when called on parent process + # We keep this verbose call for now + global async_calls + async_calls.schedule_async_request(async_request) + async_calls.maybe_finalize_async_calls(blocking=True) diff --git a/megatron/core/dist_checkpointing/strategies/common.py b/megatron/core/dist_checkpointing/strategies/common.py new file mode 100644 index 0000000..cfa55ab --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/common.py @@ -0,0 +1,147 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" Common strategies. """ + +import logging +import os +from itertools import product +from pathlib import Path + +import torch + +from megatron.core.dist_checkpointing.mapping import ShardedStateDict, StateDict +from megatron.core.dist_checkpointing.strategies.base import ( + SaveCommonStrategy, + StrategyAction, + default_strategies, +) + +from ..dict_utils import dict_list_map_inplace, nested_values +from ..mapping import CheckpointingException, ShardedObject, is_main_replica +from ..strategies.base import LoadCommonStrategy + +_import_trigger = None + +COMMON_STATE_FNAME = 'common.pt' + +logger = logging.getLogger(__name__) + + +class TorchCommonSaveStrategy(SaveCommonStrategy): + def save_common(self, common_state_dict: StateDict, checkpoint_dir: Path): + if torch.distributed.get_rank() == 0: + torch.save(common_state_dict, checkpoint_dir / COMMON_STATE_FNAME) + + def save_sharded_objects( + self, sharded_objects_state_dict: ShardedStateDict, checkpoint_dir: Path + ): + + for sh_obj in nested_values(sharded_objects_state_dict): + if is_main_replica(sh_obj.replica_id): + save_path = checkpoint_dir / f'{sh_obj.unique_key}.pt' + os.makedirs(save_path.parent, exist_ok=True) + torch.save(sh_obj.data, save_path) + + def can_handle_sharded_objects(self): + return True + + +class TorchCommonLoadStrategy(LoadCommonStrategy): + def load_common(self, checkpoint_dir: Path): + """Load common (non-sharded) objects state dict from the checkpoint. + + Args: + checkpoint_dir (Path): checkpoint directory + + Returns: + StateDict: state dict with non-sharded objects from the checkpoint + """ + load_path = Path(checkpoint_dir) / COMMON_STATE_FNAME + try: + return torch.load(load_path, map_location='cpu') + except FileNotFoundError as e: + err_msg = f'Common file {load_path} does not exist' + ckpt_files = [f.name for f in checkpoint_dir.iterdir()] + logger.debug(f'{err_msg}. Checkpoint directory content: {ckpt_files}') + raise CheckpointingException(err_msg) from e + + def load_sharded_objects( + self, sharded_objects_state_dict: ShardedStateDict, checkpoint_dir: Path + ): + """Replaces all ShardedObject from a given state dict with values loaded from the checkpoint. + + Args: + sharded_objects_state_dict (ShardedStateDict): sharded state dict defining what objects should be loaded. + checkpoint_dir (Path): checkpoint directory + + Returns: + None: sharded state dict is modified in place + """ + + def load_sharded_object(sh_obj: ShardedObject): + sh_obj.data = None + load_path = checkpoint_dir / f'{sh_obj.unique_key}.pt' + try: + loaded_obj = torch.load(load_path) + except FileNotFoundError as e: + # Backward compatible logic: previously the save format was incorrect + old_load_path = (checkpoint_dir / sh_obj.unique_key).with_suffix('.pt') + try: + loaded_obj = torch.load(old_load_path) + except FileNotFoundError: + err_msg = f'Object shard {load_path} not found' + obj_subdir = checkpoint_dir / sh_obj.key + if obj_subdir.exists(): + obj_files = [f.name for f in obj_subdir.iterdir()] + logger.debug( + f'{err_msg}. Object {sh_obj.key} directory content: {obj_files}' + ) + else: + ckpt_files = [f.name for f in checkpoint_dir.iterdir()] + logger.debug( + f'{err_msg}. Object {sh_obj.key} directory does not exist. Checkpoint directory content: {ckpt_files}' + ) + raise CheckpointingException(err_msg) from e + return loaded_obj + + return dict_list_map_inplace(load_sharded_object, sharded_objects_state_dict) + + def load_sharded_metadata(self, checkpoint_dir: Path) -> ShardedStateDict: + sharded_metadata = {} + for subdir in checkpoint_dir.iterdir(): + if not subdir.is_dir(): + continue + shard_files = list(subdir.glob('shard_*.pt')) + if not shard_files: + continue + sh_objs = [] + for shard_file in shard_files: + full_key = f'{subdir.name}/{shard_file.stem}' + sh_objs.append(ShardedObject.empty_from_unique_key(full_key)) + + # This is a backward-compatibility fix, where the last global shape is missing in the name + if sh_objs[0].global_shape[-1] < 0: + max_last_offset = max(map(lambda sh_obj: sh_obj.global_offset[-1], sh_objs)) + for sh_obj in sh_objs: + sh_obj.global_shape = (*sh_obj.global_shape[:-1], max_last_offset + 1) + + # Update the sharded state dict + for sh_obj in sh_objs: + sharded_metadata[sh_obj.unique_key] = sh_obj + return sharded_metadata + + @property + def can_handle_sharded_objects(self): + return True + + def check_backend_compatibility(self, loaded_version): + pass + + def check_version_compatibility(self, loaded_version): + pass + + +default_strategies[StrategyAction.LOAD_COMMON.value][('torch', 1)] = TorchCommonLoadStrategy() +default_strategies[StrategyAction.SAVE_COMMON.value][('torch', 1)] = TorchCommonSaveStrategy( + 'torch', 1 +) diff --git a/megatron/core/dist_checkpointing/strategies/filesystem_async.py b/megatron/core/dist_checkpointing/strategies/filesystem_async.py new file mode 100644 index 0000000..bfa6091 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/filesystem_async.py @@ -0,0 +1,390 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +""" Storage writer for PyT Distributed format allowing asynchronous save. """ +import gc +import logging +import os +import queue +from contextlib import contextmanager +from itertools import chain +from pathlib import Path +from time import time +from typing import Callable, Dict, List, Optional, Tuple, Union + +import psutil +import torch +from torch import multiprocessing as mp +from torch.distributed.checkpoint import FileSystemWriter +from torch.distributed.checkpoint.filesystem import DEFAULT_SUFFIX, _StoragePrefix, _write_item +from torch.distributed.checkpoint.planner import SavePlan, SavePlanner, WriteItem, WriteItemType +from torch.distributed.checkpoint.storage import WriteResult +from torch.futures import Future + +logger = logging.getLogger(__name__) + +WriteBucket = Tuple[Path, str, Tuple[list, list]] # represents writes to a single file + +_results_queue = None + + +def _get_write_results_queue(): + global _results_queue + if _results_queue is None: + ctx = mp.get_context('spawn') + _results_queue = ctx.Manager().Queue() + return _results_queue + + +@contextmanager +def _disable_gc(): + """Temporarily disables GC.""" + gc_enabled = gc.isenabled() + try: + if gc_enabled: + gc.disable() + yield + finally: + if gc_enabled: + gc.enable() + + +class FileSystemWriterAsync(FileSystemWriter): + """ + Async-enabled implementation of FileSystemWriter using file IO. + + This class doesn't spawn the async process itself, relies on the external async mechanism. + + Flow: + 1. Call `write_data` + 2. Externally start async process with `get_save_function_and_args` function and args + 3. The async function to call is `writer_proxy_func` which calls + `write_preloaded_data` in multiple processes + + After saving is finalized on all ranks: + 4. Call `super().finish` with the results gathered in `self.writer_result` + + Note that step (3) above can also be called synchronously. + + Currently, it's assumed that a separate writer is created for each ckpt save + (intermediate state is stored as writer attributes). + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if not self.single_file_per_rank: + raise NotImplementedError( + 'single_file_per_rank flag not supported for FileSystemWriterAsync' + ) + + # Intermediate state between preparation and finalization + self.write_buckets: Optional[List[WriteBucket]] = None + self.results_queue: Optional[mp.Queue] = None + + def prepare_write_data(self, plan: SavePlan, planner: SavePlanner) -> None: + """ + First stage of async saving. Copy data to CPU and plan the local saving. + + Args: + plan (SavePlan): save plan generated by the PyT Distributed compatible planner + planner (SavePlanner): save planner used to resolve the bytes and tensor data + + Returns: None, but stores the save plan in `self.write_buckets` + """ + storage_plan: _StoragePrefix = plan.storage_data + start = time() + logger.debug(f"thread_count: {self.thread_count}, time: {start}") + item_buckets = _split_by_size_and_type(self.thread_count, plan.items) + logger.debug(f"bucket_prep, time: {time() - start}") + + start = time() + # move tensors from GPU to CPU before starting async writing + # We do D2H synchronously for now + file_count = 0 + + def gen_file(): + nonlocal file_count + file_name = f"{storage_plan.prefix}{file_count}{DEFAULT_SUFFIX}" + file_count += 1 + return file_name + + # Prepare bytes / tensor data in each bucket, which will be assigned to each writer process + self.write_buckets = [] + for bucket in item_buckets: + bytes_data = [ + (item, planner.resolve_data(item)) + for item in bucket + if item.type == WriteItemType.BYTE_IO + ] + tensor_data = [ + (item, planner.resolve_data(item).detach().to("cpu", non_blocking=True)) + for item in bucket + if item.type != WriteItemType.BYTE_IO + ] + if len(bytes_data) > 0 or len(tensor_data) > 0: + file_name = gen_file() + self.write_buckets.append( + (self.path / file_name, file_name, (bytes_data, tensor_data)) + ) + + # Check if there is anything to write on this rank + if len(self.write_buckets) > 0: + assert len(self.write_buckets) <= self.thread_count, ( + len(self.write_buckets), + self.thread_count, + ) + self.results_queue = _get_write_results_queue() + else: + self.results_queue = None + end = time() + logger.debug(f"D2H and push, time: {end - start}") + + def get_save_function_and_args(self) -> Tuple[Optional[Callable], Tuple]: + """ + Get function that saves the data to storage along with its arguments. + Allows the external caller to apply the save function synchronously or asynchronously. + + Returns: None (if there is nothing to write on this rank) or a tuple of: + - the function that saves the data + - arguments to that function + """ + if not self.write_buckets: + return None, () + return (self.write_preloaded_data_multiproc, (self.write_buckets, self.results_queue)) + + @staticmethod + @_disable_gc() + def write_preloaded_data_multiproc( + write_buckets: List[WriteBucket], global_results_queue: mp.Queue + ) -> None: + """ + Performs saving data to storage with multiple processes. + + Starts predefined number of processes and uses 2 queues to make sure the results + are complete: + - local_results_queue - to send the actual results + - count_queue - small queue to mark worker as completed + + Using just one queue disallowed proper exception handling. + + This method is meant to be run in a forked subprocess. + Triggering GC during execution leads to CUDA errors + (cleaning up tensors owned by the parent process). + To prevent this, we disable the GC explicitly for this function with _disable_gc. + + Args: + write_buckets (List[WriteBucket]): write plan + global_results_queue (mp.Queue): mp.Queue to collect Dict[List[WriteResults]] (or an Exception) + from parallel write processes to the main training process + Returns: None + """ + w_start = time() + write_results_or_exc: Union[dict, Exception] = dict() + ctx = mp.get_context('fork') + local_results_queue = ctx.Queue() + count_queue = ctx.JoinableQueue() + p_list = [] + for i, write_bucket in enumerate(write_buckets): + try: + count_queue.put(i) + p_list.append( + ctx.Process( + target=FileSystemWriterAsync.write_preloaded_data, + args=(i, write_bucket, local_results_queue, count_queue, True), + ) + ) + except Exception as e: + err_msg = f'An error is caught while a proc {i} is created, error: {e}' + logger.error(err_msg) + write_results_or_exc = RuntimeError(err_msg) + + if not isinstance(write_results_or_exc, Exception): + for p in p_list: + p.start() + + logger.debug('FileSystemWriterAsync: collecting worker results...') + + # To make sure all nodes are completed + count_queue.join() + # At this point, all workers completed, so the queue should have exactly `len(write_buckets)` items + for proc_idx in range(len(write_buckets)): + try: + local_proc_idx, local_results_or_exc = local_results_queue.get() + except queue.Empty: + write_results_or_exc = RuntimeError( + f'Unexpected empty `local_results_queue` (got only {proc_idx}/{len(write_buckets)} items)' + ) + break + else: + if isinstance(local_results_or_exc, Exception): + err_msg = f"Local process {local_proc_idx} encountered an error: {local_results_or_exc}" + logger.error(err_msg) + write_results_or_exc = local_results_or_exc + break + else: + assert isinstance(local_results_or_exc, list), type(local_results_or_exc) + write_results_or_exc[local_proc_idx] = local_results_or_exc + p_list[local_proc_idx].join() + + logger.debug('FileSystemWriterAsync: collected worker results successfully') + + global_results_queue.put(write_results_or_exc) + + w_end = time() + logger.debug( + f"{w_end}, rank: {torch.distributed.get_rank()}, write(sync,parallel): {w_end - w_start}" + ) + + @staticmethod + @_disable_gc() + def write_preloaded_data( + local_proc_idx: int, + write_bucket: WriteBucket, + results_queue: mp.SimpleQueue, + count_queue: mp.JoinableQueue, + use_fsync: bool, + ) -> None: + """ + Performs actual data saving to storage. + + Args: + local_proc_idx (int): index of a local process that performs writing + write_bucket (WriteBucket): data to write to storage + results_queue (mp.Queue): queue to return the write results to the proxy checkpoint process. + count_queue (mp.JoinableQueue): queue to marks worker task as completed + use_fsync (bool): if True, calls os.fsync at the end of saving + + Returns: None, the write result are put into the `queue` + """ + mem_before = _process_memory() + + local_results = [] + try: + file_name, storage_key, (bytes_data, tensor_data) = write_bucket + with open(file_name, "wb") as stream: + for write_item, data in bytes_data: + local_results.append(_write_item(stream, data, write_item, storage_key)) + + for write_item, tensor in tensor_data: + assert tensor.is_cpu + local_results.append(_write_item(stream, tensor, write_item, storage_key)) + + if use_fsync: + os.fsync(stream.fileno()) + local_output = (local_proc_idx, local_results) + except Exception as e: + local_output = (local_proc_idx, e) + + results_queue.put(local_output) + # Signal this process is done. + count_queue.get() + count_queue.task_done() + + mem_after = _process_memory() + logger.debug( + f"{local_proc_idx} consumed: {mem_after - mem_before}, before: {mem_before}, after: {mem_after}" + ) + + def write_data( + self, + plan: SavePlan, + planner: SavePlanner, + ) -> Future[List[WriteResult]]: + raise NotImplementedError('write_data not implemented for FileSystemWriterAsync') + + def retrieve_write_results(self) -> List[WriteResult]: + """ + Turn the latest dict including write results from `self.results_queue` into a single results lists. Includes error check. + + Returns (List[WriteResult]): the list of write results from all local processes performing the save. + + """ + assert self.write_buckets is not None + + if self.results_queue is None: + write_results_or_exc = {} + else: + try: + write_results_or_exc = self.results_queue.get_nowait() + except queue.Empty: + raise RuntimeError(f'results_queue should not be empty') + + if isinstance(write_results_or_exc, Exception): + raise RuntimeError(f'Worker failure: {write_results_or_exc}') from write_results_or_exc + write_results: dict = write_results_or_exc + if len(write_results) != len(self.write_buckets): + raise RuntimeError( + f'Incomplete worker results (expected {len(self.write_buckets)}, got {len(write_results)}.' + f' This probably indicates a worker failure.' + ) + return list(chain.from_iterable(write_results.values())) + + +def _split_by_size_and_type(bins: int, items: List[WriteItem]) -> List[List[WriteItem]]: + """ + Splits write items according to item size into close to uniform bins. + + Same as torch.distributed.checkpoint.filesystem._split_by_size_and_type, + but with a fixed _item_size function. + + Args: + bins (int): numbers of bins to split to + items (List[WriteItem]): list of write items + + Returns (List[List[WriteItem]]): write items split to bins + """ + if bins == 1: + return [items] + + bytes_items = [wi for wi in items if wi.type == WriteItemType.BYTE_IO] + tensor_items = [wi for wi in items if wi.type != WriteItemType.BYTE_IO] + + buckets: List[List[WriteItem]] = [[] for _ in range(bins)] + bucket_sizes = [0 for _ in range(bins)] + + tensor_items.sort(key=_item_size, reverse=True) + + # Assign bytes with a simple round-robin + for i, item in enumerate(bytes_items): + buckets[i % bins].append(item) + + # Then, assign tensors according to their sizes + for item in tensor_items: + # TODO replace with headq + idx = min(enumerate(bucket_sizes), key=lambda x: x[1])[0] + buckets[idx].append(item) + bucket_sizes[idx] += _item_size(item) + + return buckets + + +def _item_size(item: WriteItem) -> int: + """ + Calculates size (in bytes) of a single write item. + + Same as torch.distributed.checkpoint.filesystem._item_size, + but fixes computing chunk size (with item.tensor_data.chunk.sizes) + + Args: + item (WriteItem): write item to compute the size of + + Returns (int): size of an item in bytes + """ + size = 1 + assert item.tensor_data is not None + # can't use math.prod as PT needs to support older python + for s in item.tensor_data.chunk.sizes: + size *= s + + dtype = item.tensor_data.properties.dtype + return size * torch._utils._element_size(dtype) + + +def _process_memory() -> int: + """ + Get memory used by current process. + + Returns (int): memory used by current process + """ + process = psutil.Process(os.getpid()) + mem_info = process.memory_info() + return mem_info.rss diff --git a/megatron/core/dist_checkpointing/strategies/fully_parallel.py b/megatron/core/dist_checkpointing/strategies/fully_parallel.py new file mode 100644 index 0000000..871dae9 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/fully_parallel.py @@ -0,0 +1,876 @@ +import logging +from collections import defaultdict +from functools import reduce +from itertools import zip_longest +from pathlib import Path +from time import time +from typing import Dict, List, NamedTuple, Optional, Set, Tuple, TypeVar, cast + +import numpy as np +import torch +import torch.distributed as dist + +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.core import CheckpointingException +from megatron.core.dist_checkpointing.dict_utils import ( + dict_list_map_inplace, + extract_matching_values, + merge, + nested_values, +) +from megatron.core.dist_checkpointing.mapping import ShardedStateDict, StateDict, is_main_replica +from megatron.core.dist_checkpointing.strategies.base import ( + AsyncSaveShardedStrategy, + LoadShardedStrategy, + SaveShardedStrategy, +) +from megatron.core.dist_checkpointing.validation import ( + determine_global_metadata, + validate_sharding_integrity, +) + +logger = logging.getLogger(__name__) + + +# _ShardId uniquely identifies a ShardedTensor. This is a subset of ShardedTensor +# attributes: key (str), global_offset (tuple) and flattened_range (optional tuple) +_ShardId = Tuple[str, tuple, Optional[tuple]] + + +class SaveLoadDistribution(NamedTuple): + """Represents a save or load distribution of ShardedTensors. + + Given distribution is valid only for a specific parallelization group, + which is implicit here (not referenced by this class). + + Args: + main_rank_for_shard (Dict[_ShardId, int]): specifies which rank should hold + the main replica for a given shard + shards_in_this_group (Set[_ShardId]): which shards have a main replica + in this parallelization group + shard_to_metadata (Dict[_ShardId, ShardedTensor]): maps ShardedTensor + identifier to the original ShardedTensor + + """ + + main_rank_for_shard: Dict[_ShardId, int] + shards_in_this_group: Set[_ShardId] + shard_to_metadata: Dict[_ShardId, ShardedTensor] + + +class FullyParallelSaveStrategyWrapper(AsyncSaveShardedStrategy): + """Wraps arbitrary strategy and distributes the save during `save`. + + The save distribution happens without any *data* communication. + Only the *metadata* is exchanged and based on data replication on different + ranks, we try to distribute the save as uniformly as possible. + + This wrapper assumes, that setting `replica_id` to 0 will make the + underlying strategy do the saving on current rank. All the other `replica_id`s + are set to 1. + + Currently, the save distribution is realized with a greedy algorithm + described in `distribute_shards_to_ranks`. + + Args: + strategy (SaveShardedStrategy): base strategy to wrap + parallelization_group (ProcessGroup, optional): process group to use for save + distribution. Note that this doesn't have to match exactly the + data distribution, but should cover the replication pattern + to maximize performance. Defaults to the whole world. + do_cache_distribution (bool, optional): whether to cache the save distribution + from previous calls. Should be set to True only if the state dict + structure between the calls is always the same. Defaults to True. + """ + + def __init__( + self, + strategy: SaveShardedStrategy, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, + do_cache_distribution: bool = False, + ): + super().__init__(strategy.backend, strategy.version) + self.base_strategy = strategy + self.parallelization_group = parallelization_group + self.do_cache_distribution = do_cache_distribution + + self.cached_distribution: Optional[SaveLoadDistribution] = None + + def async_save( + self, + sharded_state_dict: ShardedStateDict, + checkpoint_dir: Path, + ): + if not isinstance(self.base_strategy, AsyncSaveShardedStrategy): + raise CheckpointingException( + f'Cannot apply async_save to non-async base strategy {self.base_strategy}' + ) + self.apply_saving_parallelization(sharded_state_dict) + return self.base_strategy.async_save(sharded_state_dict, checkpoint_dir) + + def save( + self, + sharded_state_dict: ShardedStateDict, + checkpoint_dir: Path, + ): + self.apply_saving_parallelization(sharded_state_dict) + return self.base_strategy.save(sharded_state_dict, checkpoint_dir) + + def apply_saving_parallelization(self, sharded_state_dict: ShardedStateDict) -> None: + """Distributes the save across ranks by exchanging metadata. + + Exchanges metadata from the state dict and computes the uniform + (as close as possible) distribution of saves among the ranks. + + If `self.do_cache_distribution` is True, caches the distribution between + the calls and subsequent distributions happen without any inter-rank + communication. + + Args: + sharded_state_dict (ShardedStateDict): state dict to distribute the saving + + Returns: None + """ + start = time() + if self.do_cache_distribution and self.cached_distribution is not None: + logger.debug(f'Apply *cached* save parallelization') + precomputed_distribution = self.cached_distribution + else: + logger.debug(f'Apply save parallelization') + precomputed_distribution = determine_main_replica_uniform_distribution( + sharded_state_dict, self.parallelization_group + ) + + distribute_main_replicas_with_precomputed_distribution( + sharded_state_dict, self.parallelization_group, precomputed_distribution + ) + if self.cached_distribution is None: + # First time applying the parallelization + validate_sharding_integrity(determine_global_metadata(sharded_state_dict)[1]) + if self.do_cache_distribution: + self.cached_distribution = precomputed_distribution + end = time() + logger.debug(f"parallel save sharding, time: {end - start}") + + @property + def can_handle_sharded_objects(self): + return self.base_strategy.can_handle_sharded_objects + + +class FullyParallelLoadStrategyWrapper(LoadShardedStrategy): + """Wraps arbitrary load strategy and distributes the load during `load`. + + See `load` method docs for details. + + Args: + strategy (LoadShardedStrategy): base strategy to wrap + parallelization_group (ProcessGroup, optional): process group to use for load + distribution. Note that this doesn't have to match exactly the + data distribution, but should cover the replication pattern + to maximize performance. Defaults to the whole world. + In most cases, it's recommended to set it to the DP group. + do_cache_distribution (bool, optional): whether to cache the load distribution + from previous calls. Should be set to True only if the state dict + structure between the calls is always the same. Defaults to False, + since the loading in general happens only once during training. + Note that the load distribution *cannot* be reused as a save distribution, + because save/load is not fully symmetrical. + exchange_algo (str): algorithm to use for exchanging the data. + Options: + - broadcast - each rank broadcasts individual tensors to others + - gather_object (default) - ranks all_gather_object the whole loaded state dicts + - gather_rounds (default) - ranks all gather individual tensors in rounds + See method docs for more details. + """ + + def __init__( + self, + strategy: LoadShardedStrategy, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, + do_cache_distribution: bool = False, + exchange_algo: str = 'broadcast', + ): + super().__init__() + self.base_strategy = strategy + if parallelization_group is None: + parallelization_group = ( + dist.GroupMember.WORLD + ) # explicit group needed for torch.distributed.get_global_rank call + self.parallelization_group = parallelization_group + self.do_cache_distribution = do_cache_distribution + self.exchange_algo = exchange_algo + + self.cached_distribution: Optional[SaveLoadDistribution] = None + + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> StateDict: + """Distributes the load and calls underlying strategy only for parts of the state dict. + + Steps: + 1. Load metadata is exchanged between the ranks in the parallelization group. + 2. Each rank deterministically plans the load for the whole workload + so that the loads are as uniform as possible. + 3. Each ranks loads its planned shard of the checkpoint. + 4. All ranks exchange the loaded shards. + + Internode communication is involved in steps (1) (with metadata) + and (4) (with actual data). Storage interaction is involved in step (3). + + Currently, the load distribution (step 2) is realized with a greedy algorithm + described in `distribute_shards_to_ranks` (same as for saving distribution). + + Currently, the shards are all gathered between all ranks in the parallelization + group. This might not be optimal (some ranks do not need all tensors), + but it's a reasonable approximation for an optimal exchange in most scenarios. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to load + checkpoint_dir (Path): checkpoint directory to load from + + Returns: + StateDict: loaded state dict. The state dict should be equivalent to + a state dict that would be loaded with the underlying strategy + without this wrapper. + """ + if torch.distributed.get_world_size(self.parallelization_group) <= 1: + return self.base_strategy.load(sharded_state_dict, checkpoint_dir) + + # Step 1 and 2: exchange load metadata and distribute the load + start = time() + precomputed_distribution = self.apply_loading_parallelization(sharded_state_dict) + assert ( + precomputed_distribution is not None + ), 'Expecting non-trivial distribution for non-trivial parallelization group' + end = time() + logger.debug(f'self.apply_loading_parallelization took {end - start}s') + start = end + + # Step 3: load part of the checkpoint. + # Load only sharded objects first. ShardedTensors will be loaded separately + # so that we can keep track of sharded tensors loaded by this rank + ( + sharded_tensors, + sharded_state_dict, + to_load_shards, + unloaded_shards, + ) = self._defer_loading_sharded_tensors(sharded_state_dict) + loaded_state_dict = self.base_strategy.load(sharded_state_dict, checkpoint_dir) + + end = time() + logger.debug(f'Base load of ShardedObjects took {end - start}s') + start = end + + # Load sharded tensors separately + loaded_tensors = self.base_strategy.load(to_load_shards, checkpoint_dir) + + end = time() + logger.debug(f'Base load of ShardedTensors took {end - start}s') + start = end + + # Step 4: exchange data between ranks + logger.debug(f'Applying parallel load with algo {self.exchange_algo}') + if self.exchange_algo == 'gather_object': + exchange_fn = self.exchange_loaded_tensors_gather_object + elif self.exchange_algo == 'gather_rounds': + exchange_fn = self.exchange_loaded_tensors_gather_rounds + elif self.exchange_algo == 'broadcast': + exchange_fn = self.exchange_loaded_tensors_broadcast + else: + raise NotImplementedError(f'Unrecognized gather algorithm: {self.exchange_algo}') + + all_loaded_tensors = exchange_fn( + loaded_tensors, + unloaded_shards, + precomputed_distribution, + self.parallelization_group, + ) + if not set(unloaded_shards.keys()).issubset(all_loaded_tensors.keys()): + missing_shards = set(unloaded_shards.keys()) - all_loaded_tensors.keys() + raise CheckpointingException( + f'Missing shards after fully parallel loading: {missing_shards}' + ) + + sync_start = time() + torch.cuda.synchronize() + end = time() + logger.debug(f'torch.cuda.synchronize took {end - sync_start}s') + logger.debug(f'self.exchange_loaded_tensors took {end - start}s') + + self.fill_in_deferred_sharded_tensors(sharded_tensors, all_loaded_tensors) + merge(loaded_state_dict, sharded_tensors) + return loaded_state_dict + + def _defer_loading_sharded_tensors(self, sharded_state_dict: ShardedStateDict) -> Tuple[ + ShardedStateDict, + ShardedStateDict, + Dict[_ShardId, ShardedTensor], + Dict[_ShardId, ShardedTensor], + ]: + """Divides state dict into parts loaded by this vs other ranks. + + ShardedTensors with main replica_id will be loaded by this rank, + others will be received by other ranks (after loading from storage). + + Args: + sharded_state_dict (ShardedStateDict): state dict with ShardedTensor + that will be divided. + + Returns: a tuple of: + - ShardedStateDict: sub-state dict only with ShardedTensors + - ShardedStateDict: sub-state dict with non-ShardedTensors + - Dict[_ShardId, ShardedTensor]: ShardedTensor are uniquely identified + by shard ids. This is a mapping from shard id to a corresponding + ShardedTensor for tensors loaded by *this* rank + - Dict[_ShardId, ShardedTensor]: mapping from shard id to a corresponding + ShardedTensor for tensors loaded by *other* ranks + """ + to_load_shards = {} + unloaded_shards = {} + + sharded_tensors, sharded_state_dict = extract_matching_values( + sharded_state_dict, lambda v: isinstance(v, ShardedTensor) + ) + + def wrap_non_main_replicas(x): + if isinstance(x, ShardedTensor): + # Assign shard to be loaded or not + if is_main_replica(x.replica_id): + to_load_shards[_sharded_tensor_shard_id(x)] = x + else: + unloaded_shards[_sharded_tensor_shard_id(x)] = x + return x + + dict_list_map_inplace(wrap_non_main_replicas, sharded_tensors) + return sharded_tensors, sharded_state_dict, to_load_shards, unloaded_shards + + def apply_loading_parallelization( + self, sharded_state_dict: ShardedStateDict + ) -> Optional[SaveLoadDistribution]: + """Distributes the load across ranks by exchanging metadata. + + Exchanges metadata from the state dict and computes the uniform + (as close as possible) distribution of loads among the ranks. + Marks ShardedTensors to be loaded by the current rank with replica_id 0 + (and others with non 0 values). + + If `self.do_cache_distribution` is True, caches the distribution between + the calls and subsequent distributions happen without any inter-rank + communication. + + Args: + sharded_state_dict (ShardedStateDict): state dict to distribute the loading + + Returns: + SaveLoadDistribution (optional): the computed loading distribution + """ + if self.do_cache_distribution and self.cached_distribution is not None: + logger.debug(f'Apply *cached* load parallelization') + precomputed_distribution = self.cached_distribution + else: + logger.debug(f'Apply load parallelization') + precomputed_distribution = determine_main_replica_uniform_distribution( + sharded_state_dict, self.parallelization_group, True + ) + + distribute_main_replicas_with_precomputed_distribution( + sharded_state_dict, self.parallelization_group, precomputed_distribution + ) + if self.do_cache_distribution: + self.cached_distribution = precomputed_distribution + + return precomputed_distribution + + def exchange_loaded_tensors_gather_object( + self, + loaded_tensors: Dict[_ShardId, torch.Tensor], + unloaded_shards: Dict[_ShardId, ShardedTensor], + precomputed_distribution: SaveLoadDistribution, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, + ) -> Dict[_ShardId, torch.Tensor]: + """Exchange the tensors loaded by different ranks with a simple all_gather_object call. + + This version can be used for debugging purposes do to its simplistic + implementation. Shouldn't be used if performance is important. + + Args: + loaded_tensors (Dict[_ShardId, torch.Tensor]): mapping from ShardedTensor + shard ids to tensors already loaded by this rank. + unloaded_shards (Dict[_ShardId, torch.Tensor]): mapping from ShardedTensor + shard ids to ShardedTensors that aren't loaded yet. + precomputed_distribution (SaveLoadDistribution): uniform load distribution + parallelization_group (ProcessGroup, optional): process group used for load + distribution. Tensors will be exchanged within this group + + Returns: + Dict[_ShardId, torch.Tensor]: dictionary mapping shard ids to tensors + needed by this rank to load a given state dict. Includes + previously loaded tensors (from `loaded_tensors` input) + + """ + all_loaded_tensors_list = [None] * torch.distributed.get_world_size( + group=parallelization_group + ) + torch.distributed.all_gather_object( + all_loaded_tensors_list, loaded_tensors, group=parallelization_group + ) + all_loaded_tensors_list = cast(List[Dict[_ShardId, torch.Tensor]], all_loaded_tensors_list) + all_loaded_tensors = reduce(lambda x, y: {**x, **y}, all_loaded_tensors_list) + + # Error checks + if len(all_loaded_tensors) != sum(map(len, all_loaded_tensors_list)): + err_msg = 'Duplicate shard ids loaded by different ranks' + if torch.distributed.get_rank() == 0: + logger.error( + f'{err_msg}. Shards ids by rank: {[lt.keys() for lt in all_loaded_tensors_list]}' + ) + raise CheckpointingException(err_msg) + + return all_loaded_tensors + + @torch.no_grad() + def exchange_loaded_tensors_gather_rounds( + self, + loaded_tensors: Dict[_ShardId, torch.Tensor], + unloaded_shards: Dict[_ShardId, ShardedTensor], + precomputed_distribution: SaveLoadDistribution = None, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, + ) -> Dict[_ShardId, torch.Tensor]: + """Exchange the tensors loaded by different ranks with several all_gather calls. + + Groups tensors by dtype, divide tensors that will be exchanged into rounds + and execute all_gather for tensors from each round. + + Note: the loading is distributed across ranks based on total loaded size + in bytes, so there is no guarantee that number of rounds needed for each + rank will be similar, which might result in a lot of almost empty + all_gathers. The solution would be to group all tensors into a one + bytes tensor and do a single all_gather (with similarly sized messages). + + Args: + loaded_tensors (Dict[_ShardId, torch.Tensor]): mapping from ShardedTensor + shard ids to tensors already loaded by this rank. + unloaded_shards (Dict[_ShardId, torch.Tensor]): mapping from ShardedTensor + shard ids to ShardedTensors that aren't loaded yet. + precomputed_distribution (SaveLoadDistribution): uniform load distribution + parallelization_group (ProcessGroup, optional): process group used for load + distribution. Tensors will be exchanged within this group + + Returns: + Dict[_ShardId, torch.Tensor]: dictionary mapping shard ids to tensors + needed by this rank to load a given state dict. Includes + previously loaded tensors (from `loaded_tensors` input) + """ + shard_to_saving_rank, _, shard_to_metadata = precomputed_distribution + local_rank = torch.distributed.get_rank(group=self.parallelization_group) + + all_loaded_tensors = dict(loaded_tensors) + + # Group by dtype so that we all_gather tensors of the same dtype + for dtype in sorted( + set(map(lambda sh_ten: sh_ten.dtype, shard_to_metadata.values())), key=str + ): + + start = time() + # shards_by_rank maps rank to tensors loaded by this rank + shards_by_rank: List[List[torch.Tensor]] = [ + [] for _ in range(torch.distributed.get_world_size(group=parallelization_group)) + ] + for shard_id, rank in shard_to_saving_rank.items(): + if shard_to_metadata[shard_id].dtype == dtype: + shards_by_rank[rank].append(shard_id) + + # Transpose `shards_by_rank` to form exchange rounds + shards_by_round = zip_longest(*shards_by_rank, fillvalue=None) + for round_idx, round_shard_ids in enumerate(shards_by_round): + round_tensors = [] + orig_devices = {} + for rank, shard_id in enumerate(round_shard_ids): + if shard_id is None: + # if no more useful data, the given rank will exchange empty tensor + local_ten = torch.empty(0, dtype=dtype, device='cuda') + orig_device = None + else: + assert isinstance(shard_id, tuple), type(shard_id) + if rank == local_rank: + assert shard_id in all_loaded_tensors, ( + shard_id, + all_loaded_tensors.keys(), + ) + orig_device = all_loaded_tensors[shard_id] + all_loaded_tensors[shard_id] = all_loaded_tensors[shard_id].cuda() + local_ten = all_loaded_tensors[shard_id] + else: + local_ten, orig_device = self._get_empty_tensor_for_exchange( + shard_id, unloaded_shards, shard_to_metadata, all_loaded_tensors + ) + round_tensors.append(local_ten) + if orig_device is not None: + orig_devices[shard_id] = orig_device + + torch.distributed.all_gather( + list(round_tensors), + round_tensors[local_rank], + group=self.parallelization_group, + async_op=False, + ) + + # Move tensors back to CPU if originally was on CPU + for shard_id, orig_device in orig_devices.items(): + all_loaded_tensors[shard_id] = all_loaded_tensors[shard_id].to(orig_device) + + del round_tensors # remove tensor references + + end = time() + if torch.distributed.get_rank() == 0: + logger.debug(f'{dtype} exchange rounds all_gather schedule took {end - start}s') + + return all_loaded_tensors + + @torch.no_grad() + def exchange_loaded_tensors_broadcast( + self, + loaded_tensors: Dict[_ShardId, torch.Tensor], + unloaded_shards: Dict[_ShardId, ShardedTensor], + precomputed_distribution: SaveLoadDistribution = None, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, + ) -> Dict[_ShardId, torch.Tensor]: + """Exchange the tensors loaded by different ranks by a series of broadcasts. + + For each rank for each loaded tensor do a broadcast to the whole group. + A reasonable tradeoff in terms of performance and simplicity. + + Args: + loaded_tensors (Dict[_ShardId, torch.Tensor]): mapping from ShardedTensor + shard ids to tensors already loaded by this rank. + unloaded_shards (Dict[_ShardId, torch.Tensor]): mapping from ShardedTensor + shard ids to ShardedTensors that aren't loaded yet. + precomputed_distribution (SaveLoadDistribution): uniform load distribution + parallelization_group (ProcessGroup, optional): process group used for load + distribution. Tensors will be exchanged within this group + + Returns: + Dict[_ShardId, torch.Tensor]: dictionary mapping shard ids to tensors + needed by this rank to load a given state dict. Includes + previously loaded tensors (from `loaded_tensors` input) + """ + shard_to_saving_rank, _, shard_to_metadata = precomputed_distribution + local_rank = torch.distributed.get_rank(group=self.parallelization_group) + + all_loaded_tensors = dict(loaded_tensors) + + start = time() + + for idx, (shard_id, rank) in enumerate(shard_to_saving_rank.items()): + if rank == local_rank: + assert shard_id in all_loaded_tensors, (shard_id, all_loaded_tensors.keys()) + orig_device = all_loaded_tensors[shard_id].device + local_ten = all_loaded_tensors[shard_id].cuda() + else: + local_ten, orig_device = self._get_empty_tensor_for_exchange( + shard_id, unloaded_shards, shard_to_metadata, all_loaded_tensors + ) + + global_src_rank = torch.distributed.get_global_rank(parallelization_group, rank) + # We can do async_op=True only if there is no CPU-copy follow-up + torch.distributed.broadcast( + local_ten, + src=global_src_rank, + group=parallelization_group, + async_op=orig_device is None, + ) + # Move tensor back to CPU if originally was on CPU + if orig_device is not None: + all_loaded_tensors[shard_id] = local_ten.to(orig_device) + del local_ten + + end = time() + if torch.distributed.get_rank() == 0: + logger.debug(f'exchange broadcast schedule took {end - start}s') + + return all_loaded_tensors + + def _get_empty_tensor_for_exchange( + self, + shard_id: _ShardId, + needed_shards: Dict[_ShardId, ShardedTensor], + unneeded_shards: Dict[_ShardId, ShardedTensor], + loaded_tensors: Dict[_ShardId, torch.Tensor], + ) -> Tuple[torch.Tensor, Optional[torch.device]]: + """Determines the empty tensor to use for exchange. + + If shard_id is needed by this rank, it will be in the `unloaded_shards`. + Otherwise, the metadata for this tensor can be found in `shard_to_metadata` + + Args: + shard_id (_ShardId): shard_id that will be exchanged + needed_shards (Dict[_ShardId, ShardedTensor]): mapping from shard ids + to metadata for shards needed by this rank + unneeded_shards (Dict[_ShardId, ShardedTensor]): mapping from shard ids + to metadata for shards that can be discarded after exchange + loaded_tensors (Dict[_ShardId, torch.Tensor]): mapping where useful tensors + are placed in + + Returns: + Tuple[torch.Tensor, Optional[torch.device]]: empty CUDA tensor to be exchanged, + and the device of the original state dict tensor (if there was any) + """ + local_unloaded_sh_ten = needed_shards.get(shard_id) + if local_unloaded_sh_ten is None: + orig_device = None # this tensor will be discarded anyway + sh_ten = unneeded_shards[shard_id] + if sh_ten.data is None: + sh_ten.init_data('cuda') + tensor = sh_ten.data + sh_ten.data = None # won't be used. free memory + else: + tensor = sh_ten.data + if tensor.device.type == 'cpu': + tensor = torch.empty_like(tensor, device='cuda') + else: + local_unloaded_sh_ten.init_data('cuda') + orig_device = local_unloaded_sh_ten.data.device + tensor = local_unloaded_sh_ten.data + if tensor.device.type == 'cpu': + tensor = torch.empty_like(tensor, device='cuda') + loaded_tensors[shard_id] = tensor + return tensor, orig_device + + def fill_in_deferred_sharded_tensors( + self, sharded_state_dict: ShardedStateDict, loaded_tensors: Dict[_ShardId, torch.Tensor] + ) -> None: + """Fill in tensors not loaded by current rank with tensors from `loaded_tensors` map. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to fill in. + ShardedTensors are completely replaced with corresponding torch.Tensors. + loaded_tensors (Dict[_ShardId, torch.Tensor]): dict allowing to map + ShardedTensor from the sharded_state_dict to loaded tensors. + + Returns: + + """ + + def fill_in_sharded_tensor(x): + if isinstance(x, ShardedTensor): + try: + x = loaded_tensors[_sharded_tensor_shard_id(x)] + except KeyError as e: + raise CheckpointingException( + f'Missing loaded tensor shard: {_sharded_tensor_shard_id(x)}' + ) from e + + return x + + dict_list_map_inplace(fill_in_sharded_tensor, sharded_state_dict) + + @property + def can_handle_sharded_objects(self): + return self.base_strategy.can_handle_sharded_objects + + def load_tensors_metadata(self, checkpoint_dir: Path): + return self.base_strategy.load_tensors_metadata(checkpoint_dir) + + def load_sharded_metadata(self, checkpoint_dir: Path): + return self.base_strategy.load_sharded_metadata(checkpoint_dir) + + def check_backend_compatibility(self, loaded_version): + return self.base_strategy.check_backend_compatibility(loaded_version) + + def check_version_compatibility(self, loaded_version): + return self.base_strategy.check_version_compatibility(loaded_version) + + +def _sharded_tensor_shard_id(sharded_tensor: ShardedTensor) -> _ShardId: + """Unique id of the sharded tensor data. + + Should yield the same value for same data replicated on different ranks. + + Args: + sharded_tensor (ShardedTensor): sharded tensor representing the data shard + + Returns (tuple): unique id of a data shard + """ + f_range = sharded_tensor.flattened_range + return ( + sharded_tensor.key, + sharded_tensor.global_offset, + None if f_range is None else (f_range.start, f_range.stop), + ) + + +def _shard_size(sh_ten: ShardedTensor): + """Returns size in bytes of a given sharded tensor.""" + if sh_ten.flattened_range is None: + numel = np.product(sh_ten.local_shape) + else: + numel = sh_ten.flattened_range.stop - sh_ten.flattened_range.start + return numel * torch._utils._element_size(sh_ten.dtype) + + +def determine_main_replica_uniform_distribution( + sharded_state_dict: ShardedStateDict, + parallelization_group: torch.distributed.ProcessGroup, + is_loading: bool = False, +) -> Optional[SaveLoadDistribution]: + """Computes the save distribution. + + Should be used in conjunction with `distribute_main_replicas_with_precomputed_distribution` + which applies the computed save distribution. + + We rely on the fact that the assignment algorithm is deterministic on all ranks, + so there is no extra communication needed after metadata exchange. + + Args: + sharded_state_dict (ShardedStateDict): state dict to compute the distribution of + parallelization_group (ProcessGroup): distribution will be computed + within this process group + is_loading (bool, optional): whether the distribution is for loading or saving. + For loading, even non-main replicas must be loaded by this parallelization + group. Defaults to False. + + Returns (SaveLoadDistribution, optional): distribution that can be used to apply the + parallelization. Returns None if the process_group is trivial (1 rank) + + """ + group_size = torch.distributed.get_world_size(group=parallelization_group) + if group_size <= 1: + return + local_shards = list( + sh_base + for sh_base in nested_values(sharded_state_dict) + if isinstance(sh_base, ShardedTensor) + ) + local_shards_no_data = [ten.without_data() for ten in local_shards] + + all_shards = [None] * torch.distributed.get_world_size(group=parallelization_group) + torch.distributed.all_gather_object( + all_shards, local_shards_no_data, group=parallelization_group + ) + + shard_to_ranks = defaultdict(list) + shard_to_size = {} + shard_to_metadata = {} + shards_saved_by_this_parallelization_group: Set[_ShardId] = set() + for rank, rank_shards in enumerate(all_shards): + for sh_ten in rank_shards: + shard_id = _sharded_tensor_shard_id(sh_ten) + shard_to_ranks[shard_id].append(rank) + if shard_id not in shard_to_size: + shard_to_size[shard_id] = _shard_size(sh_ten) + shard_to_metadata[shard_id] = sh_ten + if is_main_replica(sh_ten.replica_id) or is_loading: + shards_saved_by_this_parallelization_group.add(shard_id) + + shard_to_ranks = { + k: v for k, v in shard_to_ranks.items() if k in shards_saved_by_this_parallelization_group + } + + shard_to_saving_rank = distribute_shards_to_ranks( + shard_to_ranks, shard_to_size, len(all_shards) + ) + + return SaveLoadDistribution( + shard_to_saving_rank, shards_saved_by_this_parallelization_group, shard_to_metadata + ) + + +def distribute_main_replicas_with_precomputed_distribution( + sharded_state_dict: ShardedStateDict, + parallelization_group: torch.distributed.ProcessGroup, + precomputed_distribution: Optional[SaveLoadDistribution], +): + """Applies the save distribution computed with `determine_main_replica_uniform_distribution`. + + Based on rank assignment, sets replica ids of the shards saved by current rank to 0 + and all the other replica ids to 1. + + Args: + sharded_state_dict (ShardedStateDict): state dict to apply the save distribution to + parallelization_group (ProcessGroup): distribution will be applied within this + process group. Must match with the process group passed to + `determine_main_replica_uniform_distribution`. + precomputed_distribution (SaveLoadDistribution): distribution computed with + `determine_main_replica_uniform_distribution` + + Returns: None + + Example replica ids of tensors A, B, C before distribution: + rank0: A: (0, 0, 0), B: (0, 0, 0), C: (0, 0, 0) + rank1: A: (0, 0, 1), B: (0, 0, 1), C: (0, 0, 1) + rank2: A: (0, 0, 2), B: (0, 0, 2), C: (0, 0, 2) + + Replicas after distribution for the example above: + rank0: A: 0, B: 1, C: 1 + rank1: A: 1, B: 0, C: 1 + rank2: A: 1, B: 1, C: 0 + """ + if torch.distributed.get_world_size(group=parallelization_group) <= 1: + return + if precomputed_distribution is None: + raise ValueError( + 'precomputed_distribution must be not None for non-trivial parallelization group' + ) + + local_shards = list( + sh_base + for sh_base in nested_values(sharded_state_dict) + if isinstance(sh_base, ShardedTensor) + ) + + rank_within_dp_group = torch.distributed.get_rank(parallelization_group) + for sh_ten in local_shards: + shard_id = _sharded_tensor_shard_id(sh_ten) + if ( + shard_id in precomputed_distribution.shards_in_this_group + and rank_within_dp_group == precomputed_distribution.main_rank_for_shard[shard_id] + ): + sh_ten.replica_id = 0 + else: + sh_ten.replica_id = 1 + + +T = TypeVar('T') + + +def distribute_shards_to_ranks( + shard_to_ranks: Dict[T, List[int]], shard_to_size: Dict[T, int], num_ranks: int +) -> Dict[T, int]: + """Computes uniform distribution of workload across ranks, based on sizes. + + Currently, the assignment is greedy, based on: + 1. Firstly, the coverage of each shard + (how many ranks the shard is available on; lower coverage is assigned first) + 2. Secondly, the size of each shard (larger size is assigned first) + 3. Finally, shard id for differentiation. + + Third step is added because we rely on the fact that the assignment is deterministic on all ranks. + + Args: + shard_to_ranks (Dict[T, List[int]]): mapping which tells which rank have access to which shards + shard_to_size (Dict[T, int]): sizes of each shard + num_ranks (int): number of ranks in the parallelization group + + Returns (Dict[T, int]): assignment of shard to rank (which rank should do the work + to achieve maximal uniformity) + """ + shard_to_ranks = {k: tuple(v) for k, v in shard_to_ranks.items()} + shard_to_saving_rank = {} + rank_sizes = [(0, rank) for rank in range(num_ranks)] + + # start from tensors with lowest coverage, then go by tensor size from largest (hence minus size) + for shard_id, shard_ranks in sorted( + shard_to_ranks.items(), + key=lambda sh_id_ranks: ( + len(sh_id_ranks[1]), + -shard_to_size[sh_id_ranks[0]], + sh_id_ranks[0], + ), + ): + # assign greedily to the least occupied rank + size, rank = min((size, rank) for size, rank in rank_sizes if rank in shard_ranks) + + shard_to_saving_rank[shard_id] = rank + rank_sizes[rank] = (size + shard_to_size[shard_id], rank) + + logger.debug(f'distribute_shards_to_ranks distribution: {rank_sizes}') + + return shard_to_saving_rank diff --git a/megatron/core/dist_checkpointing/strategies/resharding.py b/megatron/core/dist_checkpointing/strategies/resharding.py new file mode 100644 index 0000000..c1c2bce --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/resharding.py @@ -0,0 +1,315 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" Performant resharding of flattened tensors. + +Tensors that are first sharded (e.g. across TP) and then flattened cause +very irregular access patterns during loading. The idea for performant save/load +is to store tensors with global shape [X, Y, Z] and local shape [x, y, z] +as tensors with global shape [X // x, Y // y, Z // z, x * y * z] and +local shape [1, 1, 1, x * y * z]. This allows parallel save of tensors along the +last (flattened) dimension. During loading, some additional resharding is needed. +""" +import logging +import math +from dataclasses import dataclass +from itertools import product +from typing import Any, Dict, Optional, Tuple, Union + +import numpy as np +import torch +from torch.distributed.checkpoint import ChunkStorageMetadata +from torch.distributed.checkpoint.resharding import _shards_get_overlap_region_wrt_saved_tensor + +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.core import CheckpointingException +from megatron.core.dist_checkpointing.dict_utils import ( + dict_list_map_inplace, + extract_matching_values, +) +from megatron.core.dist_checkpointing.mapping import ( + ReplicaId, + ShardedStateDict, + ShardedTensorFactory, + StateDict, + apply_factories, + apply_factory_merges, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class TensorReformulationMetadata: + """Metadata needed to restore the original tensor shape. + + Args: + ckpt_orig_global_shape (Tuple[int, ...]): original global shape of the tensor + saved in the checkpoint. This is the global shape of the application, + further reformulated into `ckpt_reform_global_shape` while saving. + ckpt_reform_global_shape (Tuple[int, ...]): reformulated global shape of the tensor + saved in the checkpoint. This is the actual saved shape. + """ + + ckpt_orig_global_shape: Tuple[int, ...] + ckpt_reform_global_shape: Tuple[int, ...] + + def __post_init__(self): + assert self.ckpt_orig_global_shape + + +def nd_flattened_tensor_reformulated_global_shape(sh_ten: ShardedTensor) -> Tuple[int, ...]: + """Reformulated global shape of the flattened N-D ShardedTensor. + + N-D tensor global shape [X, Y, Z] and local shape [x, y, z] + is reformulated into global shape [X // x, Y // y, Z // z, x * y * z] and + local shape [1, 1, 1, x * y * z], to allow parallel save of tensors along the + last (flattened) dimension. + + Args: + sh_ten (ShardedTensor): flattened N-D ShardedTensor (N > 1) + + Returns: + Tuple[int, ...]: reformulated tensor shape + """ + assert is_nd_flattened_tensor(sh_ten), sh_ten + return sh_ten.axis_fragmentations + (int(np.prod(sh_ten.local_shape)),) + + +def is_nd_flattened_tensor(sh_ten: Any) -> bool: + """Checks if ShardedTensor is flattened and more than 1-dimensional + + Args: + sh_ten (Any): any object + + Returns: + bool: whether the given object is a flattened ShardedTensor and is N-dimensional (N > 1) + """ + return ( + isinstance(sh_ten, ShardedTensor) + and sh_ten.flattened_range is not None + and len(sh_ten.global_shape) > 1 + ) + + +# information needed to restore. With current implementation, this is a nested state dict +# with ShardedTensorFactories which is basically a ShardedStateDict type +ReformulationRestoreMetadata = ShardedStateDict + + +def apply_nd_flattened_tensors_reformulation( + sharded_state_dict: ShardedStateDict, + reformulation_metadata: Dict[str, TensorReformulationMetadata], +) -> Tuple[ShardedStateDict, ReformulationRestoreMetadata]: + """Applies N-D reformulation to a given sharded state dict. + + After applying the method and loading the reformulated state dict, + the `restore_nd_flattened_tensors_formulation` needs to be applied. + + Current implementation uses ShardedTensorFactories for convenience of + restoring the original structure, but it's just an implementation detail. + Turns N-D ShardedTensors into factories and immediately applies them, + keeping the data needed to restore the original structure. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict potentially + with tensors to reformulate. + reformulation_metadata (Dict[str, TensorReformulationMetadata]): dict + containing all metadata needed for reformulating tensors in `sharded_state_dict`. + for each N-D flattened tensor `sh_ten` in `sharded_state_dict` there must be an + entry with `sh_ten.key`. + + Returns: + tuple: + ShardedStateDict - reformulated sharded state dict + ReformulationRestoreMetadata - data needed to restore the original formulation + with `restore_nd_flattened_tensors_formulation` + """ + + def maybe_reformulate_nd_flattened_tensor(sh_ten: Any): + if not isinstance(sh_ten, ShardedTensor) or not is_nd_flattened_tensor(sh_ten): + return sh_ten + # N-D flattened ShardedTensor + try: + sh_ten_reformulation_metadata = reformulation_metadata[sh_ten.key] + except KeyError as e: + raise CheckpointingException( + f'Missing reformulation metadata for tensor {sh_ten}. Existing keys: {reformulation_metadata.keys()}' + ) from e + + ckpt_actual_saved_shape = sh_ten_reformulation_metadata.ckpt_reform_global_shape + app_actual_load_shape = nd_flattened_tensor_reformulated_global_shape(sh_ten) + if ckpt_actual_saved_shape == app_actual_load_shape: + # Same shape - no need to reshard + return sh_ten + + return reformulate_single_nd_flattened_tensor(sh_ten, sh_ten_reformulation_metadata) + + # Turn N-D tensors into factories and immediately apply them + dict_list_map_inplace(maybe_reformulate_nd_flattened_tensor, sharded_state_dict) + sh_ten_factories, _ = extract_matching_values( + sharded_state_dict, + lambda x: isinstance(x, ShardedTensorFactory), + return_lists_as_dicts=True, + ) + apply_factories(sharded_state_dict) + + # Unlink `data` pointers to free memory + def unlink_data(x): + x.data = None + return x + + dict_list_map_inplace(unlink_data, sh_ten_factories) + return sharded_state_dict, sh_ten_factories + + +def restore_nd_flattened_tensors_formulation( + state_dict: StateDict, formulation_restore_metadata: ReformulationRestoreMetadata +) -> StateDict: + """Restores the original state dict from a reformulated form. + + Inverse of `apply_nd_flattened_tensors_reformulation`. + + Args: + state_dict (StateDict): state dict obtained by loading a reformulated + sharded state dict. + formulation_restore_metadata (ReformulationRestoreMetadata): metadata returned by + `apply_nd_flattened_tensors_reformulation` function + + Returns: + StateDict: state dict with the original tensors formulation restored + """ + return apply_factory_merges(state_dict, formulation_restore_metadata) + + +def reformulate_single_nd_flattened_tensor( + sh_ten: ShardedTensor, reformulation_metadata: TensorReformulationMetadata +) -> Union[Any, ShardedTensorFactory]: + """Reformulates shapes of a single N-D flattened ShardedTensor. + + We need to define a pair of transformations: + - turn N-D ShardedTensor with original formulation into multiple reformulated ShardedTensors + - merge multiple reformulated loaded torch.Tensors into a single original tensor + Current implementation uses ShardedTensorFactories as a convenient mechanism + for specifying and keeping track of those transformations. + + Args: + sh_ten (ShardedTensor): sharded tensor to reformulate. + reformulation_metadata (TensorReformulationMetadata): metadata needed to + perform the reformulation + + Returns: + ShardedTensorFactory: factory that keeps information how to reformulate + (build) the ShardedTensor and then restore original formulation (merge) + after loading. + """ + rmd = reformulation_metadata + # Data won't be needed - remove unnecessary tensor references + sh_ten = sh_ten.without_data() + + # Based on reformulation_metadata, determine other tensor shapes and metadata + ckpt_axis_fragmentation = rmd.ckpt_reform_global_shape[:-1] + for sh, fragm in zip(rmd.ckpt_orig_global_shape, ckpt_axis_fragmentation): + assert sh % fragm == 0, (sh_ten, rmd.ckpt_reform_global_shape) + ckpt_local_shape_with_prepended_axis = tuple( + sh // fragm for sh, fragm in zip(rmd.ckpt_orig_global_shape, ckpt_axis_fragmentation) + ) + assert ( + ckpt_local_shape_with_prepended_axis[: sh_ten.prepend_axis_num] + == (1,) * sh_ten.prepend_axis_num + ), (ckpt_local_shape_with_prepended_axis, sh_ten) + ckpt_local_shape = ckpt_local_shape_with_prepended_axis[sh_ten.prepend_axis_num :] + + # Iterate over reformulated shapes needed by the application and from checkpoint, + # and generate new ShardedTensors that match the checkpoint sharding. + overlap_dim_offsets = [] + assert len(ckpt_axis_fragmentation) == len(sh_ten.axis_fragmentations), ( + ckpt_axis_fragmentation, + sh_ten, + ) + for dim, (app_chunk_dim_offset, ckpt_fragm, app_fragm) in enumerate( + zip( + sh_ten.local_chunk_offset_in_global(), + ckpt_axis_fragmentation, + sh_ten.axis_fragmentations, + ) + ): + # without `int`, it's an exact offset of the app shard expressed in ckpt_local_shape units + first_overlap_dim_offset = int(ckpt_fragm / app_fragm * app_chunk_dim_offset) + # `math.ceil` argument is an exact offset of the app next shard expressed in ckpt_local_shape units + next_overlap_dim_offset = math.ceil(ckpt_fragm / app_fragm * (app_chunk_dim_offset + 1)) + overlap_dim_offsets.append(range(first_overlap_dim_offset, next_overlap_dim_offset)) + + logger.debug( + f'Generated the following number of overlap shards for each dimension: {list(map(len, overlap_dim_offsets))}' + f' for fragmentation ckpt {ckpt_axis_fragmentation} vs app {sh_ten.axis_fragmentations} and chunk offset {sh_ten.local_chunk_offset_in_global()}' + ) + reformulated_sh_tens = {} + for chunk_offset in product(*overlap_dim_offsets): + global_offset = tuple( + chunk_off * chunk_shape + for chunk_off, chunk_shape in zip(chunk_offset, ckpt_local_shape_with_prepended_axis) + ) + reformulated_sh_tens[(global_offset, ckpt_local_shape)] = ShardedTensor( + sh_ten.key, + None, + sh_ten.dtype, + ckpt_local_shape, + rmd.ckpt_orig_global_shape, + global_offset, + ckpt_axis_fragmentation, + sh_ten.replica_id, + sh_ten.prepend_axis_num, + sh_ten.allow_shape_mismatch, + flattened_range=slice(0, rmd.ckpt_reform_global_shape[-1]), # whole ckpt shard + ) + + # Now, we have to define the transformations from application sharding + # to checkpoint sharding. + + @torch.no_grad() + def sh_ten_build_fn(*args, **kwargs): + # Here we simply return the precomputed tensors. + return reformulated_sh_tens + + @torch.no_grad() + def sh_ten_merge_fn(sub_state_dict): + # This is the non-flattened local tensor with original formulation + # that we are going to fill with shards loaded from the checkpoint. + app_non_flat_ten = torch.empty( + sh_ten.local_shape, + dtype=sh_ten.dtype, + device=sh_ten.data.device if sh_ten.data is not None else None, + ) + + assert len(sub_state_dict) > 0 + for (ckpt_global_offset, ckpt_local_shape), ckpt_ten in sub_state_dict.items(): + # For each ckpt shard, we fill the appropriate application shard part + dest_ten = app_non_flat_ten + src_ten = ckpt_ten.view(ckpt_local_shape) + # We don't need narrowing over `prepend_axis_num` axes so we take the [sh_ten.prepend_axis_num:] offsets slice + for ( + dim, + offset_for_saved_tensor, + offset_for_current_tensor, + length, + ) in _shards_get_overlap_region_wrt_saved_tensor( + saved_shard=ChunkStorageMetadata( + ckpt_global_offset[sh_ten.prepend_axis_num :], ckpt_local_shape + ), + current_shard=ChunkStorageMetadata( + sh_ten.global_offset[sh_ten.prepend_axis_num :], sh_ten.local_shape + ), + ): + src_ten = src_ten.narrow(dim, offset_for_saved_tensor, length) + dest_ten = dest_ten.narrow(dim, offset_for_current_tensor, length) + dest_ten.copy_(src_ten) + return app_non_flat_ten.flatten()[sh_ten.flattened_range] + + return ShardedTensorFactory( + sh_ten.key, + sh_ten.data, + sh_ten_build_fn, + sh_ten_merge_fn, + sh_ten.replica_id, + sh_ten.flattened_range, + ) diff --git a/megatron/core/dist_checkpointing/strategies/state_dict_saver.py b/megatron/core/dist_checkpointing/strategies/state_dict_saver.py new file mode 100644 index 0000000..092e91d --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/state_dict_saver.py @@ -0,0 +1,162 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +""" State dict saver for PyT Distributed format allowing asynchronous save. """ + +from logging import getLogger +from time import time +from typing import TYPE_CHECKING, Optional, Tuple, cast + +import torch +import torch.distributed as dist +from torch.distributed.checkpoint import CheckpointException +from torch.distributed.checkpoint.default_planner import DefaultSavePlanner +from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE, Metadata +from torch.distributed.checkpoint.planner import SavePlan, SavePlanner +from torch.distributed.checkpoint.utils import _DistWrapper, _get_failure_dict + +if TYPE_CHECKING: + from .filesystem_async import FileSystemWriterAsync + + +logger = getLogger(__name__) + + +def save_state_dict_async_plan( + state_dict: STATE_DICT_TYPE, + storage_writer: 'FileSystemWriterAsync', + process_group: Optional[dist.ProcessGroup] = None, + coordinator_rank: int = 0, + planner: Optional[SavePlanner] = None, + cached_ckpt_structure: Optional[Tuple[SavePlan, SavePlan, bool]] = None, +) -> Tuple[Tuple['FileSystemWriterAsync', Metadata, _DistWrapper], SavePlan, bool]: + """ + First stage of saving a state dict to storage. + + This is an async adjustment of torch.distributed.checkpoint.state_dict_saver. + In order to support async save, saving should be split into three parts: + 1. Planning + 2. Actual saving + 3. Finalization + + Out of these, step (2) *must* happen asynchronously. + The first step is realized with this function. + + The planning part consists of several steps, described here: + https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.SavePlanner + + Args: + state_dict (STATE_DICT_TYPE): state dict to save + storage_writer (FileSystemWriterAsync): in current version only an instance of + FileSystemWriterAsync + process_group (dist.ProcessGroup, optional): process group used for save planning + coordinator_rank (int, optional): coordinator rank for planning. Defaults to 0. + planner (SavePlanner, optional): save planner for torch.distributed.checkpoint format + cached_ckpt_structure (Tuple[SavePlan, SavePlan, bool], Optional): + Each object of this tuple will be used in the order as following + cached_central_plan (SavePlan): a globally coordinated save plan + cached in the previous iteration + cached_local_plan (SavePlan): a local plan + cached in the previous iteration + validated_cache_reuse (bool): boolean value to tell global_metadata and planning dict + is consistent over iterations + + Returns: Tuple of: + - storage writer (the one passed as input) + - metadata from planning + - distributed wrapper used for planning + The return value of this function should be passed as an input to + `save_state_dict_async_finalize` and cached_plan to skip `reduce_scatter` at planning. + """ + cached_central_plan, cached_local_plan, validated_cache_reuse = (None, None, False) + if cached_ckpt_structure: + cached_central_plan, cached_local_plan, validated_cache_reuse = cached_ckpt_structure + + rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0 + dist_wrapper = _DistWrapper(process_group, True, coordinator_rank) + if planner is None: + planner = DefaultSavePlanner() + assert planner is not None + + global_metadata = None + logger.debug(f"rank: {rank}, starting state dict save") + local_plan = cached_local_plan + + def local_step(): + nonlocal local_plan + assert planner is not None + planner.set_up_planner(state_dict, dist_wrapper.is_coordinator) + storage_writer.set_up_storage_writer(dist_wrapper.is_coordinator) + if not validated_cache_reuse and local_plan is None: + local_plan = planner.create_local_plan() + local_plan = storage_writer.prepare_local_plan(local_plan) + return local_plan + + def global_step(all_local_plans): + nonlocal global_metadata + assert planner is not None + all_local_plans, global_metadata = planner.create_global_plan(all_local_plans) + all_local_plans = storage_writer.prepare_global_plan(all_local_plans) + return all_local_plans + + # Execute local and global planning + start_plan = time() + if validated_cache_reuse and cached_central_plan: + logger.debug(f"rank: {rank}, Passed cache reusable") + local_step() + central_plan = cached_central_plan + else: + central_plan = dist_wrapper.reduce_scatter("plan", local_step, global_step) + central_plan = planner.finish_plan(central_plan) + end_plan = time() + logger.debug(f"rank: {rank}, plan time: {end_plan - start_plan}") + # Prepare async writing of tensors. + # The `storage_writer` will store the information about tensors it needs to save + start = time() + storage_writer.prepare_write_data(central_plan, planner) + end = time() + logger.debug(f"{time()} rank: {rank}, write(async) time: {end - start}") + return ( + (storage_writer, cast(Metadata, global_metadata), dist_wrapper), + central_plan, + local_plan, + cached_central_plan == central_plan, + ) + + +def save_state_dict_async_finalize( + storage_writer: 'FileSystemWriterAsync', + global_metadata: Metadata, + dist_wrapper: _DistWrapper, +) -> None: + """ + Finalization of save_state_dict_async_plan. + + The input arguments are the same as the save_state_dict_async_plan output, + the `write_results` are retrieved from the storage_writer. + + Args: + storage_writer (FileSystemWriterAsync): storage writer used for planning + global_metadata (Metadata): metadata created during planning + dist_wrapper (_DistWrapper): distributed wrapper created during planning + + Returns: None + """ + write_results = storage_writer.retrieve_write_results() + + # Gather the write results that will be saved to the metadata file. + gather_start = time() + all_results = dist_wrapper.gather_object(write_results) + gather_end = time() + logger.debug(f"{gather_end}, {torch.distributed.get_rank()}, gather: {gather_end-gather_start}") + + # Store the metadata on coordinator rank + if dist_wrapper.is_coordinator: + node_failures = _get_failure_dict(all_results) + if len(node_failures) == 0: + assert global_metadata is not None + write_start = time() + storage_writer.finish(global_metadata, all_results) + write_end = time() + logger.debug(f"{write_end}, metadata_write: {write_end - write_start}") + else: + raise CheckpointException("write", node_failures) diff --git a/megatron/core/dist_checkpointing/strategies/tensorstore.py b/megatron/core/dist_checkpointing/strategies/tensorstore.py new file mode 100644 index 0000000..61972ec --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/tensorstore.py @@ -0,0 +1,131 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Strategies using TensorStore to load and save Zarr arrays. """ + +from functools import partial +from itertools import starmap +from logging import getLogger +from pathlib import Path + +import tensorstore as ts +import torch + +from ..core import CheckpointingException +from ..dict_utils import dict_list_map_inplace +from ..mapping import ShardedStateDict, ShardedTensor +from .base import LoadShardedStrategy, StrategyAction, default_strategies +from .zarr import ( + load_zarr_based_sharded_metadata, + numpy_to_torch_dtype_dict, + postprocess_numpy_array, +) + +_import_trigger = None + +logger = getLogger(__name__) + + +class TensorStoreLoadShardedStrategy(LoadShardedStrategy): + def __init__(self, load_directly_on_device: bool = False): + super().__init__() + self.load_directly_on_device = load_directly_on_device + + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + if torch.distributed.get_rank() == 0: + print(f'Loading distributed checkpoint with {self.__class__.__name__}') + if self.load_directly_on_device: + print(f'Loading distributed checkpoint directly on the GPU') + load_fn = partial( + _load_from_array, + checkpoint_dir=checkpoint_dir, + load_directly_on_device=self.load_directly_on_device, + ) + dict_list_map_inplace(load_fn, sharded_state_dict) + return sharded_state_dict + + def load_tensors_metadata(self, checkpoint_dir: Path): + def get_ts_shape_dtype(path): + arr = open_ts_array(path) + return arr.shape, arr.dtype.numpy_dtype + + return load_zarr_based_sharded_metadata(checkpoint_dir, get_ts_shape_dtype) + + def check_backend_compatibility(self, loaded_version): + pass # TODO + + def check_version_compatibility(self, loaded_version): + pass # TODO + + +def merge_global_slice_with_shape(global_slice, actual_shape, key): + def _merge_slice(dim_slice, dim_size): + if isinstance(dim_slice, slice): + assert ( + dim_slice.start < dim_size + ), f'Got empty slice for ShardedTensor {key} ({dim_slice}, {dim_size})' + if dim_slice.stop > dim_size: + dim_slice = slice(dim_slice.start, dim_size, dim_slice.step) + return dim_slice + + assert len(global_slice) == len(actual_shape), (global_slice, actual_shape, key) + return tuple(starmap(_merge_slice, zip(global_slice, actual_shape))) + + +def _load_from_array( + sharded_tensor: ShardedTensor, + checkpoint_dir: Path, + load_directly_on_device: bool = False, + apply_flattened_range: bool = True, +): + x = _load_regular_chunk(sharded_tensor, checkpoint_dir) + ten = postprocess_numpy_array(x, sharded_tensor, apply_flattened_range) + if load_directly_on_device: + sharded_tensor.data.data.copy_(ten) + return sharded_tensor.data + else: + return ten + + +def _load_regular_chunk(sharded_tensor: ShardedTensor, checkpoint_dir: Path): + assert isinstance(sharded_tensor, ShardedTensor), type(sharded_tensor) + arr = open_ts_array(checkpoint_dir / sharded_tensor.key) + if sharded_tensor.global_shape == arr.shape: + x = ( + arr[sharded_tensor.global_slice()].read().result() + ) # flattened tensors loading is delayed + elif sharded_tensor.allow_shape_mismatch: + global_slice = merge_global_slice_with_shape( + sharded_tensor.global_slice(), arr.shape, sharded_tensor.key + ) + x = arr[global_slice].read().result() # flattened tensors loading is delayed + else: + _msg = ( + f'Global shape mismatch for loaded ({arr.shape})' + f' and expected ({sharded_tensor.global_shape}) tensor' + f' for key {sharded_tensor.key}' + ) + raise CheckpointingException(_msg) + return x + + +def open_ts_array(arr_path: Path): + """Opens a Zarr file array with Tensorstore with basic setting. + + Args: + arr_path (Path): path to a Zarr (Tensorstore) array + """ + spec = {'driver': 'zarr', 'metadata_key': '.zarray', 'kvstore': {}} + spec['kvstore'] = { + 'driver': 'file', + 'path': str(arr_path), + } + try: + arr = ts.open(ts.Spec(spec), open=True).result() + except Exception as e: + raise CheckpointingException(f'Array {arr_path} could not be loaded. Error: {e}') from e + return arr + + +default_strategies[StrategyAction.LOAD_SHARDED.value][ + ('zarr', 1) +] = TensorStoreLoadShardedStrategy() diff --git a/megatron/core/dist_checkpointing/strategies/torch.py b/megatron/core/dist_checkpointing/strategies/torch.py new file mode 100644 index 0000000..d42d3cc --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/torch.py @@ -0,0 +1,815 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Strategies using PyTorch distributed.checkpoint as an underlying format. """ +import dataclasses +import io +import itertools +import math +from collections import ChainMap, defaultdict +from dataclasses import dataclass +from itertools import product +from logging import getLogger +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast + +import numpy as np +import torch +from pkg_resources import packaging +from torch.distributed import checkpoint +from torch.distributed._shard._utils import narrow_tensor_by_index +from torch.distributed._shard.metadata import ShardMetadata +from torch.distributed._shard.sharded_tensor import Shard, ShardedTensorMetadata, TensorProperties +from torch.distributed._sharded_tensor import ShardedTensor as TorchShardedTensor +from torch.distributed.checkpoint import ( + BytesStorageMetadata, + DefaultLoadPlanner, + DefaultSavePlanner, + FileSystemReader, + LoadPlan, + LoadPlanner, + Metadata, + ReadItem, + SavePlan, + TensorStorageMetadata, + WriteItem, +) +from torch.distributed.checkpoint._nested_dict import FLATTEN_MAPPING, unflatten_state_dict +from torch.distributed.checkpoint._traverse import OBJ_PATH, traverse_state_dict +from torch.distributed.checkpoint.default_planner import create_default_local_save_plan +from torch.distributed.checkpoint.metadata import Metadata +from torch.distributed.checkpoint.planner import LoadItemType +from torch.distributed.checkpoint.planner_helpers import _create_write_items +from torch.futures import Future + +from ..core import CheckpointingException +from ..dict_utils import extract_matching_values, nested_values +from ..mapping import ( + ShardedBase, + ShardedObject, + ShardedStateDict, + ShardedTensor, + ShardedTensorFactory, + StateDict, + apply_factories, + apply_factory_merges, + is_main_replica, +) +from .async_utils import AsyncRequest +from .base import AsyncSaveShardedStrategy, LoadShardedStrategy, StrategyAction, default_strategies +from .filesystem_async import FileSystemWriterAsync +from .resharding import ( + TensorReformulationMetadata, + apply_nd_flattened_tensors_reformulation, + is_nd_flattened_tensor, + nd_flattened_tensor_reformulated_global_shape, + restore_nd_flattened_tensors_formulation, +) +from .state_dict_saver import save_state_dict_async_finalize, save_state_dict_async_plan + +try: + from transformer_engine.pytorch.float8_tensor import Float8Tensor + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +_import_trigger = None + +logger = getLogger(__name__) + + +def flatten_state_dict( + state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, Dict[str, OBJ_PATH]]: + """Flattens state dict into a single level dict. + + It's a copy of torch.distributed.checkpoint._nested_dict.flatten_state_dict + which also accepts ShardedBase tensors as terminal objects + + Args: + state_dict (ShardedStateDict): state dict to be flattened + + Returns (tuple): flattened state dict and a mapping allowing to recreate the original one + + """ + flattened = {} + mappings = {} + + def flat_copy(path: OBJ_PATH, value: Any) -> None: + new_fqn = ".".join(map(str, path)) + if new_fqn in flattened: + raise ValueError(f"duplicated flatten key {new_fqn}") + flattened[new_fqn] = value + mappings[new_fqn] = path + + traverse_state_dict(state_dict, flat_copy, lambda x: isinstance(x, (torch.Tensor, ShardedBase))) + return flattened, mappings + + +def sharded_tensor_to_torch_sharded_tensor( + sh_tens: List[ShardedTensor], rank: Optional[int] = None +) -> TorchShardedTensor: + """Convert MCore ShardedTensor to PyT ShardedTensor. PyT requires information about all chunks. + + On high-level, this function follows the logic of torch.distributed.fsdp._shard_utils._create_chunk_sharded_tensor. + Additionally, it saves `prepend_axis_num` and `has_flattened_range` (specific to MCore) as attributes + for further restoration in `_unwrap_pyt_sharded_tensor`. + + NOTE: this function assumes regular (grid) sharding of the MCore ShardedTensor. + The only local irregularities could be introduced with a `flattened_range` attribute. + + This function handles 3 different type of ShardedTensors: + 1. Non-flat regular ShardedTensors (`not has_flattened_range`) + 2. 1D flattened ShardedTensors (`is_flattened_range_1d`) + 3. N-D flattened ShardedTensors (`has_flattened_range`) + + (1) and (2) type are saved according to their original shape. + Type (3) however requires global shape adjustment for efficiency: + we treat [X, Y, Z] global shape tensor with local shape [x, y, z] + as a [X // x, Y // y, Z // z, x * y * z] tensor with last axis + partitioned according to `flattened_range` slices. + This will need special handling while resharding. + + Args: + sh_tens (List[ShardedTensor]): list of sharded tensors to convert + rank (int, optional): current process rank passed to PyT ShardedTensor. + If None, assumes rank in the default pg. + + Returns (TorchShardedTensor): PyT ShardedTensor containing all passed shards. + + """ + if rank is None: + rank = torch.distributed.get_rank() + + some_sh_ten = sh_tens[0] + has_flattened_range = some_sh_ten.flattened_range is not None + is_flattened_range_1d = has_flattened_range and len(some_sh_ten.global_shape) == 1 + + for sh_ten in sh_tens: + assert (sh_ten.flattened_range is not None) == has_flattened_range, sh_tens + if not sh_ten.data.is_contiguous(): + sh_ten.data = sh_ten.data.contiguous() + + local_global_offsets = {} + + prepend_axis_num = sh_tens[0].prepend_axis_num + # Determine local shards according to tensor type (see docs) + if is_flattened_range_1d: + # Type (2) case: 1D flattened ShardedTensors + for sh_ten in sh_tens: + assert len(sh_ten.global_offset) == 1, sh_ten + assert sh_ten.prepend_axis_num == 0, sh_ten + local_global_offsets.setdefault(sh_ten.global_offset, []).append(sh_ten) + + global_shape = some_sh_ten.global_shape + offsets_shape = ( + some_sh_ten.local_shape + ) # local shape is not flattened, we need it for chunk offsets + + local_shards = [ + Shard.from_tensor_and_offsets( + sh_ten.data, + [ + sh_ten.global_offset[0] + sh_ten.flattened_range.start + ], # additional flattened offset + rank, + ) + for sh_ten in sh_tens + ] + + elif has_flattened_range: + # Type (3) case: N-D flattened ShardedTensors + for sh_ten in sh_tens: + local_global_offsets.setdefault(sh_ten.local_chunk_offset_in_global(), []).append( + sh_ten + ) + assert sh_ten.data.ndim == 1, sh_ten + sh_ten.data = sh_ten.data.view((1,) * len(sh_ten.global_shape) + (-1,)) + + # Global shape reformulation: + global_shape = nd_flattened_tensor_reformulated_global_shape(some_sh_ten) + offsets_shape = (1,) * len( + some_sh_ten.global_shape + ) # reformulated global shape has shape equal ti number of local chunks + + local_shards = [ + Shard.from_tensor_and_offsets( + sh_ten.data, + list( + sh_ten.local_chunk_offset_in_global() + (sh_ten.flattened_range.start,) + ), # additional flattened offset + rank, + ) + for sh_ten in sh_tens + ] + else: + # Type (1) case: non-flat regular ShardedTensors + for sh_ten in sh_tens: + local_global_offsets.setdefault(sh_ten.global_offset, []).append(sh_ten) + sh_ten.data = sh_ten.data.view( + (1,) * prepend_axis_num + sh_ten.local_shape + ) # adjust to prepended_axis_num + + global_shape = some_sh_ten.global_shape + offsets_shape = some_sh_ten.data.shape # includes prepended axes + + local_shards = [ + Shard.from_tensor_and_offsets( + sh_ten.data, list(sh_ten.global_offset), rank # simple case + ) + for sh_ten in sh_tens + ] + + # Create a ShardedTensor without invoking communication. Determine global shards + shard_metadata = [] + # NOTE: here we assume a regular grid of shards + for fragment_offsets in itertools.product(*map(range, some_sh_ten.axis_fragmentations)): + offset = tuple(map(lambda x: x[0] * x[1], zip(fragment_offsets, offsets_shape))) + if offset in local_global_offsets: + # local shard + placement = f"rank:{rank}/cuda" + for sh_ten in local_global_offsets[offset]: + if is_flattened_range_1d: + offset = (sh_ten.global_offset[0] + sh_ten.flattened_range.start,) + size = sh_ten.data.shape + elif has_flattened_range: + assert offset == sh_ten.local_chunk_offset_in_global() + # This is not an actual offset, but an offset of the whole shard + # This is needed for a PyT Dist internal integrity check + offset = sh_ten.local_chunk_offset_in_global() + (0,) + size = (1,) * len(offsets_shape) + global_shape[-1:] + else: + size = sh_ten.data.shape + shard_metadata.append(ShardMetadata(offset, size, placement)) + + else: + # for shards from other ranks we provide simplistic data - this information will be discarded + # during TorchShardedTensor._init_from_local_shards_and_global_metadata call + if has_flattened_range and not is_flattened_range_1d: + offset = offset + (0,) + size = (1,) * len(offsets_shape) + global_shape[-1:] + else: + size = offsets_shape + shard_metadata.append(ShardMetadata(offset, size, "cuda")) + + tensor = some_sh_ten.data + sharded_tensor_metadata = ShardedTensorMetadata( + shards_metadata=shard_metadata, + size=torch.Size(global_shape), + tensor_properties=TensorProperties( + dtype=tensor.dtype, + layout=tensor.layout, + requires_grad=tensor.requires_grad, + memory_format=torch.contiguous_format, + pin_memory=tensor.is_pinned(), + ), + ) + pyt_sh_ten = TorchShardedTensor._init_from_local_shards_and_global_metadata( + local_shards, sharded_tensor_metadata=sharded_tensor_metadata, process_group=None + ) + # Store MCore related data as PyTShardedTensor attribute. This won't be stored in the checkpoint, only for runtime purposes + pyt_sh_ten.mcore_sh_ten = sh_ten.without_data() + pyt_sh_ten.mcore_metadata = {} + if has_flattened_range and not is_flattened_range_1d: + pyt_sh_ten.mcore_metadata['nd_reformulated_orig_global_shape'] = sh_ten.global_shape + return pyt_sh_ten + + +def mcore_to_pyt_state_dict( + state_dict: Dict[str, List[ShardedBase]], + is_loading: bool = False, + init_device: torch.device = torch.device("cpu"), +) -> Dict[str, Union[TorchShardedTensor, io.BytesIO]]: + """Turn state dict with ShardedTensors and ShardedObjects to state dict compatible with PyT Dist format. + + Operates in-place and returns the original state dict. + + Args: + state_dict (Dict[str, List[ShardedBase]]): flattened state dict, where values + are lists of either ShardedTensor or ShardedObjects. + is_loading (bool, optional): flag indicating if loading or saving. Defaults to False. + init_device (torch.device, optional): device to initialize potentially missing tensors + during loading. Defaults to 'cpu'. + + Returns (Dict[str, Union[TorchShardedTensor, io.BytesIO]]): original dictionary with values + converted either into PyT ShardedTensors or io.BytesIO. + + """ + rank = torch.distributed.get_rank() + pyt_state_dict = {} + + def _mcore_to_torch_sharded_tensor(sh_tens: List[ShardedTensor]) -> TorchShardedTensor: + """Build a PyT ShardedTensor from given shards. + + During loading: + - if data is None, initialize it with an empty tensor (will be used to copy the data into) + - if `allow_shape_mismatch` is True, the data is initialized with zeros + prior to loading (not all parts of the tensor will be read from the checkpoint) + """ + assert all(isinstance(sh_ten, ShardedTensor) for sh_ten in sh_tens), sh_tens + for sh_ten in sh_tens: + if sh_ten.data is None: + if is_loading: + sh_ten.init_data( + init_device, + init_fn=torch.zeros if sh_ten.allow_shape_mismatch else torch.empty, + ) + else: + raise CheckpointingException(f'`data` attr is None for {sh_ten}') + else: + sh_ten.data = sh_ten.data.detach() + if sh_ten.allow_shape_mismatch and is_loading: + sh_ten.data.zero_() + + torch_sh_ten = sharded_tensor_to_torch_sharded_tensor(sh_tens, rank) + torch_sh_ten.key = sh_tens[0].key + return torch_sh_ten + + def _mcore_to_torch_sharded_object(sh_objs: List[ShardedObject]) -> io.BytesIO: + """Build io.BytesIO from given sharded objects data.""" + assert all(isinstance(sh_obj, ShardedObject) for sh_obj in sh_objs), sh_objs + serialized_data = io.BytesIO() + torch.save([sh_obj.data for sh_obj in sh_objs], serialized_data) + return serialized_data + + for k, v in state_dict.items(): + if isinstance(v[0], ShardedTensor): + v = cast(List[ShardedTensor], v) + pyt_state_dict[k] = _mcore_to_torch_sharded_tensor(v) + else: + v = cast(List[ShardedObject], v) + pyt_state_dict[k] = _mcore_to_torch_sharded_object(v) + + return pyt_state_dict + + +def _unwrap_pyt_sharded_tensor(sh_ten: TorchShardedTensor) -> List[torch.Tensor]: + """Unwrap tensor from PyT ShardedTensor instance. + + If `prepend_axis_num` was non-zero (which is specific to MCore ShardedTensor) + then the tensor has additional singleton dimensions which should be squeezed. + """ + mcore_sh_ten = sh_ten.mcore_sh_ten + ret_tensors = [] + for sh in sh_ten.local_shards(): + ten = sh.tensor + if mcore_sh_ten.flattened_range is not None: + assert ten.shape[:-1] == (1,) * (len(ten.shape) - 1), ten.shape + ten = ten.view(-1) + else: + for _ in range(mcore_sh_ten.prepend_axis_num): + ten = ten.squeeze(0) + ret_tensors.append(ten) + return ret_tensors + + +def _replace_state_dict_keys_with_sharded_keys( + sharded_state_dict: ShardedStateDict, keep_only_main_replica: bool = False +) -> Tuple[Dict[str, List[ShardedBase]], FLATTEN_MAPPING, Dict[str, List[str]]]: + """Group ShardedBase objects by keys and return mappings required for recreating the original dict.""" + flat_sd, flat_mapping = flatten_state_dict(sharded_state_dict) + rename_mapping = defaultdict(list) + new_flat_sd = defaultdict(list) + for k, sh_base in flat_sd.items(): + assert isinstance(sh_base, ShardedBase), type(sh_base) + key = sh_base.unique_key if isinstance(sh_base, ShardedObject) else sh_base.key + if is_main_replica(sh_base.replica_id) or not keep_only_main_replica: + rename_mapping[key].append(k) + new_flat_sd[key].append(sh_base) + return new_flat_sd, flat_mapping, rename_mapping + + +def _replace_sharded_keys_with_state_dict_keys( + state_dict: Dict[str, List[Union[torch.Tensor, io.BytesIO]]], + flat_mapping: FLATTEN_MAPPING, + rename_mapping: Dict[str, List[str]], +): + """Inverse of _replace_state_dict_keys_with_sharded_keys.""" + recovered_sd = {} + for k, tensors in state_dict.items(): + assert len(tensors) == len(rename_mapping[k]) + for ten, recovered_k in zip(tensors, rename_mapping[k]): + recovered_sd[recovered_k] = ten + + return unflatten_state_dict(recovered_sd, flat_mapping) + + +def _restore_dict_types(x: Union[dict, list, Any], keys_template: Union[dict, list, Any]): + """Recursively update `x` keys, based on `keys_template`.""" + if isinstance(keys_template, dict): + assert isinstance(x, dict), type(x) + for k, v in keys_template.items(): + if not isinstance(k, str): + assert str(k) in x, (k, x.keys) + x[k] = x.pop(str(k)) + _restore_dict_types(x[k], v) + elif isinstance(keys_template, list): + assert isinstance(x, list), type(x) + for x_val, templ_val in zip(x, keys_template): + _restore_dict_types(x_val, templ_val) + + +@dataclass(frozen=True) +class MCoreSavePlan(SavePlan): + mcore_data: Dict[str, Dict[str, Any]] = None # Mcore related data about each tensor + + +class MCoreSavePlanner(DefaultSavePlanner): + """Differs with the default planner by saving BytesIO objects on all ranks. + + In the integration of MCore with PyT Distributed format, BytesIO objects + come from ShardedObjects, which should be treated as separate objects on each rank + (not common on all ranks). + + Also, the objects are already packed in io.BytesIO, so no need to redo it + in transform_object. + """ + + def __init__( + self, + *args, + dedup_replicated_tensors: Optional[bool] = None, + nd_flattened_global_shapes: Optional[Dict[str, Tuple[int, ...]]] = None, + **kwargs, + ) -> None: + # `dedup_replicated_tensors` was deprecated in 2.3 - this avoids tons of warnings during saving + if packaging.version.Version(torch.__version__) <= packaging.version.Version("2.2"): + kwargs['dedup_replicated_tensors'] = dedup_replicated_tensors + super().__init__(*args, **kwargs) + self.nd_flattened_global_shapes = nd_flattened_global_shapes or {} + + def create_local_plan(self) -> SavePlan: + plan = create_default_local_save_plan(self.state_dict, self.is_coordinator) + self._add_non_coordinator_iobytes_request(plan) + if self.flatten_state_dict: + plan = dataclasses.replace(plan, planner_data=self.mappings) + plan = MCoreSavePlan( + items=plan.items, + storage_data=plan.storage_data, + planner_data=plan.planner_data, + mcore_data={ + k: sh_ten.mcore_metadata + for k, sh_ten in self.state_dict.items() + if isinstance(sh_ten, TorchShardedTensor) + }, + ) + self.plan = plan + + return self.plan + + def create_global_plan(self, all_plans: List[MCoreSavePlan]) -> Tuple[List[SavePlan], Metadata]: + global_plan, metadata = super().create_global_plan(all_plans) + metadata.mcore_data = dict(ChainMap(*(plan.mcore_data for plan in all_plans))) + return global_plan, metadata + + def _add_non_coordinator_iobytes_request(self, plan): + if self.is_coordinator: + return + for fqn, obj in self.state_dict.items(): + if isinstance(obj, io.BytesIO): + plan.items.extend(_create_write_items(fqn, obj)) + + def transform_object(self, write_item: WriteItem, object: Any): + return object + + +class MCoreLoadPlanner(DefaultLoadPlanner): + """Adds global shape validation to the default planner. + + If global shape validation can be ignored (shouldn't!), the default + load planner can be used. + """ + + def __init__( + self, *args, shapes_validation_sharded_tensors: Iterable[ShardedTensor] = (), **kwargs + ) -> None: + super().__init__(*args, **kwargs) + self.shapes_validation_sharded_tensors = shapes_validation_sharded_tensors + self._intermediate_read_item_and_target: Optional[Tuple[ReadItem, torch.Tensor]] = None + + def _validate_global_shapes(self, metadata, sharded_tensors): + for sh_ten in sharded_tensors: + loaded_shape = metadata.state_dict_metadata[sh_ten.key].size + if not is_nd_flattened_tensor(sh_ten): + expected_shape = sh_ten.global_shape + else: + expected_shape = nd_flattened_tensor_reformulated_global_shape(sh_ten) + if loaded_shape != expected_shape: + _msg = ( + f'Global shape mismatch for loaded ({loaded_shape})' + f' and expected ({expected_shape}) tensor' + f' for key {sh_ten.key}' + ) + raise CheckpointingException(_msg) + + def create_local_plan(self) -> LoadPlan: + self._validate_global_shapes(self.metadata, self.shapes_validation_sharded_tensors) + return super().create_local_plan() + + def resolve_tensor(self, read_item: ReadItem): + """Override to add FP8 support. + + Narrowing the Float8Tensor can create incontiguous tensors and there are + no `copy` kernels for such cases. This method creates a contiguous FP8 + tensors so that the subsequent `copy_` in FileSystemReader succeeds. + Note that this requires tracking the original tensor + (as `self._intermediate_read_item_and_target` attribute) + and restoring it in `commit_tensor` method. + """ + target_tensor = super().resolve_tensor(read_item) + if ( + not target_tensor.is_contiguous() + and HAVE_TE + and isinstance(target_tensor, Float8Tensor) + ): + self._intermediate_read_item_and_target = (read_item, target_tensor) + target_tensor = Float8Tensor.make_like( + target_tensor, + data=target_tensor._data.contiguous(), + ) + return target_tensor + + def commit_tensor(self, read_item: ReadItem, tensor: torch.Tensor) -> None: + """Restores the original FP8 tensor saved in `resolve_tensor`.""" + if self._intermediate_read_item_and_target is not None: + interm_read_item, target_tensor = self._intermediate_read_item_and_target + assert ( + interm_read_item is read_item + ), '`commit_tensor` method should be called right after `resolve_tensor`' + target_tensor.copy_(tensor) + tensor = target_tensor + self._intermediate_read_item_and_target = None + return super().commit_tensor(read_item, tensor) + + +class TorchDistSaveShardedStrategy(AsyncSaveShardedStrategy): + """Async save strategy for the PyT Distributed format. + + The idea is to translate MCore ShardedTensors into PyT ShardedTensors + and use the async-adjusted torch.distributed.checkpoint saving mechanism + provided by the FileSystemWriterAsync writer. + """ + + def __init__( + self, + backend: str, + version: int, + keep_only_main_replica: bool = True, + thread_count: int = 2, + cached_metadata: bool = False, + ): + """Adds parameters specific to PyT Distributed format + Args: + backend (str): format backend string + version (int): format version + keep_only_main_replica (bool, optional): PyT Distributed has a mechanism + for deduplication, but replica_id aware deduplication is more coherent. + Default is True (recommended to keep it). + thread_count (int, optional): threads to use during saving. + Affects the number of files in the checkpoint (saving ranks * num_threads). + cached_metadata (bool, optional): Enables using cached global metadata to avoid + gathering local metadata every checkpointing invocation + """ + super().__init__(backend, version) + self.keep_only_main_replica = keep_only_main_replica + self.thread_count = thread_count + + # Cached SavePlans to skip plan in `save_state_dict_async_plan` + # cached outcome of `SavePlan.prepare_global_plan`, which aggregates local plans from all ranks + self.cached_central_plan: SavePlan = None + # cached outcome of `SavePlan.prepare_local_plan` describes how local state_dict is written + self.cached_local_plan: SavePlan = None + # Cached global metadata, only `coordinator` for dist-ckpt holds if central plans are consistent over iters + self.cached_global_metadata: Metadata = None + # This variable records if the ckpt structures are consistent + # so the following checkpoint savings reuse `cached_global_metadata` + self.validated_cache_reuse: bool = False + # The knob to enable cached metadata communication in saving + self.use_cached_ckpt_structure: bool = cached_metadata + + def async_save( + self, + sharded_state_dict: ShardedStateDict, + checkpoint_dir: Path, + ) -> AsyncRequest: + """Translates MCore ShardedTensors to PyT ShardedTensors and saves in PyT Distributed format. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to save + checkpoint_dir (Path): checkpoint directory + + Returns: None + """ + # Translate the state dict + ( + sharded_state_dict, + flat_mapping, + rename_mapping, + ) = _replace_state_dict_keys_with_sharded_keys( + sharded_state_dict, self.keep_only_main_replica + ) + pyt_state_dict = mcore_to_pyt_state_dict(sharded_state_dict, False) + # Use PyT saving mechanism + writer = FileSystemWriterAsync(checkpoint_dir, thread_count=self.thread_count) + # This should be set differently if we run in a smaller process group than the default + coordinator = 0 + # Try twice to validate the generated `central_plan` is the same across iterations + # If so, reuse `cached_central_plan` and `cached_global_metadata` + # From the 3rd iteration, `save_state_dict_async_plan` will not generate `global_metadata` + # (return None) so `self.cached_global_metadata` is reused + args_cached_plans = None + if self.use_cached_ckpt_structure: + args_cached_plans = ( + self.cached_central_plan, + self.cached_local_plan, + self.validated_cache_reuse, + ) + + ( + save_state_dict_ret, + self.cached_central_plan, + self.cached_local_plan, + self.validated_cache_reuse, + ) = save_state_dict_async_plan( + pyt_state_dict, + writer, + None, + coordinator, + planner=MCoreSavePlanner(dedup_replicated_tensors=not self.keep_only_main_replica), + cached_ckpt_structure=args_cached_plans, + ) + rank = torch.distributed.get_rank() + if self.use_cached_ckpt_structure: + if self.validated_cache_reuse: + logger.debug(f"rank: {rank}, cache validated") + if save_state_dict_ret[1]: # when global_metadata is not cached + self.cached_global_metadata = save_state_dict_ret[1] # Cache Metadata + # Only Coordinator rank holds cached global_metadata + # (None is returned for global_metadata) + elif coordinator == rank: + logger.debug(f"rank: {rank}, reuse metadata, {save_state_dict_ret[1]}") + save_state_dict_ret = list(save_state_dict_ret) + save_state_dict_ret[1] = self.cached_global_metadata + + return self._get_save_and_finalize_callbacks(writer, save_state_dict_ret) + + def _get_save_and_finalize_callbacks(self, writer, save_state_dict_ret) -> AsyncRequest: + save_fn_args = writer.get_save_function_and_args() + save_fn, save_args = save_fn_args + + def finalize_fn(): + save_state_dict_async_finalize(*save_state_dict_ret) + torch.distributed.barrier() + + return AsyncRequest(save_fn, save_args, [finalize_fn]) + + def can_handle_sharded_objects(self): + return True + + +def get_reformulation_metadata( + sharded_state_dict: ShardedStateDict, checkpoint_dir: Path +) -> Dict[str, TensorReformulationMetadata]: + ckpt_metadata = FileSystemReader(checkpoint_dir).read_metadata() + reformulation_metadata = {} + for sh_ten in nested_values(sharded_state_dict): + if not is_nd_flattened_tensor(sh_ten): + continue + try: + ckpt_global_shape = ckpt_metadata.mcore_data[sh_ten.key][ + 'nd_reformulated_orig_global_shape' + ] + except KeyError as e: + raise CheckpointingException( + f'Cannot find global shape metadata for N-D flattened tensor {sh_ten} in checkpoint metadata: {ckpt_metadata.mcore_data}' + ) from e + + reformulation_metadata[sh_ten.key] = TensorReformulationMetadata( + ckpt_global_shape, ckpt_metadata.state_dict_metadata[sh_ten.key].size + ) + return reformulation_metadata + + +class TorchDistLoadShardedStrategy(LoadShardedStrategy): + """Basic load strategy for the PyT Distributed format.""" + + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> StateDict: + """Translates MCore ShardedTensors to PyT ShardedTensors and loads from PyT Distributed format. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict with mapping + information to instruct loading + checkpoint_dir (Path): checkpoint directory + + Returns: loaded state dict + """ + # Apply N-D tensors resharding + sharded_state_dict, formulation_restore_data = apply_nd_flattened_tensors_reformulation( + sharded_state_dict, get_reformulation_metadata(sharded_state_dict, checkpoint_dir) + ) + + flexible_shape_sharded_tensors = [ + sh_ten + for sh_ten in nested_values(sharded_state_dict) + if isinstance(sh_ten, ShardedTensor) and not sh_ten.allow_shape_mismatch + ] + + orig_sharded_state_dict = sharded_state_dict + # MCore state dict to PyT Distributed compatible + ( + sharded_state_dict, + flat_mapping, + rename_mapping, + ) = _replace_state_dict_keys_with_sharded_keys(sharded_state_dict) + pyt_state_dict = mcore_to_pyt_state_dict(sharded_state_dict, True) + # Load PyT Distributed format + checkpoint.load_state_dict( + pyt_state_dict, + FileSystemReader(checkpoint_dir), + planner=MCoreLoadPlanner( + shapes_validation_sharded_tensors=flexible_shape_sharded_tensors + ), + ) + pyt_state_dict = cast( + Dict[str, Union[TorchShardedTensor, List[io.BytesIO]]], pyt_state_dict + ) + # Unwrap ShardedTensors and return to original state dict + mcore_state_dict = { + k: v if not isinstance(v, TorchShardedTensor) else _unwrap_pyt_sharded_tensor(v) + for k, v in pyt_state_dict.items() + } + mcore_state_dict = _replace_sharded_keys_with_state_dict_keys( + mcore_state_dict, flat_mapping, rename_mapping + ) + _restore_dict_types(mcore_state_dict, orig_sharded_state_dict) + # Apply N-D tensors resharding postprocessing + mcore_state_dict = restore_nd_flattened_tensors_formulation( + mcore_state_dict, formulation_restore_data + ) + return mcore_state_dict + + def load_tensors_metadata(self, checkpoint_dir: Path, metadata: Metadata = None): + """Uses tensors metadata stored in the metadata file.""" + if metadata is None: + fs_reader = FileSystemReader(checkpoint_dir) + metadata = fs_reader.read_metadata() + + mcore_data = getattr(metadata, 'mcore_data', {}) + sharded_metadata = {} + for k, tp in metadata.state_dict_metadata.items(): + if not isinstance(tp, TensorStorageMetadata): + continue # load only tensors + + nd_orig_global_shape = mcore_data.get(k, {}).get('nd_reformulated_orig_global_shape') + if nd_orig_global_shape is None: + # Regular tensor + sharded_metadata[k] = ShardedTensor.from_rank_offsets( + k, + torch.empty(tp.size, **tp.properties.__dict__, device='meta'), + ).without_data() + else: + # N-D flattened tensor + unflat_ten = torch.empty( + nd_orig_global_shape, **tp.properties.__dict__, device='meta' + ) + flat_ten = unflat_ten.flatten() + sharded_metadata[k] = ShardedTensor.from_rank_offsets_flat( + k, + flat_ten, + unflat_ten.shape, + flattened_range=slice(0, unflat_ten.numel()), # whole slice + ).without_data() + + return sharded_metadata + + def load_sharded_metadata(self, checkpoint_dir: Path) -> ShardedStateDict: + """Uses tensors and objects metadata stored in the metadata file.""" + fs_reader = FileSystemReader(checkpoint_dir) + metadata = fs_reader.read_metadata() + + sharded_metadata = {} + for metadata_key, storage_metadata in metadata.state_dict_metadata.items(): + if not isinstance(storage_metadata, BytesStorageMetadata): + continue + sh_obj = ShardedObject.empty_from_unique_key(metadata_key) + sharded_metadata[sh_obj.unique_key] = sh_obj + + sharded_metadata.update(self.load_tensors_metadata(checkpoint_dir, metadata)) + return sharded_metadata + + def can_handle_sharded_objects(self): + return True + + def check_backend_compatibility(self, loaded_version): + pass # TODO + + def check_version_compatibility(self, loaded_version): + pass # TODO + + +default_strategies[StrategyAction.LOAD_SHARDED.value][ + ('torch_dist', 1) +] = TorchDistLoadShardedStrategy() +default_strategies[StrategyAction.SAVE_SHARDED.value][('torch_dist', 1)] = ( + TorchDistSaveShardedStrategy('torch_dist', 1) +) diff --git a/megatron/core/dist_checkpointing/strategies/two_stage.py b/megatron/core/dist_checkpointing/strategies/two_stage.py new file mode 100644 index 0000000..8d20c32 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/two_stage.py @@ -0,0 +1,257 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" 2-stage checkpoint loading. """ +import os +import time +from collections import defaultdict +from dataclasses import dataclass +from functools import partial, wraps +from itertools import chain +from logging import DEBUG, INFO, StreamHandler, getLogger +from operator import attrgetter, itemgetter +from pathlib import Path +from typing import Iterable, List, NamedTuple, Optional, Tuple, Union + +import torch + +from ..dict_utils import dict_list_map_inplace, map_reduce, nested_values +from ..mapping import ShardedStateDict, ShardedTensor, StateDict +from .base import LoadShardedStrategy +from .tensorstore import TensorStoreLoadShardedStrategy, _load_from_array, open_ts_array +from .zarr import flatten_range, load_zarr_based_sharded_metadata + +_import_trigger = None + + +timers = defaultdict(list) + +logger = getLogger(__name__) + + +def timed(verbose=True): + def timed_dec(fn): + name = fn.__name__ + + @wraps(fn) + def wrapped(*args, **kwargs): + if verbose: + logger.debug(f'{name} init') + start = time.time() + ret = fn(*args, **kwargs) + took = time.time() - start + if verbose: + logger.debug(f'{name} took {took}s') + timers[name].append(took) + return ret + + return wrapped + + return timed_dec + + +@dataclass +class _ShardedTensorMetadata: + global_rank: int + sharded_tensor_no_data: ShardedTensor + dist_group_rank: Tuple[int] # id of distributed group + dist_group_ranks: Tuple[int] # id of distributed group + data_size: Optional[int] = None # bytes + + +def sharded_tensor_chunk_id(sharded_tensor: ShardedTensor): + return ( + sharded_tensor.key, + sharded_tensor.global_offset, + ) + + +class TwoStageDataParallelLoadShardedStrategy(LoadShardedStrategy): + """Loads one checkpoint replica from storage and broadcasts to other nodes. + + This strategy loads checkpoint from storage on minimal set of nodes + and distributes the checkpoint to other nodes with torch.distributed. + Loading is performed with tensorstore. + + Steps: + 0. (optional) create Gloo distributed groups + 1. Exchange ShardedTensors metadata between all nodes + 2. Align needed tensors within DP groups + 3. For each globally unique tensor: + 3.a) on one of the ranks load it from storage to CPU and move to CUDA + 3.b) allocate CUDA tensor on other ranks + 3.c) broadcast within DP group + 3.d) copy tensor content to the model param location + 3.e) free tensor buffers from a) and b) + + Notes: + 1. Loading and broadcasting is done sequentially to avoid both host and device OOMs + 2. There is a lot of overlap potential between all three steps done for each tensor: + 2.a) loading from storage to numpy + 2.b) moving CPU tensors to CUDA + 2.c) broadcast + """ + + def __init__(self, data_parallel_group, cpu_transfer=True): + super().__init__() + + self.cpu_transfer = cpu_transfer + self.data_parallel_group_orig = data_parallel_group + self.data_parallel_group = None if cpu_transfer else data_parallel_group + self.dp_group_ranks = tuple( + sorted(torch.distributed.get_process_group_ranks(data_parallel_group)) + ) + self.dp_group_rank = torch.distributed.get_rank(self.data_parallel_group_orig) + self.global_rank = torch.distributed.get_rank() + + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + self.maybe_init_gloo_group() + all_tensors_sorted = self._build_load_plan(sharded_state_dict) + self._exchange_loaded_tensors(all_tensors_sorted, sharded_state_dict, checkpoint_dir) + # TODO: fix hang in summarize_load_times + # self.summarize_load_times() + return sharded_state_dict + + def summarize_load_times(self): + torch.distributed.barrier() + logger.info('Checkpoint loading finished. Summary:') + # TODO: `timers` keys are not guaranteed to be the same across ranks which causes hangs + for key, times in sorted(timers.items()): + times_sum = sum(times) + max_times = torch.tensor([times_sum], device='cuda') + avg_times = torch.tensor([times_sum], device='cuda') + torch.distributed.all_reduce(max_times, op=torch.distributed.ReduceOp.MAX) + torch.distributed.all_reduce(avg_times, op=torch.distributed.ReduceOp.SUM) + avg_times /= torch.distributed.get_world_size() + if torch.distributed.get_rank() == 0: + logger.info(f'{key}: max {max_times[0]}, avg {avg_times[0]}') + + @timed(verbose=False) + def load_tensor_from_storage(self, checkpoint_dir, ten_meta: _ShardedTensorMetadata): + logger.debug(f'_load_from_array({ten_meta.sharded_tensor_no_data.key}) init') + ret = _load_from_array( + ten_meta.sharded_tensor_no_data, + checkpoint_dir, + load_directly_on_device=False, + apply_flattened_range=False, + ) + logger.debug(f'_load_from_array({ten_meta.sharded_tensor_no_data.key}) DONE') + return ret + + @timed() + def maybe_init_gloo_group(self): + if not self.cpu_transfer: + return + all_groups = [None] * torch.distributed.get_world_size() + torch.distributed.all_gather_object(all_groups, self.dp_group_ranks) + all_groups = set(tuple(sorted(gr)) for gr in all_groups) + for group_ranks in sorted(all_groups): + gloo_pg = torch.distributed.new_group(ranks=group_ranks, backend='gloo') + if self.global_rank in group_ranks: + self.data_parallel_group = gloo_pg + assert self.dp_group_rank == torch.distributed.get_rank(self.data_parallel_group) + + def check_backend_compatibility(self, loaded_version): + pass # TODO + + def check_version_compatibility(self, loaded_version): + pass # TODO + + @timed() + def _build_load_plan( + self, sharded_state_dict: ShardedStateDict + ) -> List[_ShardedTensorMetadata]: + local_meta = [ + _ShardedTensorMetadata( + self.global_rank, + sharded_ten.without_data(), + self.dp_group_rank, + self.dp_group_ranks, + ) + for sharded_ten in nested_values(sharded_state_dict) + ] + all_meta = [None] * torch.distributed.get_world_size(group=self.data_parallel_group) + torch.distributed.all_gather_object(all_meta, local_meta, group=self.data_parallel_group) + all_meta = list(chain.from_iterable(all_meta)) + all_tensors_sorted = self.deduplicate_chunks(all_meta) + return all_tensors_sorted + + @timed() + def deduplicate_chunks(self, ten_metas: List[_ShardedTensorMetadata]): + """ Group tensors by chunk and then pick the tensor with the lowest rank. + + NOTE: with proper loading overlap, loading from randomized ranks + (instead of the smallest one) could be beneficial here. + """ + ten_metas = map_reduce( + ten_metas, + key_fn=lambda meta: sharded_tensor_chunk_id(meta.sharded_tensor_no_data), + reduce_fn=partial(min, key=attrgetter('dist_group_rank')), + ) + all_metas_sorted = list(map(itemgetter(1), sorted(ten_metas.items()))) + return all_metas_sorted + + @timed() + def _exchange_loaded_tensors( + self, ten_metas: List[_ShardedTensorMetadata], sharded_state_dict, checkpoint_dir + ): + logger.debug(f'_exchange_loaded_tensors, num ten_metas: {len(ten_metas)}') + for ten_meta in ten_metas: + + src_rank = torch.distributed.get_global_rank( + self.data_parallel_group, ten_meta.dist_group_rank + ) + + if self.dp_group_rank == ten_meta.dist_group_rank: + exchange_tensor = self.load_tensor_from_storage(checkpoint_dir, ten_meta) + if not self.cpu_transfer: + exchange_tensor = exchange_tensor.cuda() + else: + # TODO: for non-flattened ranges we could reuse the buffer from the start here + exchange_tensor = torch.empty( + ten_meta.sharded_tensor_no_data.local_shape, + device='cpu' if self.cpu_transfer else 'cuda', + dtype=ten_meta.sharded_tensor_no_data.dtype, + ) + + logger.debug( + f'exchange {ten_meta.sharded_tensor_no_data.key}, {exchange_tensor.shape}({exchange_tensor.numel()}), broadcast({src_rank} -> {self.dp_group_ranks})' + ) + torch.distributed.broadcast( + exchange_tensor, group=self.data_parallel_group, src=src_rank + ) + self._distribute_data_to_state_dict(ten_meta, exchange_tensor, sharded_state_dict) + logger.debug(f'exchange {ten_meta.sharded_tensor_no_data.key} done') + + # free buffer memory + exchange_tensor = None + + @timed(verbose=False) + def _distribute_data_to_state_dict( + self, + ten_meta: _ShardedTensorMetadata, + loaded_ten: torch.Tensor, + sharded_state_dict: ShardedStateDict, + ): + tensor_key = sharded_tensor_chunk_id(ten_meta.sharded_tensor_no_data) + + def _fill_in_data(t: Union[ShardedTensor, torch.Tensor]): + if not isinstance(t, ShardedTensor) or sharded_tensor_chunk_id(t) != tensor_key: + # already filled-in or key not matching + return t + sharded_tensor: ShardedTensor = t + x = loaded_ten + if sharded_tensor.flattened_range is not None: + x = flatten_range(sharded_tensor, x) + + # Reuse existing buffer + sharded_tensor.data.data.copy_(x) + return sharded_tensor.data + + dict_list_map_inplace(_fill_in_data, sharded_state_dict) + + def load_tensors_metadata(self, checkpoint_dir: Path): + def get_ts_shape_dtype(path): + arr = open_ts_array(path) + return arr.shape, arr.dtype.numpy_dtype + + return load_zarr_based_sharded_metadata(checkpoint_dir, get_ts_shape_dtype) diff --git a/megatron/core/dist_checkpointing/strategies/zarr.py b/megatron/core/dist_checkpointing/strategies/zarr.py new file mode 100644 index 0000000..ee63c91 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/zarr.py @@ -0,0 +1,307 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Strategies using Zarr as an underlying format. """ +import logging +import os +import threading +from functools import partial +from logging import getLogger +from pathlib import Path +from typing import Callable, List, Optional, Tuple + +import numpy as np +import torch +import zarr + +from ..core import CheckpointingException +from ..dict_utils import dict_list_map_inplace, nested_values +from ..mapping import ShardedStateDict, ShardedTensor, is_main_replica +from .base import LoadShardedStrategy, SaveShardedStrategy, StrategyAction, default_strategies + +logger = logging.getLogger(__name__) + +numpy_to_torch_dtype_dict = { + np.dtype('bool'): torch.bool, + np.dtype('uint8'): torch.uint8, + np.dtype('int8'): torch.int8, + np.dtype('int16'): torch.int16, + np.dtype('int32'): torch.int32, + np.dtype('int64'): torch.int64, + np.dtype('float16'): torch.float16, + np.dtype('float32'): torch.float32, + np.dtype('float64'): torch.float64, + np.dtype('complex64'): torch.complex64, + np.dtype('complex128'): torch.complex128, +} + +torch_to_numpy_dtype_dict = {v: k for k, v in numpy_to_torch_dtype_dict.items()} + + +try: + import tensorstore + + HAS_BFLOAT16 = True + numpy_to_torch_dtype_dict[np.dtype('bfloat16')] = torch.bfloat16 + torch_to_numpy_dtype_dict[torch.bfloat16] = np.dtype('bfloat16') +except ImportError: + HAS_BFLOAT16 = False + +_import_trigger = None + +logger = getLogger(__name__) + + +class ZarrSaveShardedStrategy(SaveShardedStrategy): + def __init__(self, backend: str, version: int): + super().__init__(backend, version) + logger.warning( + f'`zarr` distributed checkpoint backend is deprecated.' + ' Please switch to PyTorch Distributed format (`torch_dist`).' + ) + + def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + sharded_tensors = list(nested_values(sharded_state_dict)) + arrays = _create_or_open_zarr_arrays(sharded_tensors, checkpoint_dir) + for ten, arr in zip(sharded_tensors, arrays): + _save_to_existing_array(ten, arr) + torch.distributed.barrier() + + +def _create_or_open_zarr_arrays( + sharded_tensors: List[ShardedTensor], checkpoint_dir: Path +) -> List[Optional[zarr.Array]]: + """Returns list of zarr arrays corresponding to given tensors. + + For a sharded tensors that: + a) is main replica and represents the first chunk (all offsets 0), creates the Zarr array + b) is main replica but not the first chunk, opens the arrays created in (a) (possibly by other process) + c) otherwise, sets the corresponding array to None since it won't be used + + Args: + sharded_tensors (List[ShardedTensor]): sharded tensors from a given rank that will be saved to checkpoint + checkpoint_dir (Path): checkpoint in which the arrays will be created + """ + arrays = [] + for ten in sharded_tensors: + arr = _create_zarr_array(ten, checkpoint_dir) if _should_create_array(ten) else None + arrays.append(arr) + + torch.distributed.barrier() + # Open arrays created above by other processes + for arr_idx, ten in enumerate(sharded_tensors): + if arrays[arr_idx] is not None: + # array created by this process + assert _should_create_array(ten), ten + continue + if not is_main_replica(ten.replica_id): + # this array won't be needed for saving and can stay None + continue + open_kwargs = {} + if ten.flattened_range is not None: + open_kwargs['synchronizer'] = zarr.ProcessSynchronizer( + str(checkpoint_dir / f'{ten.key}.sync') + ) + arrays[arr_idx] = _open_zarr_array_verbose(checkpoint_dir / ten.key, 'r+', **open_kwargs) + return arrays + + +def _should_create_array(ten: ShardedTensor): + return ( + is_main_replica(ten.replica_id) + and set(ten.global_offset) == {0} + and (ten.flattened_range is None or ten.flattened_range.start == 0) + ) + + +def _save_to_existing_array(sharded_tensor: ShardedTensor, arr: Optional[zarr.Array]): + if not is_main_replica(sharded_tensor.replica_id): + return + assert arr is not None + x = sharded_tensor.data + x = x.detach().cpu() + torch.cuda.synchronize() + if x.dtype == torch.bfloat16: + x = x.float() + x = x.numpy() + x = x.astype('bfloat16') + else: + x = x.numpy() + + if sharded_tensor.flattened_range is None: + arr[sharded_tensor.global_slice()] = x + else: + arr.set_coordinate_selection(sharded_tensor.global_coordinates(), x) + + +def _create_zarr_array(sharded_tensor: ShardedTensor, checkpoint_dir: Path): + np_dtype = torch_to_numpy_dtype_dict[sharded_tensor.dtype] + try: + arr = zarr.create( + sharded_tensor.global_shape, + dtype=np_dtype, + store=checkpoint_dir / sharded_tensor.key, + chunks=sharded_tensor.max_allowed_chunks(), + compressor=None, + fill_value=None, + write_empty_chunks=True, + ) + logger.debug(f'Created a new Zarr array at {checkpoint_dir / sharded_tensor.key}') + except zarr.errors.ContainsArrayError as e: + raise CheckpointingException( + f'Array {checkpoint_dir / sharded_tensor.key} already exists' + ) from e + + if HAS_BFLOAT16 and np_dtype == np.dtype('bfloat16'): + arr._dtype = np_dtype + zarray = arr.store['.zarray'] + arr.store['.zarray'] = zarray.replace(b' exp_sh: + assert ( + False + ), f'Expected shape ({exp_sh}) smaller than actual ({x_sh}) for {repr(expected_sharded_ten)}' + else: + pad_args.extend((0, exp_sh - x_sh)) + # TODO: behavior control with envvar is for testing purposes only, remove it + if not int(os.environ.get('DIST_CKPT_PAD_REPLICATE', 0)): + return torch.nn.functional.pad(x, pad_args) + + # unsqueeze and squeeze to get shapes supported by cudnn + print(f'Replicating last row for {expected_sharded_ten.key}') + if x.dtype == torch.bfloat16: + return ( + torch.nn.functional.pad(x.float().unsqueeze(0), pad_args, mode='replicate') + .squeeze(0) + .bfloat16() + ) + return torch.nn.functional.pad(x.unsqueeze(0), pad_args, mode='replicate').squeeze(0) + + +def load_zarr_based_sharded_metadata( + checkpoint_dir: Path, get_shape_dtype_fn: Callable[[str], Tuple[Tuple[int], np.dtype]] +) -> ShardedStateDict: + """Load metadata of Zarr arrays. + + Args: + checkpoint_dir (str): checkpoint root directory + get_shape_dtype_fn (str -> ((int, ...), np.dtype)): a function returning + an array shape and dtype for a given Zarr array path + """ + sharded_state_dict = {} + for subdir in checkpoint_dir.iterdir(): + if not subdir.is_dir() or not (subdir / '.zarray').exists(): + continue + key = subdir.name + arr_shape, arr_dtype = get_shape_dtype_fn(str(subdir)) + + sharded_state_dict[key] = ShardedTensor( + key, + None, + numpy_to_torch_dtype_dict[arr_dtype], + arr_shape, + arr_shape, + tuple(0 for _ in arr_shape), + tuple(1 for _ in arr_shape), + ) + return sharded_state_dict + + +# default_strategies[StrategyAction.LOAD_SHARDED.value][('zarr', 1)] = ZarrLoadShardedStrategy() +default_strategies[StrategyAction.SAVE_SHARDED.value][('zarr', 1)] = ZarrSaveShardedStrategy( + 'zarr', 1 +) diff --git a/megatron/core/dist_checkpointing/utils.py b/megatron/core/dist_checkpointing/utils.py new file mode 100644 index 0000000..98ce01d --- /dev/null +++ b/megatron/core/dist_checkpointing/utils.py @@ -0,0 +1,158 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Helpers for manipulating sharded tensors and sharded state dicts. """ + +from typing import Dict, Tuple + +from .dict_utils import dict_list_map_inplace, extract_matching_values +from .mapping import ( + LocalNonpersistentObject, + ShardedBase, + ShardedObject, + ShardedStateDict, + ShardedTensor, + ShardedTensorFactory, + StateDict, +) + + +def extract_sharded_tensors( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + """Extract a dict consisting of only ShardedTensor objects from a given state dict with any objects. + + Args: + sharded_state_dict: state dict possibly containing ShardedTensor objects + + Returns: + Tuple[ShardedStateDict, StateDict]: tuple of: + - state dict with all ShardedTensor (keeping the original state dict structure) + - state dict with all objects other than ShardedTensor (keeping the original state dict structure) + """ + return extract_matching_values(sharded_state_dict, lambda v: isinstance(v, ShardedTensor)) + + +def extract_sharded_tensors_and_factories( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + """Extract a dict consisting of only ShardedTensor and ShardedTensorFactory objects from a given state dict with any objects. + + Args: + sharded_state_dict: state dict possibly containing ShardedTensor and ShardedTensorFactory objects + + Returns: + Tuple[ShardedStateDict, StateDict]: tuple of: + - state dict with all ShardedTensor and ShardedTensorFactory (keeping the original state dict structure) + - state dict with all other objects (keeping the original state dict structure) + """ + return extract_matching_values( + sharded_state_dict, lambda v: isinstance(v, (ShardedTensor, ShardedTensorFactory)) + ) + + +def extract_sharded_tensors_or_nonpersistent( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + """Extract a dict consisting of only ShardedTensor, ShardedTensorFactory and LocalNonpersistentObject + objects from a given state dict with any objects. + + Args: + sharded_state_dict: state dict possibly containing ShardedTensor, ShardedTensorFactory and LocalNonpersistentObject objects + + Returns: + Tuple[ShardedStateDict, StateDict]: tuple of: + - state dict with all ShardedTensor, ShardedTensorFactory and LocalNonpersistentObject (keeping the original state dict structure) + - state dict with all other objects (keeping the original state dict structure) + """ + return extract_matching_values( + sharded_state_dict, + lambda v: isinstance(v, (ShardedTensor, LocalNonpersistentObject, ShardedTensorFactory)), + ) + + +def extract_sharded_base( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + return extract_matching_values( + sharded_state_dict, + lambda v: isinstance(v, ShardedBase), + ) + + +def extract_nonpersistent( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + return extract_matching_values( + sharded_state_dict, + lambda v: isinstance(v, LocalNonpersistentObject), + ) + + +def add_prefix_for_sharding(sharded_state_dict: ShardedStateDict, prefix: str): + """Prepend a given prefix to all ShardedBase objects in a given state dict *in-place*. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict + prefix (str): prefix to be prepended + + Returns: + None: state dict is modified in-place + """ + + def add_prefix(t): + if isinstance(t, ShardedBase): + t.key = f'{prefix}{t.key}' + return t + + dict_list_map_inplace(add_prefix, sharded_state_dict) + + +def replace_prefix_for_sharding( + sharded_state_dict: ShardedStateDict, old_prefix: str, new_prefix: str +): + """Replaces the given prefix in *all* sharded keys in a given state dict. + + Errors out if some key does not begin with a given prefix. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to replace keys in + old_prefix (str): prefix to be replaced in each key + new_prefix (str): new prefix + + Returns: + None: state dict is modified in place + """ + + def _replace_prefix(x): + if isinstance(x, (ShardedTensor, ShardedTensorFactory, ShardedObject)): + if not x.key.startswith(old_prefix): + raise ValueError(f'Expected {x.key} to begin with prefix {old_prefix}') + x.key = f'{new_prefix}{x.key[len(old_prefix):]}' # str.removeprefix in Python >= 3.9 + return x + + dict_list_map_inplace(_replace_prefix, sharded_state_dict) + + +def apply_prefix_mapping(sharded_state_dict: ShardedStateDict, prefix_map: Dict[str, str]): + """Replaces prefixes *only in keys matching* with one of prefixes in the map. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to replace keys in + prefix_map (Dict[str, str]): map of old->new prefixes. The first matching prefix for each key is used + + Returns: + None: state dict is modified in place + """ + + def _replace_prefixes(x): + if not isinstance(x, (ShardedTensor, ShardedTensorFactory, ShardedObject)): + return x + for old_prefix, new_prefix in prefix_map.items(): + if x.key.startswith(old_prefix): + x.key = ( + f'{new_prefix}{x.key[len(old_prefix):]}' # str.removeprefix in Python >= 3.9 + ) + break + return x + + dict_list_map_inplace(_replace_prefixes, sharded_state_dict) diff --git a/megatron/core/dist_checkpointing/validation.py b/megatron/core/dist_checkpointing/validation.py new file mode 100644 index 0000000..4d86099 --- /dev/null +++ b/megatron/core/dist_checkpointing/validation.py @@ -0,0 +1,528 @@ +import logging +from collections import Counter, defaultdict +from enum import Enum +from pathlib import Path +from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union + +import numpy as np +import torch + +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.core import CheckpointingException, maybe_load_config +from megatron.core.dist_checkpointing.dict_utils import ( + extract_matching_values, + map_reduce, + nested_values, +) +from megatron.core.dist_checkpointing.mapping import ( + ShardedBase, + ShardedObject, + ShardedStateDict, + is_main_replica, +) +from megatron.core.dist_checkpointing.strategies.base import ( + LoadCommonStrategy, + LoadShardedStrategy, + SaveCommonStrategy, + SaveShardedStrategy, + StrategyAction, + get_default_strategy, +) + +if TYPE_CHECKING: + from megatron.core.dist_checkpointing.serialization import CkptShardedMetadata + +logger = logging.getLogger(__name__) + +# list of local saved/loaded ShardedBase objects +_LocalMetadata = List[Union[ShardedTensor, ShardedObject]] +# list of lists of global saved/loaded ShardedBase objects (each list element corresponds to global rank) +_GlobalMetadata = List[_LocalMetadata] + + +class StrictHandling(Enum): + """Determines handling of load mismatch (non-empty "unexpected" or "missing" keys). + + Different flags carry different implications on performance and behaviour and + are divided into two groups: + - *_UNEXPECTED + - *_ALL + The first group ignores missing keys (present in the checkpoint but missing + in the sharded state dict) which is created in order to avoid inter-rank + metadata exchange. Note that the metadata exchange will happen anyway + with `load(..., validate_access_integrity=True)` flag in which case using the + `*_ALL` option is recommended as it provides a more thorough check with no + performance penalty wrt. `*_UNEXPECTED` group. + + All options except for the first one (`ASSUME_OK_UNEXPECTED`) require + extra disk access before the load in order to remove unexpected keys + from the sharded state dict requested to load. + """ + + # Relies on the underlying strategy to raise error on unexpected keys + ASSUME_OK_UNEXPECTED = 'assume_ok_unexpected' + # Logs (with WARNING level) "unexpected" keys. Missing keys are ignored. + # This is treated as a reasonable default for a "non-strict" load + LOG_UNEXPECTED = 'log_unexpected' + # Logs (with WARNING level) all mismatched keys. + LOG_ALL = 'log_all' + # Raise error on unexpected keys before load attempt. + # Gives cleaner error message than `ASSUME_OK_UNEXPECTED` but requires + # extra disk access. + RAISE_UNEXPECTED = 'raise_unexpected' + # Raise error on any mismatch. Similar to `RAISE_UNEXPECTED` but requires + # metadata exchange. + RAISE_ALL = 'raise_all' + # "Unexpected" mismatches are not reported, but returned by the `load` + # function along with the loaded state dict. Missing keys are ignored. + RETURN_UNEXPECTED = 'return_unexpected' + # All mismatches are returned along with the loaded state dict. + RETURN_ALL = 'return_all' + # Simply ignores mismatches (not recommended) + IGNORE_ALL = 'ignore_all' + + @staticmethod + def requires_explicit_ckpt_mismatch_check(val: 'StrictHandling') -> bool: + """Whether a given strict flag involves mismatch check against the checkpoint.""" + return val != StrictHandling.ASSUME_OK_UNEXPECTED + + @staticmethod + def requires_global_app_metadata(val: 'StrictHandling') -> bool: + """Whether a given strict option requires global metadata for validation.""" + return val in ( + StrictHandling.IGNORE_ALL, + StrictHandling.RAISE_ALL, + StrictHandling.RETURN_ALL, + StrictHandling.LOG_ALL, + ) + + @staticmethod + def requires_returning_mismatch_keys(val: 'StrictHandling') -> bool: + """Whether a given strict option results in extra return value from the `load` function.""" + return val in ( + StrictHandling.RETURN_UNEXPECTED, + StrictHandling.RETURN_ALL, + ) + + +def parse_strict_flag(strict: Union[str, StrictHandling]) -> StrictHandling: + """Parse user passed strict flag from a string to StrictHandling instance. + + Args: + strict (str, StrictHandling): strict flag to parse. If already an instance + of StrictHandling, this function is a noop. + + Returns: + StrictHandling: enum instance + """ + if isinstance(strict, StrictHandling): + return strict + try: + return StrictHandling(strict) + except (ValueError, TypeError) as e: + raise ValueError(f'Invalid strict flag: {e}') from e + + +def validate_integrity_and_strict_load( + sharded_state_dict: ShardedStateDict, + strict: StrictHandling, + validate_access_integrity: bool, + local_metadata: Optional[_LocalMetadata] = None, + global_metadata: Optional[_GlobalMetadata] = None, + ckpt_sharded_metadata: Optional['CkptShardedMetadata'] = None, +) -> Tuple[ShardedStateDict, Set[str], Set[str]]: + """Validates sharding integrity and potential mismatches with the checkpoint. + + `validate_access_integrity` controls sharding integrity check (orthogonal + to strictness checking) which verifies `sharded_state_dict` runtime completeness + (in isolation from the actual checkpoint). + + `strict` flag controls handling of mismatches between the requested + sharded state dict to load and the actual checkpoint. See `StrictHandling` + docs for details regarding flag behavior and performance implications + (disk interactions or inter-rank communication). + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to verify. + strict (StrictHandling): flag determining how to handle sharded keys mismatch. + validate_access_integrity (bool): whether to perform sharding validation. + local_metadata (_LocalMetadata, optional): local sharded state dict metadata. + Defaults to None, in which case it's determined based on `sharded_state_dict`. + global_metadata (_GlobalMetadata, optional): global sharded state dict metadata + (exchanged between ranks). Defaults to None, in which case "missing" + keys are not determined. + ckpt_sharded_metadata (CkptShardedMetadata, optional): sharded metadata + from the checkpoint. Defaults to None, which only makes sense + for the `StrictHandling.ASSUME_OK_UNEXPECTED` strict value. + + Returns: + Tuple[ShardedStateDict, Set[str], Set[str]]: tuple of: sharded state dict + without unexpected keys, missing and unexpected keys. Missing keys are equal + on all ranks, unexpected keys might differ across ranks. Additionally, + missing keys might be erroneously empty (depending on `strict` value). + """ + missing_keys, unexpected_keys = [], [] + if StrictHandling.requires_explicit_ckpt_mismatch_check(strict): + if ckpt_sharded_metadata is None: + raise CheckpointingException( + 'Cannot verify checkpoint mismatch with ckpt_sharded_metadata=None.' + ) + if local_metadata is None: + local_metadata = [ + sh_base.without_data() for sh_base in nested_values(sharded_state_dict) + ] + # We don't want to check for missing keys even if we could + _skip_missing_keys = strict in ( + StrictHandling.ASSUME_OK_UNEXPECTED, + StrictHandling.LOG_UNEXPECTED, + StrictHandling.RAISE_UNEXPECTED, + StrictHandling.RETURN_UNEXPECTED, + ) + missing_keys, unexpected_keys = _determine_missing_and_unexpected_keys( + ckpt_sharded_metadata, local_metadata, None if _skip_missing_keys else global_metadata + ) + + sharded_state_dict = adjust_non_strict_load(sharded_state_dict, unexpected_keys) + + if strict == StrictHandling.IGNORE_ALL: + missing_keys, unexpected_keys = [], [] + elif strict in (StrictHandling.RAISE_UNEXPECTED, StrictHandling.RAISE_ALL): + maybe_report_missing_and_unexpected_keys(missing_keys, unexpected_keys, True) + elif strict in (StrictHandling.LOG_UNEXPECTED, StrictHandling.LOG_ALL): + maybe_report_missing_and_unexpected_keys(missing_keys, unexpected_keys, False) + + if validate_access_integrity: + if global_metadata is None: + raise CheckpointingException( + 'Cannot check sharding intergrity without global_metadata (None).' + ) + validate_sharding_integrity(global_metadata) + + return sharded_state_dict, missing_keys, unexpected_keys + + +def verify_checkpoint_and_load_strategy( + checkpoint_dir: str, + sharded_strategy: Union[LoadShardedStrategy, Tuple[str, int], None] = None, + common_strategy: Union[LoadCommonStrategy, Tuple[str, int], None] = None, +) -> Tuple[LoadShardedStrategy, LoadCommonStrategy]: + """Verifies if checkpoint metadata exists and matches given strategies. + + If no strategies are passed, they are determined based on the checkpoint metadata. + + Args: + checkpoint_dir (str): checkpoint directory + sharded_strategy (LoadShardedStrategy, Tuple[str, int], optional): sharded load strategy to be verified + if compatible with the checkpoint content. If None, the default sharded load strategy + for the checkpoint backend will be returned. + common_strategy (LoadCommonStrategy, Tuple[str, int], optional): common load strategy to be verified + if compatible with the checkpoint content. If None, the default common load strategy + for the checkpoint backend will be returned. + """ + if not Path(checkpoint_dir).exists(): + raise CheckpointingException(f'Checkpoint directory {checkpoint_dir} does not exist') + + saved_config = maybe_load_config(checkpoint_dir) + if saved_config is None: + raise CheckpointingException(f'{checkpoint_dir} is not a distributed checkpoint') + + if sharded_strategy is None: + sharded_strategy = get_default_strategy( + StrategyAction.LOAD_SHARDED, + saved_config.sharded_backend, + saved_config.sharded_backend_version, + ) + elif isinstance(sharded_strategy, tuple): + sharded_strategy = get_default_strategy(StrategyAction.LOAD_SHARDED, *sharded_strategy) + + if common_strategy is None: + common_strategy = get_default_strategy( + StrategyAction.LOAD_COMMON, + saved_config.common_backend, + saved_config.common_backend_version, + ) + elif isinstance(common_strategy, tuple): + sharded_strategy = get_default_strategy(StrategyAction.LOAD_COMMON, *common_strategy) + + sharded_strategy.check_backend_compatibility(saved_config.sharded_backend) + sharded_strategy.check_version_compatibility(saved_config.sharded_backend_version) + common_strategy.check_backend_compatibility(saved_config.common_backend) + common_strategy.check_version_compatibility(saved_config.common_backend_version) + return sharded_strategy, common_strategy + + +def adjust_non_strict_load( + sharded_state_dict: ShardedStateDict, + sharded_keys_to_remove: Set[str], +) -> ShardedStateDict: + """Adjusts sharded state dict removing keys not existing in the checkpoint. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to modify + sharded_keys_to_remove (Set[str]): keys to remove from the state dict + + Returns: + ShardedStateDict: state dict without ShardedBase objects with specified keys + """ + + def is_unexpected_key(x: ShardedBase): + assert isinstance(x, ShardedBase), f'Unexpected type {type(x)}' + return x.key in sharded_keys_to_remove + + _, sharded_state_dict = extract_matching_values(sharded_state_dict, is_unexpected_key) + return sharded_state_dict + + +def _determine_missing_and_unexpected_keys( + ckpt_sharded_metadata: 'CkptShardedMetadata', + local_metadata: _LocalMetadata, + global_metadata: Optional[_GlobalMetadata] = None, +) -> Tuple[Set[str], Set[str]]: + """Determines load mismatches based on metadata. + + There is an asymmetry between "unexpected" and "missing" keys. + Unexpected keys can be determined based only on local metadata. + Missing keys must be based on global metadata, since other ranks might access + different keys than the current rank. + In consequence, the return value of this function is different on each rank: + "missing_keys" are equal, but "unexpected_keys" might differ across ranks. + + Args: + ckpt_sharded_metadata (CkptShardedMetadata): sharded state dict (without data) + constructed based on the checkpoint content + local_metadata (_LocalMetadata): list of local ShardedBase objects + requested to be loaded by this rank + global_metadata (_GlobalMetadata, optional): list of global ShardedBase objects + requested to be loaded by all ranks. Defaults to None, in which case + returned "missing" keys are empty. + + Returns: + Tuple[Set[str], Set[str]]: missing and unexpected keys. Missing keys are equal + on all ranks, unexpected keys might differ across ranks. If passed + `global_metadata` is empty, returned missing keys are empty as well. + + """ + local_accessed_keys = set(sh_base.key for sh_base in local_metadata) + ckpt_keys = set(sh_base.key for sh_base in ckpt_sharded_metadata.values()) + unexpected_keys = local_accessed_keys - ckpt_keys + if global_metadata is not None: + global_accessed_keys = set( + sh_base.key for rank_metadata in global_metadata for sh_base in rank_metadata + ) + missing_keys = ckpt_keys - global_accessed_keys + else: + missing_keys = set() + + if missing_keys: + logger.debug(f'Dist ckpt load missing keys: {missing_keys}') + if unexpected_keys: + logger.debug(f'Dist ckpt load unexpected keys: {unexpected_keys}') + + return missing_keys, unexpected_keys + + +def maybe_report_missing_and_unexpected_keys( + missing_keys: Set[str], unexpected_keys: Set[str], raise_error: bool = True +) -> None: + """Raises or logs an error in case missing or unexpected keys are non-empty. + + Args: + missing_keys (Set[str]): missing keys in the state dict + unexpected_keys (Set[str]): unexpected keys in the state dict + raise_error: If True, raises error on mismatch. Otherwise, logs mismatch + with WARNING level. + + Returns: + None + + Raises: + CheckpointingException: if `raise_error` is True and at least one of + `missing_keys` or `unexpected_keys` are non-empty. + """ + if not missing_keys and not unexpected_keys: + return + missing_title_msg = ( + f'Some keys found in the checkpoint are missing in the provided sharded state dict. ' + ) + missing_body_msg = f'Missing keys (for all ranks): {missing_keys}. ' + unexpected_title_msg = f'Unexpected keys (not found in the checkpoint) encountered in the provided sharded state dict. ' + unexpected_body_msg = f'Unexpected keys (for this rank): {unexpected_keys}. ' + error_msg = '' + if missing_keys: + error_msg += missing_title_msg + if unexpected_keys: + error_msg += unexpected_title_msg + + error_msg += '\n' + if missing_keys: + error_msg += missing_body_msg + if unexpected_keys: + error_msg += unexpected_body_msg + + if raise_error: + raise CheckpointingException(error_msg) + else: + logger.warning(error_msg) + + +def validate_sharding_integrity(global_metadata: _GlobalMetadata) -> None: + """Validate if the ShardedTensors and ShardedObjects from multiple processes define correct sharding. + + Local ShardedTensors and ShardedObject metadata is exchanged with `torch.distributed.all_gather_object` + and then process with global rank 0 checks if main replicas of the shards: + - cover the whole global tensors + - don't overlap + + Args: + global_metadata (_GlobalMetadata): ShardedTensor and ShardedObject objects from all ranks. + + Returns: + None + + Raises: + CheckpointingException for invalid access pattern + """ + if torch.distributed.get_rank() != 0: + return + + key_shardings = defaultdict(list) + for rank, rank_shardings in enumerate(global_metadata): + for sharding in rank_shardings: + key_shardings[sharding.key].append((rank, sharding)) + for key, shardings in key_shardings.items(): + if isinstance(shardings[0][1], ShardedObject): + _validate_objects_for_key(shardings) + else: + _validate_sharding_for_key(shardings) + + +def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]): + some_rank_shard = rank_sharding[0][1] + global_shape = some_rank_shard.global_shape + local_shape = some_rank_shard.local_shape + dtype = some_rank_shard.dtype + has_flattened_range = some_rank_shard.flattened_range is not None + for rank, sharding in rank_sharding: + assert sharding.dtype == dtype, (sharding.dtype, dtype, some_rank_shard) + assert sharding.global_shape == global_shape, ( + sharding.global_shape, + global_shape, + some_rank_shard, + ) + assert sharding.local_shape == local_shape, ( + sharding.local_shape, + local_shape, + some_rank_shard, + ) + assert (sharding.flattened_range is not None) == has_flattened_range, ( + (sharding.flattened_range is not None), + has_flattened_range, + some_rank_shard, + ) + + shard_access_cnt = _compute_shards_access(rank_sharding) + if has_flattened_range: + map_reduce( + rank_sharding, + lambda x: x[1].global_offset, + lambda x: x[1], + _validate_sharding_for_key_flattened, + ) + else: + if not torch.all(shard_access_cnt == 1): + logger.error(f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}') + raise CheckpointingException(f'Invalid access pattern for {rank_sharding[0][1]}') + + +def _compute_shards_access(rank_sharding): + shard_access_cnt = torch.zeros( + rank_sharding[0][1].axis_fragmentations, dtype=torch.int, device='cpu' + ) + for rank, sharding in rank_sharding: + if is_main_replica(sharding.replica_id): + shard_access_cnt[sharding.local_chunk_offset_in_global()] += 1 + return shard_access_cnt + + +def _validate_sharding_for_key_flattened(tensors_by_shard): + all_slices = [] + local_shape = tensors_by_shard[0].local_shape + for sharding in tensors_by_shard: + assert sharding.local_shape == local_shape + sharding: ShardedTensor + if not is_main_replica(sharding.replica_id): + continue + + all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop)) + + starts, stops = map(np.asarray, zip(*sorted(all_slices))) + if ( + starts[0] != 0 + or stops[-1] != np.product(local_shape) + or not np.all(starts[1:] == stops[:-1]) + ): + logger.error( + f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}. Ranges: {(starts, stops)}' + ) + raise CheckpointingException( + f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}. Ranges: {(starts, stops)}' + ) + + +def _validate_objects_for_key(sharded_objects: List[ShardedObject]): + """Ensure uniqueness of saved objects.""" + unique_keys = [ + sh_obj.unique_key for _, sh_obj in sharded_objects if is_main_replica(sh_obj.replica_id) + ] + if len(unique_keys) != len(set(unique_keys)): + duplicates = {k: cnt for k, cnt in Counter(unique_keys).items() if cnt > 1} + logger.error(f'Duplicate ShardedObject keys and counts: {duplicates}') + raise CheckpointingException(f'Duplicate ShardedObject keys: {list(duplicates.keys())}') + expected_shard_num = np.prod(sharded_objects[0][1].global_shape) + if len(unique_keys) != expected_shard_num: + err_msg = f'Invalid access pattern: {expected_shard_num - len(unique_keys)} ShardedObject are missing.' + logger.error(f'{err_msg} Existing shards: {unique_keys}') + raise CheckpointingException(err_msg) + + +def determine_global_metadata( + sharded_state_dict: ShardedStateDict, +) -> Tuple[_LocalMetadata, _GlobalMetadata]: + """Exchanges local metadata with `all_gather_object` to determine global metadata. + + Args: + sharded_state_dict (ShardedStateDict): local sharded state dict + + Returns: + Tuple[_LocalMetadata, _GlobalMetadata]: local and global ShardedBase objects with stripped data + """ + local_metadata = [ten.without_data() for ten in nested_values(sharded_state_dict)] + global_metadata = [None] * torch.distributed.get_world_size() + torch.distributed.all_gather_object(global_metadata, local_metadata) + return local_metadata, global_metadata + + +def validate_sharded_objects_handling( + sharded_strategy: Union[SaveShardedStrategy, LoadShardedStrategy], + common_strategy: Union[SaveCommonStrategy, LoadCommonStrategy], +) -> None: + """Checks if either of the passed strategies can handle sharded objects. + + Args: + sharded_strategy (Union[SaveShardedStrategy, LoadShardedStrategy]): sharded strategy used for saving/loading + common_strategy (Union[SaveCommonStrategy, LoadCommonStrategy]): common strategy used for saving/loading + + Returns: + None + + Raises: + CheckpointingException: if both strategies can't handle ShardedObjects + """ + if ( + not sharded_strategy.can_handle_sharded_objects + and not common_strategy.can_handle_sharded_objects + ): + raise CheckpointingException( + f'Either sharded strategy or common strategy must implement ShardedObjects handling.' + f' Both {sharded_strategy} and {common_strategy} specify can_handle_sharded_objects=False' + ) diff --git a/megatron/core/distributed/__init__.py b/megatron/core/distributed/__init__.py new file mode 100644 index 0000000..b375e37 --- /dev/null +++ b/megatron/core/distributed/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from .distributed_data_parallel import DistributedDataParallel +from .distributed_data_parallel_config import DistributedDataParallelConfig +from .finalize_model_grads import finalize_model_grads +from .param_and_grad_buffer import ParamAndGradBuffer, shard_buffer diff --git a/megatron/core/distributed/distributed_data_parallel.py b/megatron/core/distributed/distributed_data_parallel.py new file mode 100644 index 0000000..7b95b85 --- /dev/null +++ b/megatron/core/distributed/distributed_data_parallel.py @@ -0,0 +1,329 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import logging +from contextlib import contextmanager +from typing import Dict, Optional + +import torch + +from .. import parallel_state +from ..transformer.module import MegatronModule +from ..transformer.transformer_config import TransformerConfig +from ..utils import log_single_rank +from .distributed_data_parallel_config import DistributedDataParallelConfig +from .param_and_grad_buffer import ParamAndGradBuffer + +logger = logging.getLogger(__name__) + + +class DistributedDataParallel(MegatronModule): + """ + DDP wrapper which stores grads in contiguous buffers. Also has option of overlapping + communication with backprop computation by breaking up full model's gradients into smaller + buckets and running all-reduce / reduce-scatter on each bucket asynchronously. This class + also provides the option to do the gradient accumulation in a type other than the param type + (e.g., fp32 for a bf16 model). + + Args: + config: Transformer config object. + ddp_config: DistributedDataParallel config object. + module: Underlying model. + disable_bucketing: If true, force assign all parameters to a single bucket. If false, + use standard bucketing policy: assign parameters to smaller buckets and all-reduce + per bucket _if_ overlap_grad_reduce is True and pp_rank is 0. + + """ + + def __init__( + self, + config: TransformerConfig, + ddp_config: DistributedDataParallelConfig, + module: torch.nn.Module, + disable_bucketing: bool = False, + ): + super().__init__(config=config) + self.module = module + + # If bucket_size is not provided as an input, use sane default. + # If using very large dp_sizes, make buckets larger to ensure that chunks used in NCCL + # ring-reduce implementations are large enough to remain bandwidth-bound rather than + # latency-bound. + if ddp_config.bucket_size is None: + ddp_config.bucket_size = max( + 40000000, 1000000 * parallel_state.get_data_parallel_world_size() + ) + # Set bucket_size to infinity if overlap_grad_reduce is False. + if not ddp_config.overlap_grad_reduce: + ddp_config.bucket_size = None + + self.ddp_config = ddp_config + log_single_rank( + logger, + logging.INFO, + f'Setting up DistributedDataParallel with config {self.ddp_config}', + ) + + # Turn off bucketing if we are on a pipeline stage that is not the first (since + # data-parallel communication on these stages is not on the critical path), or if + # disable_bucketing is True (e.g., we might not want to break up model parameters + # into buckets for model chunks after the first in the interleaved schedule). + self.bucket_size = self.ddp_config.bucket_size + if parallel_state.get_pipeline_model_parallel_rank() > 0: + self.bucket_size = None + if disable_bucketing: + self.bucket_size = None + + self.module = module + self.param_to_buffer = {} + + # Group parameters by their gradient type. + param_to_name = {} + dense_params = [] + expert_parallel_params = [] + for name, param in self.module.named_parameters(): + if not param.requires_grad: + continue + + param.grad_added_to_main_grad = False + param_to_name[param] = name + + if getattr(param, 'allreduce', True): + dense_params.append(param) + else: + expert_parallel_params.append(param) + + def allocate_buffers_for_parameters( + input_params, + data_parallel_group, + gradient_scaling_factor, + ): + param_and_grad_dtype_to_params = {} + + # Group parameters by their gradient type. + for param in input_params: + if not param.requires_grad: + continue + + param_dtype = param.dtype + grad_dtype = torch.float if self.ddp_config.grad_reduce_in_fp32 else param.dtype + + params = param_and_grad_dtype_to_params.get((param_dtype, grad_dtype), []) + params.append(param) + param_and_grad_dtype_to_params[(param_dtype, grad_dtype)] = params + + if not config.calculate_per_token_loss: + target_gradient_scaling_factor = 1.0 / parallel_state.get_data_parallel_world_size() + if self.ddp_config.average_in_collective: + # Collective is averaging gradients in collective with data_parallel_group. + assert ( + gradient_scaling_factor + / torch.distributed.get_world_size(group=data_parallel_group) + == target_gradient_scaling_factor + ) + else: + assert gradient_scaling_factor == target_gradient_scaling_factor + + # Allocate the grad buffers and map the grads. + buffers = [] + for (param_dtype, grad_dtype), params in param_and_grad_dtype_to_params.items(): + buffers.append( + ParamAndGradBuffer( + self.ddp_config, + param_dtype, + grad_dtype, + params, + data_parallel_group, + self.bucket_size, + param_to_name, + gradient_scaling_factor, + ) + ) + for param in params: + self.param_to_buffer[param] = buffers[-1] + + return buffers + + if config.calculate_per_token_loss: + gradient_scaling_factor = 1.0 + expert_gradient_scaling_factor = 1.0 + else: + if self.ddp_config.average_in_collective: + gradient_scaling_factor = 1.0 + expert_gradient_scaling_factor = ( + 1.0 / parallel_state.get_expert_model_parallel_world_size() + ) + else: + data_parallel_world_size = parallel_state.get_data_parallel_world_size() + gradient_scaling_factor = 1.0 / data_parallel_world_size + expert_gradient_scaling_factor = 1.0 / data_parallel_world_size + + # Allocate the param+grad buffers for dense params' grads. + self.buffers = allocate_buffers_for_parameters( + dense_params, + parallel_state.get_data_parallel_group(with_context_parallel=True), + gradient_scaling_factor=gradient_scaling_factor, + ) + + # Allocate separate param+grad buffers for expert parallel params' grads. + self.expert_parallel_buffers = allocate_buffers_for_parameters( + expert_parallel_params, + parallel_state.get_data_modulo_expert_parallel_group(with_context_parallel=True), + gradient_scaling_factor=expert_gradient_scaling_factor, + ) + + # Delete references to weight_tensor if they exist since we don't want two parameter copies + # if we re-mapped parameters (which happens when we use the distributed optimizer). + # This is a temporary workaround around a TE bug that is fixed with + # https://github.com/NVIDIA/TransformerEngine/pull/719. + if self.ddp_config.use_distributed_optimizer: + + @torch.no_grad() + def unmap_weight_tensor(m): + if hasattr(m, 'weight_tensor'): + m.weight_tensor = None + + self.module.apply(unmap_weight_tensor) + + # Register backward hook. + # Accumulation function for the gradients need to be stored so they + # don't go out of scope. + self.grad_accs = [] + for param in self.module.parameters(): + if param.requires_grad: + # Expand so we get access to grad_fn. + param_tmp = param.expand_as(param) + # Get the gradient accumulator function. + grad_acc = param_tmp.grad_fn.next_functions[0][0] + grad_acc.register_hook(self._make_param_hook(param, self.param_to_buffer)) + self.grad_accs.append(grad_acc) + + def forward(self, *inputs, **kwargs): + """ + Calls the wrapped module's forward() method. + """ + return self.module(*inputs, **kwargs) + + def _make_param_hook( + self, + param: torch.nn.Parameter, + param_to_buffer: Dict[torch.nn.Parameter, ParamAndGradBuffer], + ): + """ + Creates the all-reduce / reduce-scatter hook for backprop. + """ + + def param_hook(*unused): + if param.requires_grad: + if self.ddp_config.overlap_grad_reduce: + assert ( + param.grad is not None + ), 'param.grad being None is not safe when overlap_grad_reduce is True' + if param.grad is not None and ( + not param.grad_added_to_main_grad or getattr(param, 'zero_out_wgrad', False) + ): + param.main_grad.add_(param.grad.data) + param.grad = None + + if self.ddp_config.overlap_grad_reduce: + param_to_buffer[param].register_grad_ready(param) + + return param_hook + + @contextmanager + def no_sync(self): + """ + Context manager that turns off gradient synchronization. + """ + for buffer in self.buffers + self.expert_parallel_buffers: + buffer.is_last_microbatch = False + try: + yield + finally: + for buffer in self.buffers + self.expert_parallel_buffers: + buffer.is_last_microbatch = True + + def start_grad_sync(self, *unused): + """ + Initiates grad sync (all-reduce or reduce-scatter) communication operations + for all model gradients. + + When overlap_grad_reduce is set to True, dispatches asynchronous communication + calls. When overlap_grad_reduce is set to False, calls synchronous + communication ops. + """ + for buffer in self.buffers + self.expert_parallel_buffers: + buffer.start_grad_sync() + + def scale_gradients(self, scaling_factor: float) -> None: + """Scale all gradients inside the buffers by `scaling_factor`.""" + for buffer in self.buffers + self.expert_parallel_buffers: + buffer.scale_gradients(scaling_factor) + + def finish_grad_sync(self): + """ + Finishes grad sync (all-reduce or reduce-scatter) communication operations + for all model gradients. + + When overlap_grad_reduce is set to True, waits for asynchronous communication + calls to complete. When overlap_grad_reduce is set to False, calls synchronous + communication ops. + """ + for buffer in self.buffers + self.expert_parallel_buffers: + buffer.finish_grad_sync() + + def zero_grad_buffer(self): + """ + Zeros out all grad buffers. Needs to be called at the beginning of each + training iteration. + """ + for param in self.module.parameters(): + if param.requires_grad: + param.grad_added_to_main_grad = False + for buffer in self.buffers + self.expert_parallel_buffers: + buffer.reset() + + def broadcast_params(self): + """ + Syncs parameters across all DP ranks. + """ + for param in self.module.parameters(): + is_expert_parallel = not getattr(param, 'allreduce', True) + + if is_expert_parallel: + data_parallel_group = parallel_state.get_data_modulo_expert_parallel_group( + with_context_parallel=True + ) + else: + data_parallel_group = parallel_state.get_data_parallel_group( + with_context_parallel=True + ) + torch.distributed.broadcast( + param.data, + src=torch.distributed.get_global_rank(data_parallel_group, 0), + group=data_parallel_group, + ) + + def state_dict(self, prefix='', keep_vars=False): + """ + Returns a dictionary containing references to the whole state of the + wrapped module. + + Both parameters and persistent buffers (e.g. running averages) are included. + Keys are corresponding parameter and buffer names. Parameters and buffers + set to None are not included. + """ + return self.module.state_dict(prefix=prefix, keep_vars=keep_vars) + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """ + Returns wrapped module's state_dict for checkpoint saving. + """ + return self.module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars) + + def load_state_dict(self, state_dict, strict=True): + """ + Copies parameters and buffers from state_dict into the wrapped module and its + descendants. If strict is True, then the keys of state_dict must exactly match + the keys returned by this module’s state_dict() function. + """ + self.module.load_state_dict(state_dict, strict=strict) diff --git a/megatron/core/distributed/distributed_data_parallel_config.py b/megatron/core/distributed/distributed_data_parallel_config.py new file mode 100644 index 0000000..c1396e0 --- /dev/null +++ b/megatron/core/distributed/distributed_data_parallel_config.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class DistributedDataParallelConfig: + """Configuration for DistributedDataParallel.""" + + grad_reduce_in_fp32: bool = False + """If true, reduce grads in fp32.""" + + overlap_grad_reduce: bool = False + """If true, overlap grad all-reduce / reduce-scatter with backward compute.""" + + use_distributed_optimizer: bool = False + """If true, issue reduce-scatter collectives to aggregate gradients and clean up + originally allocated model parameters, otherwise issue all-reduce collectives. + """ + + check_for_nan_in_grad: bool = False + """ If true, check for NaNs in gradients _before_ communication collective.""" + + bucket_size: Optional[int] = None + """Maximum number of parameters in each bucket. If unspecified, MCore uses a default + value of max(40000000, 1000000 * dp_size) parameters (larger DP sizes need larger + buckets to ensure collectives do not become latency-bound).""" + + average_in_collective: bool = False + """If true, compute average in collective directly, as opposed to dividing by the + dp_size first and then computing sum in the collective.""" diff --git a/megatron/core/distributed/finalize_model_grads.py b/megatron/core/distributed/finalize_model_grads.py new file mode 100644 index 0000000..502f15a --- /dev/null +++ b/megatron/core/distributed/finalize_model_grads.py @@ -0,0 +1,151 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from typing import List, Optional + +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + +from .. import parallel_state +from ..transformer.transformer_config import TransformerConfig +from ..utils import get_attr_wrapped_model, get_model_config + + +def _allreduce_word_embedding_grads(model: List[torch.nn.Module], config: TransformerConfig): + """ + All-reduce word embedding grads. + + Reduce grads across first and last stages to ensure that word_embeddings parameters stay in + sync. This should only run for models that support pipelined model parallelism (BERT and GPT). + """ + + if ( + parallel_state.is_rank_in_embedding_group(ignore_virtual=True) + and parallel_state.get_pipeline_model_parallel_world_size() > 1 + ): + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + model_module = model[0] + elif parallel_state.is_pipeline_last_stage(ignore_virtual=True): + model_module = model[-1] + else: # We do not support the interleaved schedule for T5 yet. + model_module = model[0] + + # Look for module with 'pre_process' attribute to get around the fact that DDP and + # other wrapper classes inherit from non-core MegatronModule that has + # 'share_embeddings_and_output_weights' and 'shared_embedding_or_output_weight' + # attributes already, causing get_attr_wrapped_model() to not unwrap anything here. + # TODO: Clean this up once the wrapper classes inherit from core MegatronModule. + model_module = get_attr_wrapped_model(model_module, 'pre_process', return_model_obj=True) + if model_module.share_embeddings_and_output_weights: + weight = model_module.shared_embedding_or_output_weight() + grad = weight.main_grad + torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) + + +def _allreduce_position_embedding_grads(model: List[torch.nn.Module], config: TransformerConfig): + """ + All-reduce position_embeddings grad across first (encoder) and split (decoder) stages to + ensure that position embeddings parameters stay in sync. This should only run for T5 models + with pipeline parallelism. + """ + if ( + parallel_state.is_rank_in_position_embedding_group() + and parallel_state.get_pipeline_model_parallel_world_size() > 1 + and config.pipeline_model_parallel_split_rank is not None + ): + model_module = model[0] + grad = get_attr_wrapped_model( + model_module, 'language_model.embedding.position_embeddings.weight.main_grad' + ) + torch.distributed.all_reduce(grad, group=parallel_state.get_position_embedding_group()) + + +def _allreduce_embedding_grads(model: List[torch.nn.Module], config: TransformerConfig): + """ + All-reduce both word and position embeddings. + """ + _allreduce_word_embedding_grads(model, config) + _allreduce_position_embedding_grads(model, config) + + +def _allreduce_layernorm_grads(model: List[torch.nn.Module], config: TransformerConfig): + """ + All-reduce layernorm grads (for sequence parallelism). + """ + + # All-reduce layernorm parameters across model parallel nodes + # when sequence parallelism is used + if parallel_state.get_tensor_model_parallel_world_size() > 1 and ( + config.sequence_parallel or config.qk_layernorm + ): + grads = [] + for model_chunk in model: + for name, param in get_attr_wrapped_model(model_chunk, 'named_parameters')(): + if ( + param.requires_grad + and getattr(param, 'sequence_parallel', False) + or 'q_layernorm' in name + or 'k_layernorm' in name + ): + grad = param.main_grad + grads.append(grad.data) + if grads: + coalesced = _flatten_dense_tensors(grads) + torch.distributed.all_reduce( + coalesced, group=parallel_state.get_tensor_model_parallel_group() + ) + for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + + +def finalize_model_grads(model: List[torch.nn.Module], num_tokens: Optional[torch.Tensor] = None): + """ + All-reduce all model grads across DP replicas, layernorm grads for sequence parallelism, + embedding grads across first and last pipeline stages (if not tied), + scale gradients by `num_tokens`. + """ + + config = get_model_config(model[0]) + + # All-reduce / reduce-scatter across DP replicas. + if config.timers is not None: + config.timers('all-grads-sync', log_level=1).start(barrier=config.barrier_with_L1_time) + for model_chunk in model: + model_chunk.finish_grad_sync() + if config.timers is not None: + config.timers('all-grads-sync').stop() + + # All-reduce layer-norm grads (for sequence parallelism). + if config.timers is not None: + config.timers('layernorm-grads-all-reduce', log_level=1).start( + barrier=config.barrier_with_L1_time + ) + _allreduce_layernorm_grads(model, config) + if config.timers is not None: + config.timers('layernorm-grads-all-reduce').stop() + + # All-reduce embedding grads (for pipeline parallelism). + if config.timers is not None: + config.timers('embedding-grads-all-reduce', log_level=1).start( + barrier=config.barrier_with_L1_time + ) + _allreduce_embedding_grads(model, config) + if config.timers is not None: + config.timers('embedding-grads-all-reduce').stop() + + # normalize gradients for per-token loss normalization. + # if we are using by the number of tokens, then we use that as a divisor. this number + # will be the total number of non-padded tokens in the global batch. + if num_tokens is not None: + # the number of tokens is only present on the last stage, so broadcast it + # to the other ranks in the pipeline parallel group. + torch.distributed.broadcast( + num_tokens, + src=parallel_state.get_pipeline_model_parallel_last_rank(), + group=parallel_state.get_pipeline_model_parallel_group(), + ) + # all-reduce across DP ranks. + torch.distributed.all_reduce(num_tokens, group=parallel_state.get_data_parallel_group()) + for model_chunk in model: + if num_tokens > 0: + scaling = 1.0 / num_tokens + model_chunk.scale_gradients(scaling) diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py new file mode 100644 index 0000000..efed47c --- /dev/null +++ b/megatron/core/distributed/param_and_grad_buffer.py @@ -0,0 +1,549 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import logging +import math +import os +from enum import Enum +from typing import Dict, List, Optional + +import torch + +from ..utils import log_on_each_pipeline_stage +from .distributed_data_parallel_config import DistributedDataParallelConfig + +logger = logging.getLogger(__name__) + + +class BufferType(Enum): + PARAM = 1 + GRAD = 2 + + +def shard_buffer(buffer: torch.Tensor, data_parallel_world_size: int): + """ + Shard buffer into data_parallel_world_size chunks of equal size. + """ + assert buffer.numel() % data_parallel_world_size == 0 + shard_size = buffer.numel() // data_parallel_world_size + sharded_buffer = [ + buffer[(r * shard_size) : ((r + 1) * shard_size)] for r in range(data_parallel_world_size) + ] + return sharded_buffer + + +class Bucket: + """ + Bucket to keep track of a subset of the model's gradients. Provides functionality to register + when params in the bucket have grads ready to be synced; an asynchronous communication call + is automatically launched when _all_ params in the bucket have grads ready. + + Args: + ddp_config: DistributedDataParallel config object. + params: List of parameters whose gradients are collated in this bucket. + param_data: View in larger ParamAndGradBuffer.param_data that this bucket is responsible for. + grad_data: View in larger ParamAndGradBuffer.grad_data that this bucket is responsible for. + offset: Offset of this bucket's view in the larger ParamAndGradBuffer. + numel_unpadded: Number of unpadded elements in bucket. + data_parallel_group: Data-parallel process group. + data_parallel_world_size: World size using the data-parallel group group. + gradient_scaling_factor: This factor is utilized to scale gradients prior to their + communication. Its application is twofold: it facilitates the averaging of gradients + and the scaling of gradients in the context of the Mixture of Experts (MoE) model. + """ + + def __init__( + self, + ddp_config: DistributedDataParallelConfig, + params: List[torch.nn.Parameter], + param_data: Optional[torch.Tensor], + grad_data: torch.Tensor, + offset: int, + numel_unpadded: int, + data_parallel_group: torch.distributed.ProcessGroup, + data_parallel_world_size: int, + gradient_scaling_factor: float, + ): + self.ddp_config = ddp_config + + # State for bookkeeping: params is the set of parameters this bucket is + # responsible for, params_with_grad is the set of parameters with grads + # available. When overlap_grad_reduce is True, communication (all-reduce + # or reduce-scatter) is issued when params_with_grad equals params. + self.params_list = params + self.params = set(params) + self.params_with_grad = set() + self.param_data = param_data + self.grad_data = grad_data + # The distributed optimizer needs to keep track of this bucket's offset + # within the full grad_buffer. + self.offset = offset + self.numel_unpadded = numel_unpadded + self.data_parallel_group = data_parallel_group + self.data_parallel_world_size = data_parallel_world_size + self.data_parallel_rank = torch.distributed.get_rank(group=data_parallel_group) + self.gradient_scaling_factor = gradient_scaling_factor + + self.reset() + + def reset(self): + """ + Reset metadata in bucket in preparation for the next iteration of training. + """ + self.params_with_grad = set() + self.communication_handle = None + self.is_communication_outstanding = False + + def start_grad_sync(self): + """ + Initiates grad sync (all-reduce or reduce-scatter) communication operation + for this bucket. + + When overlap_grad_reduce is set to True, dispatches an asynchronous + communication call. When overlap_grad_reduce is set to False, makes + synchronous call. + """ + assert ( + self.communication_handle is None and not self.is_communication_outstanding + ), 'Should not have multiple communication calls outstanding at once' + + # Make sure norm of grads in bucket are not NaN + # prior to data-parallel all-reduce / reduce-scatter. + if self.ddp_config.check_for_nan_in_grad: + global_rank = torch.distributed.get_rank() + norm = self.grad_data.norm(p=2) + assert not norm.isnan(), ( + f'Rank {global_rank}: found NaN in local grad norm in ' + f'backward pass before data-parallel communication collective. ' + f'Device: {torch.cuda.current_device()}, node: {os.uname()[1]}' + ) + + # gradient_scaling_factor already takes into account whether we are computing + # an average or sum in the data-parallel collective. + if self.gradient_scaling_factor != 1.0: + self.grad_data *= self.gradient_scaling_factor + + # Decide reduce_op. + reduce_op = torch.distributed.ReduceOp.SUM + if self.ddp_config.average_in_collective: + reduce_op = torch.distributed.ReduceOp.AVG + + # Use async_op only when overlap_grad_reduce is True. + if self.ddp_config.use_distributed_optimizer: + local_data_view = shard_buffer(self.grad_data, self.data_parallel_world_size)[ + self.data_parallel_rank + ] + self.communication_handle = torch.distributed._reduce_scatter_base( + local_data_view, + self.grad_data, + op=reduce_op, + group=self.data_parallel_group, + async_op=self.ddp_config.overlap_grad_reduce, + ) + else: + self.communication_handle = torch.distributed.all_reduce( + self.grad_data, + op=reduce_op, + group=self.data_parallel_group, + async_op=self.ddp_config.overlap_grad_reduce, + ) + if self.ddp_config.overlap_grad_reduce: + self.is_communication_outstanding = True + else: + self.is_communication_outstanding = False + + def finish_grad_sync(self): + """ + Finishes grad sync (all-reduce or reduce-scatter) communication operation + for this bucket. + + When overlap_grad_reduce is set to True, waits for asynchronous communication + call to complete. When overlap_grad_reduce is set to False, makes synchronous call. + """ + # If overlap_grad_reduce is False, start (and finish) synchronous communication call here. + if not self.ddp_config.overlap_grad_reduce: + self.start_grad_sync() + return + assert self.communication_handle is not None and self.is_communication_outstanding, ( + f'Communication call has not been issued for this bucket ' + f'({len(self.params_with_grad)}/{len(self.params)} params have grad available)' + ) + self.communication_handle.wait() + + def register_grad_ready(self, param: torch.nn.Parameter): + """ + Registers grads for the passed-in param to be "ready" for grad sync. + + When the number of microbatches is greater than 1, we only want to register + grads as ready when processing the last microbatch and overlap_grad_reduce is True. + """ + assert param in self.params, 'Param is not in the bucket' + assert param not in self.params_with_grad, 'Cannot set grad twice' + assert ( + self.ddp_config.overlap_grad_reduce + ), 'register_grad_ready() should be called only when overlapping grad reduce' + self.params_with_grad.add(param) + # If all params in bucket have grads available, issue communication call. + if len(self.params_with_grad) == len(self.params): + self.start_grad_sync() + + +class ParamAndGradBuffer: + """ + Groups parameters and gradients into a contiguous buffer, and then breaks the buffer into + buckets with roughly `bucket_size` parameters each. + + Args: + ddp_config: DistributedDataParallel config object. + param_dtype: Type of param tensor. + grad_dtype: Type of grad tensor. + params: List of parameters whose parameters and gradients are collated in the underlying + tensor. + data_parallel_group: Data-parallel process group. + bucket_size: The rough size of each bucket in terms of number of parameters. + param_to_name: Mapping from `torch.nn.Parameter` to name (for logging purposes). + gradient_scaling_factor: This factor is utilized to scale gradients prior to their + communication. Its application is twofold: it facilitates the averaging of gradients + and the scaling of gradients in the context of the Mixture of Experts (MoE) model. + """ + + def __init__( + self, + ddp_config: DistributedDataParallelConfig, + param_dtype: torch.dtype, + grad_dtype: torch.dtype, + params: List[torch.nn.Parameter], + data_parallel_group: torch.distributed.ProcessGroup, + bucket_size: int, + param_to_name: Dict[torch.nn.Parameter, str], + gradient_scaling_factor: float, + ): + self.ddp_config = ddp_config + + # Check that params are unique. + unique_params = set() + for param in params: + assert param not in unique_params + unique_params.add(param) + del unique_params + + # Store attributes that will be needed later. + self.param_dtype = param_dtype + self.grad_dtype = grad_dtype + self.data_parallel_group = data_parallel_group + self.data_parallel_world_size = torch.distributed.get_world_size( + group=self.data_parallel_group + ) + self.gradient_scaling_factor = gradient_scaling_factor + self.is_last_microbatch = True + + # Data structures to store underlying buckets and relevant indexing data. + self.buckets = [] + self.param_to_bucket = {} # Param -> bucket mapping. + self.param_index_map = {} # Param -> location in buffer mapping (used in dist. optimizer). + + def _pad(number_to_be_padded: int, divisor: int) -> int: + return int(math.ceil(number_to_be_padded / divisor) * divisor) + + def _pad_end_of_bucket_if_needed(bucket_end_index: int) -> int: + """ + Pads end index of bucket if using distributed optimizer (to ensure uniform sharding). + """ + if self.ddp_config.use_distributed_optimizer: + # Workaround for TE bug causing cuBLAS to pick an incompatible algorithm. + # This also helps cuBLAS pick more efficient algorithms for GEMMs. + # We now ensure that all buckets start at a memory address that is 256-byte + # aligned (128 values since params and grads use >= 16-bit precision). + return _pad(bucket_end_index, math.lcm(self.data_parallel_world_size, 128)) + return bucket_end_index + + def _pad_start_of_param_if_needed(param_start_index: int) -> int: + """ + Pads start index of param if using distributed optimizer (to ensure "good" alignment). + """ + if self.ddp_config.use_distributed_optimizer: + # Ensure that params start at 128-byte aligned addresses (64 values + # since params are >= 16-bit precision). + return _pad(param_start_index, 64) + return param_start_index + + # First, figure out how many elements should be in the underlying buffer storage. + # Note that if we need to split the buffer into smaller buckets, each of these + # might need to be padded as well (if using the distributed optimizer). + data_start_index = 0 + bucket_data_start_index = data_start_index + bucket_params = set() + self.bucket_indices = [] + per_bucket_numel_unpadded = [] + bucket_id = 0 + + def _create_new_bucket(data_end_index: int) -> int: + """ + Create the bucket_id'th bucket with collected bucket_params, starting at + bucket_data_start_index. + """ + nonlocal bucket_data_start_index, bucket_params, bucket_id + per_bucket_numel_unpadded.append(data_end_index - bucket_data_start_index) + data_end_index = _pad_end_of_bucket_if_needed(data_end_index) + # Update bucket metadata. + self.bucket_indices.append((bucket_data_start_index, data_end_index)) + bucket_data_start_index = data_end_index + # Re-set bucket_params and increment bucket_id for next bucket. + bucket_params = set() + bucket_id += 1 + # Return the potentially padded data_end_index. + return data_end_index + + for param in params[::-1]: + # Iterate through parameters in reverse order to roughly follow backprop order, + # and skip parameters that don't require gradients. + if not param.requires_grad: + continue + this_numel = param.data.nelement() + data_start_index = _pad_start_of_param_if_needed(data_start_index) + data_end_index = data_start_index + this_numel + + def _does_param_require_new_bucket(param): + """ + Split shared embedding parameters into separate bucket if using distributed + optimizer that makes use of reduce-scatters instead of all-reduces. + This ensures that the first and last pipeline stage partition optimizer state + for the shared embedding parameters the same way across DP replicas, allowing + the DP reduce-scatter to be before the embedding all-reduce. + """ + return ( + getattr(param, "shared_embedding", False) + and self.ddp_config.use_distributed_optimizer + ) + + # Create bucket with already collected parameters if current param needs its own bucket. + if _does_param_require_new_bucket(param) and len(bucket_params) > 0: + # We are creating a bucket for the already accumulated parameters, whose params + # end at the current data_start_index. + if self.ddp_config.use_distributed_optimizer: + # data_start_index should already be padded. + assert data_start_index % self.data_parallel_world_size == 0 + _create_new_bucket(data_start_index) + + self.param_index_map[param] = ( + data_start_index, + data_end_index, + bucket_id, + ) + bucket_params.add(param) + + # If we have enough elements already or the current param is part of the shared embedding + # layer and needs a separate bucket, form a new bucket. + if ( + bucket_size is not None + and (data_end_index - bucket_data_start_index) >= bucket_size + ) or _does_param_require_new_bucket(param): + data_end_index = _create_new_bucket(data_end_index) + data_start_index = data_end_index + + # Add remaining params to a new bucket. + if len(bucket_params) > 0: + data_end_index = _create_new_bucket(data_end_index) + + # Next, create underlying storage for buffer (with numel elements that includes + # padding as necessary). + self.numel = data_end_index + self.numel_unpadded = sum(per_bucket_numel_unpadded) + assert self.numel_unpadded <= self.numel + if self.ddp_config.use_distributed_optimizer: + assert self.numel % self.data_parallel_world_size == 0 + else: + assert self.numel == self.numel_unpadded + + self.param_data = None + # Only re-map param tensors if using distributed optimizer. + if self.ddp_config.use_distributed_optimizer: + self.param_data = torch.zeros( + self.numel, + dtype=self.param_dtype, + device=torch.cuda.current_device(), + requires_grad=False, + ) + self.grad_data = torch.zeros( + self.numel, + dtype=self.grad_dtype, + device=torch.cuda.current_device(), + requires_grad=False, + ) + + # Finally, map param.data and param.main_grad fields to buffers. + bucket_params = set() + bucket_data_start_index = 0 + cur_bucket_id = 0 + for param in params[::-1]: + if not param.requires_grad: + continue + data_start_index, data_end_index, bucket_id = self.param_index_map[param] + + # Assign param.data to appropriate segment of self.param_data. + if self.param_data is not None: + old_param_data = param.data + param.data = self._get( + param.data.shape, data_start_index, buffer_type=BufferType.PARAM + ) + assert old_param_data._base is None + # Copy tensor values (from initialization or checkpoint). + param.data.detach().copy_(old_param_data) + del old_param_data + + param.main_grad = self._get( + param.data.shape, data_start_index, buffer_type=BufferType.GRAD + ) + if bucket_id != cur_bucket_id: + bucket_data_end_index = _pad_end_of_bucket_if_needed(data_start_index) + self._set_bucket( + bucket_params=bucket_params, + start_index=bucket_data_start_index, + end_index=bucket_data_end_index, + numel_unpadded=per_bucket_numel_unpadded[cur_bucket_id], + bucket_id=cur_bucket_id, + ) + bucket_data_start_index = bucket_data_end_index + bucket_params = set() + assert cur_bucket_id + 1 == len(self.buckets) + assert bucket_id == cur_bucket_id + 1 + cur_bucket_id = bucket_id + bucket_params.add(param) + + # Add remaining params to a new bucket. + if len(bucket_params) > 0: + bucket_data_end_index = _pad_end_of_bucket_if_needed(data_end_index) + self._set_bucket( + bucket_params=bucket_params, + start_index=bucket_data_start_index, + end_index=bucket_data_end_index, + numel_unpadded=per_bucket_numel_unpadded[cur_bucket_id], + bucket_id=cur_bucket_id, + ) + + # Log buckets for all PP stages. + log_strs = [] + log_strs.append( + f'Number of buckets for gradient all-reduce / reduce-scatter: {len(self.buckets)}' + ) + for index, bucket in enumerate(self.buckets): + numel = 0 + for param in bucket.params: + numel += param.data.nelement() + log_strs.append(f'Params for bucket {index+1} ({numel} elements):') + for param in bucket.params: + log_strs.append(f'\t{param_to_name[param]}') + log_on_each_pipeline_stage(logger, logging.INFO, '\n'.join(log_strs)) + + def scale_gradients(self, scaling_factor: float) -> None: + """Scale the gradient data by `scaling_factor`.""" + self.grad_data *= scaling_factor + + def _get(self, shape: torch.Size, start_index: int, buffer_type: BufferType) -> torch.Tensor: + """ + Return a tensor with the input `shape` as a view into the 1-D data starting at + `start_index`. + """ + end_index = start_index + shape.numel() + assert end_index <= self.numel, 'Requested tensor is out of buffer range' + if buffer_type == BufferType.PARAM: + assert self.param_data is not None + buffer_tensor = self.param_data[start_index:end_index] + elif buffer_type == BufferType.GRAD: + buffer_tensor = self.grad_data[start_index:end_index] + else: + raise Exception("Illegal buffer type provided to GradBuffer._get() function") + buffer_tensor = buffer_tensor.view(shape) + return buffer_tensor + + def _set_bucket( + self, + bucket_params: List[torch.nn.Parameter], + start_index: int, + end_index: int, + numel_unpadded: int, + bucket_id: int, + ): + """ + Helper function to create new bucket, add it to list of buckets, and + also update param->bucket mapping. + """ + + # Assert that indices are correctly padded (if needed), and that bucket + # position is same as originally computed. + if self.ddp_config.use_distributed_optimizer: + assert start_index % self.data_parallel_world_size == 0 + assert end_index % self.data_parallel_world_size == 0 + assert (start_index, end_index) == self.bucket_indices[bucket_id] + + # Get appropriate view into global ParamAndGradBuffer. + bucketed_param_data = None + if self.param_data is not None: + bucketed_param_data = self._get( + torch.Size([end_index - start_index]), start_index, buffer_type=BufferType.PARAM + ) + bucketed_grad_data = self._get( + torch.Size([end_index - start_index]), start_index, buffer_type=BufferType.GRAD + ) + bucket = Bucket( + ddp_config=self.ddp_config, + params=bucket_params, + param_data=bucketed_param_data, + grad_data=bucketed_grad_data, + offset=start_index, + numel_unpadded=numel_unpadded, + data_parallel_group=self.data_parallel_group, + data_parallel_world_size=self.data_parallel_world_size, + gradient_scaling_factor=self.gradient_scaling_factor, + ) + self.buckets.append(bucket) + for bucket_param in bucket_params: + assert bucket_param not in self.param_to_bucket + self.param_to_bucket[bucket_param] = bucket + + def reset(self): + """ + Zero out the underlying grad_buffer and reset all buckets in preparation for the next + iteration of training. + """ + self.grad_data.zero_() + for bucket in self.buckets: + bucket.reset() + self.is_last_microbatch = True + + def start_grad_sync(self): + """ + Initiates grad sync (all-reduce or reduce-scatter) communication operations + for all buckets in the grad buffer. + + When overlap_grad_reduce is set to True, dispatches asynchronous communication + calls. When overlap_grad_reduce is set to False, calls synchronous + communication ops. + """ + for bucket in self.buckets: + bucket.start_grad_sync() + + def finish_grad_sync(self): + """ + Finishes grad sync (all-reduce or reduce-scatter) communication operations + for all buckets in the grad buffer. + + When overlap_grad_reduce is set to True, waits for asynchronous communication + calls to complete. When overlap_grad_reduce is set to False, calls synchronous + communication ops. + """ + for bucket in self.buckets: + bucket.finish_grad_sync() + + def register_grad_ready(self, param: torch.nn.Parameter): + """ + Registers grads for the passed-in param to be "ready" for grad sync. + + When the number of microbatches is greater than 1, we only want to register + grads as ready when processing the last microbatch and overlap_grad_reduce is True. + """ + assert ( + self.ddp_config.overlap_grad_reduce + ), 'register_grad_ready() should only be called when overlap_grad_reduce is True' + if self.is_last_microbatch: + bucket = self.param_to_bucket[param] + bucket.register_grad_ready(param) diff --git a/megatron/core/enums.py b/megatron/core/enums.py new file mode 100644 index 0000000..46e7d3b --- /dev/null +++ b/megatron/core/enums.py @@ -0,0 +1,10 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import enum + + +class ModelType(enum.Enum): + encoder_or_decoder = 1 + encoder_and_decoder = 2 + retro_encoder = 3 + retro_decoder = 4 diff --git a/megatron/core/fusions/__init__.py b/megatron/core/fusions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/fusions/fused_bias_dropout.py b/megatron/core/fusions/fused_bias_dropout.py new file mode 100644 index 0000000..08af02b --- /dev/null +++ b/megatron/core/fusions/fused_bias_dropout.py @@ -0,0 +1,73 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from typing import Optional, Tuple + +import torch + +from megatron.core.jit import jit_fuser + + +def _bias_dropout_add_func(x_with_bias, residual, prob, training): + # type: (Tuple[Tensor, Optional[Tensor]], Tensor, float, bool) -> Tensor + # NOTE: Previously, the argument `bias` used to be passed as + # `bias.expand_as(residual)` when the `bias_dropout_func` is called from the + # transformer layer but broadcasting should automatically take care of that. + # Also, looking at broadcasting semantics, `expand_as` and broadcasting + # seem to be identical performance-wise (both just change the view). + + x, bias = x_with_bias # unpack + + # If we want to train mixed precision, then the output of this function + # should be half precision. However, in AMP O1, the input (residual) is + # in fp32, and it will up-cast the result to fp32, causing pipeline parallel + # GPU communication to hang. Therefore, we need to cast residual to the same + # dtype as x. + residual = residual if residual.dtype == x.dtype else residual.to(x.dtype) + + # The Dropout operation, Residual Addition and the tensor returning can be + # done generically outside the if statement, but that stops fusing of Bias + # Addition-Dropout-Residual Addition operation. So doing it together inside + # the conditional branch to improve performance + if bias is not None: + x = x + bias + out = torch.nn.functional.dropout(x, p=prob, training=training) + out = residual + out + return out + else: + out = torch.nn.functional.dropout(x, p=prob, training=training) + out = residual + out + return out + + +def bias_dropout_add_unfused(training): + def _bias_dropout_add(x_with_bias, residual, prob): + return _bias_dropout_add_func(x_with_bias, residual, prob, training) + + return _bias_dropout_add + + +@jit_fuser +def bias_dropout_add_fused_train( + x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float, +) -> torch.Tensor: + return _bias_dropout_add_func(x_with_bias, residual, prob, True) + + +@jit_fuser +def bias_dropout_add_fused_inference( + x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float, +) -> torch.Tensor: + return _bias_dropout_add_func(x_with_bias, residual, prob, False) + + +def get_bias_dropout_add(training, fused): + if fused: + # jit scripting for a nn.module (with dropout) is not + # triggering the fusion kernel. For now, we use two + # different nn.functional routines to account for varying + # dropout semantics during training and inference phases. + if training: + return bias_dropout_add_fused_train + else: + return bias_dropout_add_fused_inference + else: + return bias_dropout_add_unfused(training) diff --git a/megatron/core/fusions/fused_bias_geglu.py b/megatron/core/fusions/fused_bias_geglu.py new file mode 100644 index 0000000..70ef348 --- /dev/null +++ b/megatron/core/fusions/fused_bias_geglu.py @@ -0,0 +1,85 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch + +from megatron.core.jit import jit_fuser + +###### BIAS GELU FUSION/ NO AUTOGRAD ################ +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) + + +@jit_fuser +def geglu(y): + y_1, y_2 = torch.chunk(y, 2, -1) + return (y_1 * 0.5 * (1.0 + torch.tanh(0.79788456 * y_1 * (1 + 0.044715 * y_1 * y_1)))) * y_2 + + +@jit_fuser +def bias_geglu(bias, y): + y = y + bias + return geglu(y) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@jit_fuser +def geglu_back(g, y): + y_1, y_2 = torch.chunk(y, 2, -1) + tanh_out = torch.tanh(0.79788456 * y_1 * (1 + 0.044715 * y_1 * y_1)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * y_1 * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * y_1 * y_1)) + 0.5 * ( + 1 + tanh_out + ) + return torch.cat(((g * y_2) * ff, g * (y_1 * 0.5 * (1.0 + tanh_out))), -1) + + +@jit_fuser +def bias_geglu_back(g, y, bias): + y = y + bias + return geglu_back(g, y) + + +class BiasGeGLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_geglu(input, bias) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_geglu_back(grad_output, input, bias) + return tmp, tmp + + +class GeGLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input): + ctx.save_for_backward(input) + return geglu(input) + + @staticmethod + def backward(ctx, grad_output): + input = ctx.saved_tensors + tmp = geglu_back(grad_output, input[0]) + return tmp + + +def bias_geglu_impl(input, bias): + ori_shape = input.shape + assert len(ori_shape) in [2, 3] + input = input.view(-1, ori_shape[-1]) + if bias is not None: + output = BiasGeGLUFunction.apply(input, bias) + else: + output = GeGLUFunction.apply(input) + + return output if len(ori_shape) == 2 else output.view(ori_shape[0], ori_shape[1], -1) diff --git a/megatron/core/fusions/fused_bias_gelu.py b/megatron/core/fusions/fused_bias_gelu.py new file mode 100644 index 0000000..2b54674 --- /dev/null +++ b/megatron/core/fusions/fused_bias_gelu.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch + +from megatron.core.jit import jit_fuser + +###### BIAS GELU FUSION/ NO AUTOGRAD ################ +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) + + +@jit_fuser +def bias_gelu(bias, y): + x = bias + y + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@jit_fuser +def bias_gelu_back(g, bias, y): + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( + 1 + tanh_out + ) + return ff * g + + +class GeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(bias, input) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_back(grad_output, bias, input) + return tmp, tmp + + +bias_gelu_impl = GeLUFunction.apply diff --git a/megatron/core/fusions/fused_bias_swiglu.py b/megatron/core/fusions/fused_bias_swiglu.py new file mode 100644 index 0000000..fd3ac3e --- /dev/null +++ b/megatron/core/fusions/fused_bias_swiglu.py @@ -0,0 +1,89 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import torch +import torch.nn.functional as F + +from megatron.core.jit import jit_fuser + +###### BIAS SWIGLU FUSION/ NO AUTOGRAD ################ + + +@jit_fuser +def swiglu(y): + y_1, y_2 = torch.chunk(y, 2, -1) + return F.silu(y_1) * y_2 + + +@jit_fuser +def bias_swiglu(y, bias): + y = y + bias + return swiglu(y) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@jit_fuser +def swiglu_back(g, y): + y_1, y_2 = torch.chunk(y, 2, -1) + return torch.cat( + (g * torch.sigmoid(y_1) * (1 + y_1 * (1 - torch.sigmoid(y_1))) * y_2, g * F.silu(y_1)), -1 + ) + + +@jit_fuser +def bias_swiglu_back(g, y, bias): + y = y + bias + return swiglu_back(g, y) + + +class BiasSwiGLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias, fp8_input_store): + input_for_backward = input.to(torch.float8_e4m3fn) if fp8_input_store else input + ctx.save_for_backward(input_for_backward, bias) + ctx.ori_input_dtype = input.dtype + ctx.fp8_input_store = fp8_input_store + return bias_swiglu(input, bias) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + input = input.to(ctx.ori_input_dtype) if ctx.fp8_input_store else input + tmp = bias_swiglu_back(grad_output, input, bias) + return tmp, tmp, None + + +class SwiGLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, fp8_input_store): + input_for_backward = input.to(torch.float8_e4m3fn) if fp8_input_store else input + ctx.save_for_backward(input_for_backward) + ctx.ori_input_dtype = input.dtype + ctx.fp8_input_store = fp8_input_store + return swiglu(input) + + @staticmethod + def backward(ctx, grad_output): + input = ctx.saved_tensors[0] + input = input.to(ctx.ori_input_dtype) if ctx.fp8_input_store else input + tmp = swiglu_back(grad_output, input) + return tmp, None + + +def bias_swiglu_impl(input, bias, fp8_input_store=False): + ori_shape = input.shape + assert len(ori_shape) in [2, 3] + input = input.view(-1, ori_shape[-1]) + if bias is not None: + output = BiasSwiGLUFunction.apply(input, bias, fp8_input_store) + else: + output = SwiGLUFunction.apply(input, fp8_input_store) + + return output if len(ori_shape) == 2 else output.view(ori_shape[0], ori_shape[1], -1) + + +# bias_swiglu_impl = BiasSwiGLUFunction.apply +# swiglu_impl = SwiGLUFunction.apply diff --git a/megatron/core/fusions/fused_cross_entropy.py b/megatron/core/fusions/fused_cross_entropy.py new file mode 100644 index 0000000..e10c04c --- /dev/null +++ b/megatron/core/fusions/fused_cross_entropy.py @@ -0,0 +1,153 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from typing import Tuple + +import torch + +from megatron.core.jit import jit_fuser +from megatron.core.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from megatron.core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy +from megatron.core.tensor_parallel.utils import VocabUtility + + +@jit_fuser +def calculate_logits_max(vocab_parallel_logits: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + + vocab_parallel_logits, logits_max = VocabParallelCrossEntropy.calculate_logits_max( + vocab_parallel_logits + ) + + return vocab_parallel_logits, logits_max + + +@jit_fuser +def calculate_predicted_logits( + vocab_parallel_logits: torch.Tensor, + target: torch.Tensor, + logits_max: torch.Tensor, + vocab_start_index: int, + vocab_end_index: int, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + ( + target_mask, + masked_target_1d, + predicted_logits, + sum_exp_logits, + exp_logits, + ) = VocabParallelCrossEntropy.calculate_predicted_logits( + vocab_parallel_logits, target, logits_max, vocab_start_index, vocab_end_index + ) + + predicted_logits_sum_exp_logits = torch.cat((predicted_logits, sum_exp_logits)) + + return target_mask, masked_target_1d, predicted_logits_sum_exp_logits, exp_logits + + +@jit_fuser +def calculate_cross_entropy_loss( + exp_logits: torch.Tensor, predicted_logits_sum_exp_logits: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + + split_val = predicted_logits_sum_exp_logits.size()[0] // 2 + predicted_logits, sum_exp_logits = torch.split(predicted_logits_sum_exp_logits, split_val) + + exp_logits, loss = VocabParallelCrossEntropy.calculate_cross_entropy_loss( + exp_logits, predicted_logits, sum_exp_logits + ) + + return exp_logits, loss + + +@jit_fuser +def calculate_gradients( + softmax: torch.Tensor, + grad_output: torch.Tensor, + target_mask: torch.Tensor, + masked_target_1d: torch.Tensor, +) -> torch.Tensor: + + ( + grad_2d, + arange_1d, + softmax_update, + grad_input, + ) = VocabParallelCrossEntropy.prepare_gradient_calculation_operands(softmax, target_mask) + + grad_input = VocabParallelCrossEntropy.calculate_gradients( + grad_2d, arange_1d, masked_target_1d, softmax_update, grad_input, grad_output + ) + + grad_input = grad_input.to(torch.bfloat16) + + return grad_input + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + @staticmethod + def forward(ctx, vocab_parallel_logits, target): + + vocab_parallel_logits, logits_max = calculate_logits_max(vocab_parallel_logits) + torch.distributed.all_reduce( + logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group() + ) + + # Get the partition's vocab indices + get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size + partition_vocab_size = vocab_parallel_logits.size()[-1] + rank = get_tensor_model_parallel_rank() + world_size = get_tensor_model_parallel_world_size() + vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size) + + ( + target_mask, + masked_target_1d, + predicted_logits_sum_exp_logits, + exp_logits, + ) = calculate_predicted_logits( + vocab_parallel_logits, target, logits_max, vocab_start_index, vocab_end_index + ) + + # All reduce is needed to get the chunks from other GPUs. + # In the fused case, tensors are batches to invoke a single + # AllReduce call + torch.distributed.all_reduce( + predicted_logits_sum_exp_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_tensor_model_parallel_group(), + ) + + exp_logits, loss = calculate_cross_entropy_loss(exp_logits, predicted_logits_sum_exp_logits) + + # Store softmax, target-mask and masked-target for backward pass. + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) + + return loss + + @staticmethod + def backward(ctx, grad_output): + + # Retreive tensors from the forward path. + softmax, target_mask, masked_target_1d = ctx.saved_tensors + + grad_input = calculate_gradients(softmax, grad_output, target_mask, masked_target_1d) + + return grad_input, None + + +def fused_vocab_parallel_cross_entropy(vocab_parallel_logits, target): + """ + Performs cross entropy loss when logits are split across tensor parallel ranks + + Args: + vocab_parallel_logits: logits split across tensor parallel ranks + dimension is [sequence_length, batch_size, hidden_size] + + target: correct vocab ids of dimseion [sequence_length, micro_batch_size] + + """ + return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) diff --git a/megatron/core/fusions/fused_layer_norm.py b/megatron/core/fusions/fused_layer_norm.py new file mode 100644 index 0000000..a2241b3 --- /dev/null +++ b/megatron/core/fusions/fused_layer_norm.py @@ -0,0 +1,169 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import importlib +import inspect +import numbers + +import torch +from torch import Tensor +from torch.nn import init +from torch.nn.parameter import Parameter + +from megatron.core.transformer import TransformerConfig +from megatron.core.utils import make_viewless_tensor + +try: + from apex.contrib.layer_norm.layer_norm import FastLayerNormFN + + HAVE_PERSIST_LAYER_NORM = True +except: + HAVE_PERSIST_LAYER_NORM = False + +try: + from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction + + HAVE_FUSED_LAYER_NORM = True +except: + HAVE_FUSED_LAYER_NORM = False + + +class FusedLayerNorm(torch.nn.Module): + """Layer Norm, fused into a single CUDA kernel. + + Args: + hidden_size (int): Transformer hidden dimension. + + eps (float): Epsilon added to denominator, for numerical stability. + + persist_layer_norm (bool): Use persistent fused layer norm kernel. + This kernel supports only a set of hidden sizes. Please + check persist_ln_hidden_sizes if your hidden size is supported. + + zero_centered_gamma (bool): Adjust LayerNorm weights such that they are + centered around zero. This improves numerical stability. + + config (TransformerConfig): Transformer config. Include to match custom + layer norm interfaces. + + normalization (str): Normalization type, used for Transformer Engine. + Must equal 'LayerNorm' here. + """ + + def __init__( + self, + config: TransformerConfig, + hidden_size: int, + eps: float = 1e-5, + persist_layer_norm: bool = True, + zero_centered_gamma: bool = False, + normalization: str = "LayerNorm", # included to match TE interface + ): + super().__init__() + + self.config = config + + self.zero_centered_gamma = self.config.layernorm_zero_centered_gamma + assert ( + self.config.normalization == "LayerNorm" + ), f'({self.config.normalization}) is not supported in FusedLayerNorm' + + # List of hiddens sizes supported in the persistent layer norm kernel + # If the hidden size is not supported, fall back to the non-persistent + # kernel. + persist_ln_hidden_sizes = [ + 1024, + 1536, + 2048, + 2304, + 3072, + 3840, + 4096, + 5120, + 6144, + 8192, + 10240, + 12288, + 12800, + 15360, + 16384, + 18432, + 20480, + 24576, + 25600, + 30720, + 32768, + 40960, + 49152, + 65536, + ] + persist_layer_norm = self.config.persist_layer_norm + if hidden_size not in persist_ln_hidden_sizes or not HAVE_PERSIST_LAYER_NORM: + persist_layer_norm = False + + if not persist_layer_norm and not HAVE_FUSED_LAYER_NORM: + # TODO: Add pytorch only layer norm + raise ValueError(f'Apex must be installed to use FusedLayerNorm.') + + if isinstance(hidden_size, numbers.Integral): + hidden_size = (hidden_size,) + self.hidden_size = torch.Size(hidden_size) + self.eps = eps + # Parameters need to be initialized with torch.empty rather than torch.Tensor for correct device placement with nemo2. + self.weight = Parameter(torch.empty(*hidden_size)) + self.bias = Parameter(torch.empty(*hidden_size)) + self.reset_parameters() + self.persist_layer_norm = persist_layer_norm + self.sequence_parallel = self.config.sequence_parallel + + # set sequence parallelism flag on weight and bias parameters + setattr(self.weight, 'sequence_parallel', self.sequence_parallel) + setattr(self.bias, 'sequence_parallel', self.sequence_parallel) + + def reset_parameters(self): + + if self.zero_centered_gamma: + init.zeros_(self.weight) + init.zeros_(self.bias) + else: + init.ones_(self.weight) + init.zeros_(self.bias) + + def forward(self, input: Tensor) -> Tensor: + + weight = self.weight + 1 if self.zero_centered_gamma else self.weight + + if self.persist_layer_norm: + if 'memory_efficient' in inspect.getfullargspec(FastLayerNormFN.forward).args: + output = FastLayerNormFN.apply( + input, weight, self.bias, self.eps, self.config.memory_efficient_layer_norm + ) + else: + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) + + # Apex's fast layer norm function outputs a 'view' tensor (i.e., has + # a populated '_base' field). This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. + output = make_viewless_tensor( + inp=output, requires_grad=input.requires_grad, keep_graph=True + ) + + else: + if ( + 'memory_efficient' + in inspect.getfullargspec(FusedLayerNormAffineFunction.forward).args + ): + return FusedLayerNormAffineFunction.apply( + input, + weight, + self.bias, + self.hidden_size, + self.eps, + self.config.memory_efficient_layer_norm, + ) + else: + return FusedLayerNormAffineFunction.apply( + input, weight, self.bias, self.hidden_size, self.eps + ) + + return output diff --git a/megatron/core/fusions/fused_softmax.py b/megatron/core/fusions/fused_softmax.py new file mode 100644 index 0000000..c7bfbb7 --- /dev/null +++ b/megatron/core/fusions/fused_softmax.py @@ -0,0 +1,220 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from typing import Optional + +import torch +import torch.nn as nn + +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.utils import get_default_causal_mask + + +class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply upper triangular mask (typically used in gpt models). + 3. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + import scaled_upper_triang_masked_softmax_cuda + + scale_t = torch.tensor([scale]) + softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(inputs, scale_t[0]) + + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_upper_triang_masked_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + input_grads = scaled_upper_triang_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + + return input_grads, None + + +class ScaledMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply the mask. + 3. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, mask, scale): + import scaled_masked_softmax_cuda + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_masked_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_masked_softmax_cuda.backward(output_grads, softmax_results, scale_t[0]) + return input_grads, None, None + + +class ScaledSoftmax(torch.autograd.Function): + """ + Fused operation which performs following two operations in sequence + 1. Scale the tensor. + 2. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + import scaled_softmax_cuda + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_softmax_cuda.forward(inputs, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_softmax_cuda.backward(output_grads, softmax_results, scale_t[0]) + return input_grads, None, None + + +class FusedScaleMaskSoftmax(nn.Module): + """ + fused operation: scaling + mask + softmax + + Args: + input_in_fp16: flag to indicate if input in fp16 data format. + input_in_bf16: flag to indicate if input in bf16 data format. + attn_mask_type: attention mask type (pad or causal) + scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion + mask_func: mask function to be applied. + softmax_in_fp32: if true, softmax in performed at fp32 precision. + scale: scaling factor used in input tensor scaling. + """ + + def __init__( + self, + input_in_fp16, + input_in_bf16, + attn_mask_type, + scaled_masked_softmax_fusion, + mask_func, + softmax_in_fp32, + scale, + ): + super(FusedScaleMaskSoftmax, self).__init__() + self.input_in_fp16 = input_in_fp16 + self.input_in_bf16 = input_in_bf16 + assert not ( + self.input_in_fp16 and self.input_in_bf16 + ), "both fp16 and bf16 flags cannot be active at the same time." + self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16 + self.attn_mask_type = attn_mask_type + self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion + self.mask_func = mask_func + self.softmax_in_fp32 = softmax_in_fp32 + self.scale = scale + + assert self.scale is None or softmax_in_fp32, "softmax should be in fp32 when scaled" + + def forward(self, input: torch.Tensor, mask: Optional[torch.Tensor]): + """Forward pass of softmax with masked input. + + In case attn_mask_type is causal the mask is generated and None can be passed. + A user-defined mask is only needed when attn_mask_type is not causal. + """ + # [b, np, sq, sk] + assert input.dim() == 4 + + if self.is_kernel_available(mask, *input.size()): + return self.forward_fused_softmax(input, mask) + else: + return self.forward_torch_softmax(input, mask) + + def is_kernel_available(self, mask, b, np, sq, sk): + attn_batches = b * np + + if ( + self.scaled_masked_softmax_fusion # user want to fuse + and self.input_in_float16 # input must be fp16 + and 16 < sk <= 4096 # sk must be 16 ~ 2048 + and sq % 4 == 0 # sq must be divisor of 4 + and sk % 4 == 0 # sk must be divisor of 4 + and attn_batches % 4 == 0 # np * b must be divisor of 4 + ): + if 0 <= sk <= 4096: + batch_per_block = self.get_batch_per_block(sq, sk, b, np) + + if self.attn_mask_type == AttnMaskType.causal: + if attn_batches % batch_per_block == 0: + return True + else: + if sq % batch_per_block == 0: + return True + return False + + def forward_fused_softmax(self, input, mask): + b, np, sq, sk = input.size() + scale = self.scale if self.scale is not None else 1.0 + + if self.attn_mask_type == AttnMaskType.causal: + assert sq == sk, "causal mask is only for self attention" + + # input is 3D tensor (attn_batches, sq, sk) + input = input.view(-1, sq, sk) + probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale) + return probs.view(b, np, sq, sk) + else: + # input is 4D tensor (b, np, sq, sk) + if mask is not None: + return ScaledMaskedSoftmax.apply(input, mask, scale) + else: + return ScaledSoftmax.apply(input, scale) + + def forward_torch_softmax(self, input, mask): + if self.input_in_float16 and self.softmax_in_fp32: + input = input.float() + + if self.scale is not None: + input = input * self.scale + + # Generate causal mask if not given + sq, sk = input.size(2), input.size(3) + if self.attn_mask_type == AttnMaskType.causal and mask is None and sq > 1: + # If sq == 1 then either KV cache is used or one-element context is passed + # so keeping mask=None in this case; subsequent code should handle it + assert sq == sk, "causal mask is only for self attention" + mask = get_default_causal_mask(sq) + + mask_output = self.mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + + if self.input_in_float16 and self.softmax_in_fp32: + if self.input_in_fp16: + probs = probs.half() + else: + probs = probs.bfloat16() + + return probs + + @staticmethod + def get_batch_per_block(sq, sk, b, np): + import scaled_masked_softmax_cuda + + return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np) diff --git a/megatron/core/inference/__init__.py b/megatron/core/inference/__init__.py new file mode 100644 index 0000000..f801100 --- /dev/null +++ b/megatron/core/inference/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/inference/ammo_support/__init__.py b/megatron/core/inference/ammo_support/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/inference/ammo_support/gpt/__init__.py b/megatron/core/inference/ammo_support/gpt/__init__.py new file mode 100644 index 0000000..f801100 --- /dev/null +++ b/megatron/core/inference/ammo_support/gpt/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/inference/ammo_support/gpt/model_specs.py b/megatron/core/inference/ammo_support/gpt/model_specs.py new file mode 100644 index 0000000..e3d8e08 --- /dev/null +++ b/megatron/core/inference/ammo_support/gpt/model_specs.py @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.custom_layers.transformer_engine import TEDotProductAttention, TENorm +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + + +# Use this spec for ModelOpt PTQ and TensorRT-LLM export +def get_gpt_layer_modelopt_spec( + remap_te_layernorm: bool = False, qk_layernorm: bool = False +) -> ModuleSpec: + """Mix the native spec with TENorm. + + This is essentially the native local spec except for the layernorm implementation + is using TENorm from Transformer-Engine. The issue is that FusedLayerNorm from apex + has stopped supporting RMSNorm needed by llama. + """ + sharded_state_dict_keys_map = {} + if remap_te_layernorm: + sharded_state_dict_keys_map = { + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + } + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=TENorm if qk_layernorm else IdentityOp, + k_layernorm=TENorm if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=ColumnParallelLinear, + linear_fc2=RowParallelLinear, + ), + ), + mlp_bda=get_bias_dropout_add, + # Map TE-layernorm-fusion keys back + sharded_state_dict_keys_map=sharded_state_dict_keys_map, + ), + ) diff --git a/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py b/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py new file mode 100644 index 0000000..f81c4f5 --- /dev/null +++ b/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py @@ -0,0 +1,145 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from logging import getLogger + +import torch + +logger = getLogger(__name__) + + +def mcore_gpt_load_legacy_state_dict_pre_hook( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, +): + """Register a pre-hook to fix the state_dict key difference. + + This prehook is used when trying to load the legacy Megatron-LM GPTModel into its + megatron/core variant that uses native ParallelLinear and Transformer-Engine Norm. + Only this particular spec supports post-training quantization and TensorRT-LLM + config export through `nvidia-modelopt` package. + + Args: + state_dict: state dictionary + prefix: module name prefix + local_metadata: local metatdata + strict: whether is in strict mode + missing_keys: missing state dict keys + unexpected_keys: unexpected state dict keys + error_msgs: error messages + """ + if "modelopt_state" in state_dict: + state_dict.pop("modelopt_state") + + if "language_model" in state_dict: + language_model_state_dict = state_dict.pop("language_model") + if "embedding" in language_model_state_dict: + if "word_embeddings" in language_model_state_dict["embedding"]: + for key, param in language_model_state_dict["embedding"]["word_embeddings"].items(): + state_dict.update({"embedding.word_embeddings." + key: param}) + if "position_embeddings" in language_model_state_dict["embedding"]: + for key, param in language_model_state_dict["embedding"][ + "position_embeddings" + ].items(): + state_dict.update({"embedding.position_embeddings." + key: param}) + if "transformer" in language_model_state_dict: + for key, param in language_model_state_dict["transformer"].items(): + state_dict.update({"decoder." + key: param}) + else: + for key, param in language_model_state_dict["encoder"].items(): + state_dict.update({"decoder." + key: param}) + if "output_layer" in language_model_state_dict: + for key, param in language_model_state_dict["output_layer"].items(): + state_dict.update({"output_layer." + key: param}) + + if torch.distributed.get_rank() == 0: + logger.info("ModelOptGPTModel {}".format(state_dict.keys())) + + module_name_rewrite_list = [ + ("input_norm", "input_layernorm"), + (".attention.query_key_value", ".self_attention.linear_qkv"), + (".attention.dense", ".self_attention.linear_proj"), + ("self_attention.query_key_value", "self_attention.linear_qkv"), + ("self_attention.dense", "self_attention.linear_proj"), + ("post_attention_layernorm", "pre_mlp_layernorm"), + ("post_attention_norm", "pre_mlp_layernorm"), + ("dense_h_to_4h", "linear_fc1"), + ("dense_4h_to_h", "linear_fc2"), + ("final_norm", "final_layernorm"), + ] + + key_rewrite_list = [] + + for key, _ in state_dict.items(): + for old_name, new_name in module_name_rewrite_list: + if old_name in key: + key_rewrite_list += [(key, key.replace(old_name, new_name))] + + for old_key, new_key in key_rewrite_list: + if torch.distributed.get_rank() == 0: + logger.info("replace {} with {}".format(old_key, new_key)) + state_dict[new_key] = state_dict[old_key] + state_dict.pop(old_key) + + +def mcore_gpt_load_te_state_dict_pre_hook( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, +): + """Register a pre-hook to fix the state_dict key difference of. + + This prehook is used when trying to load the megatron/core GPTModel that uses a + fused Transformer-Engine ParallelLinear into the variant that uses native ParallelLinear + and Transformer-Engine Norm (effectively to restore the fusion). + Only this particular spec supports post-training quantization and TensorRT-LLM + config export through `nvidia-modelopt` package. + + Args: + state_dict: state dictionary + prefix: module name prefix + local_metadata: local metatdata + strict: whether is in strict mode + missing_keys: missing state dict keys + unexpected_keys: unexpected state dict keys + error_msgs: error messages + """ + if "modelopt_state" in state_dict: + state_dict.pop("modelopt_state") + + key_with_te_extra_state_to_pop = [] + + for key, _ in state_dict.items(): + if "_extra_state" in key: + key_with_te_extra_state_to_pop += [key] + + for key in key_with_te_extra_state_to_pop: + state_dict.pop(key) + + module_name_rewrite_list = [ + ("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"), + ("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"), + ("mlp.linear_fc1.layer_norm_weight", "pre_mlp_layernorm.weight"), + ("mlp.linear_fc1.layer_norm_bias", "pre_mlp_layernorm.bias"), + ] + + key_rewrite_list = [] + + for key, _ in state_dict.items(): + for old_name, new_name in module_name_rewrite_list: + if old_name in key: + key_rewrite_list += [(key, key.replace(old_name, new_name))] + + for old_key, new_key in key_rewrite_list: + if torch.distributed.get_rank() == 0: + logger.info("replace {} with {}".format(old_key, new_key)) + state_dict[new_key] = state_dict[old_key] + state_dict.pop(old_key) diff --git a/megatron/core/inference/common_inference_params.py b/megatron/core/inference/common_inference_params.py new file mode 100644 index 0000000..1311afd --- /dev/null +++ b/megatron/core/inference/common_inference_params.py @@ -0,0 +1,28 @@ +from dataclasses import dataclass + + +@dataclass +class CommonInferenceParams: + """Inference parameters sent along with the prompts + + For an explanation of these parameters refer to this blog https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910 + """ + + temperature: float = 1.0 + top_k: int = 0 + top_p: float = 0.0 + return_log_probs: bool = False + num_tokens_to_generate: int = 30 + + def add_attributes(self, attribute_value_pair: dict): + """Utility to add more attributes to inference params + + Use this method to pass in a custom dictonary to add more inference parameter attributes to the instance you created. Use as follows + c = CommonInferenceParams + c.add_attributes({'min_length':4, 'eod_id':153}) + + Args: + attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values. + """ + for key, value in attribute_value_pair.items(): + setattr(self, key, value) diff --git a/megatron/core/inference/communication_utils.py b/megatron/core/inference/communication_utils.py new file mode 100644 index 0000000..009d790 --- /dev/null +++ b/megatron/core/inference/communication_utils.py @@ -0,0 +1,49 @@ +import torch + +from megatron.core import parallel_state + + +def _is_cuda(tensor): + """Check if a tensor is not none and is cuda.""" + assert tensor is not None + assert tensor.is_cuda + + +def broadcast_from_last_pipeline_stage(size, dtype, tensor=None): + """Broadcast a tensor from last pipeline stage to all ranks.""" + + if parallel_state.is_pipeline_last_stage(): + _is_cuda(tensor) + assert tensor.is_contiguous() + else: + tensor = torch.empty(size, dtype=dtype, device=torch.cuda.current_device()) + # Get the group and corresponding source rank. + src = parallel_state.get_pipeline_model_parallel_last_rank() + group = parallel_state.get_pipeline_model_parallel_group() + torch.distributed.broadcast(tensor, src, group) + return tensor + + +def recv_from_prev_pipeline_rank_(recv_buffer=None): + """Receive from previous pipeline stage and update the + input buffer inplace.""" + recv_prev_op = torch.distributed.P2POp( + torch.distributed.irecv, recv_buffer, parallel_state.get_pipeline_model_parallel_prev_rank() + ) + reqs = torch.distributed.batch_isend_irecv([recv_prev_op]) + for req in reqs: + req.wait() + # To protect against race condition when using batch_isend_irecv(). + torch.cuda.synchronize() + + +def send_to_next_pipeline_rank(tensor=None): + """Send output to the next pipeline stage.""" + send_next_op = torch.distributed.P2POp( + torch.distributed.isend, tensor, parallel_state.get_pipeline_model_parallel_next_rank() + ) + reqs = torch.distributed.batch_isend_irecv([send_next_op]) + for req in reqs: + req.wait() + # To protect against race condition when using batch_isend_irecv(). + torch.cuda.synchronize() diff --git a/megatron/core/inference/engines/__init__.py b/megatron/core/inference/engines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/inference/engines/abstract_engine.py b/megatron/core/inference/engines/abstract_engine.py new file mode 100644 index 0000000..42201d6 --- /dev/null +++ b/megatron/core/inference/engines/abstract_engine.py @@ -0,0 +1,16 @@ +from abc import ABC, abstractmethod +from typing import List + + +class AbstractEngine(ABC): + @staticmethod + @abstractmethod + def generate(self) -> dict: + """The abstract backend's generate function. + + To define a new backend, implement this and return the outputs as a dictionary. + + Returns: + dict: The output dictionary containing keys for `input_prompt`, `generated_text`, `generated_tokens`. + """ + pass diff --git a/megatron/core/inference/engines/mcore_engine.py b/megatron/core/inference/engines/mcore_engine.py new file mode 100644 index 0000000..0741f65 --- /dev/null +++ b/megatron/core/inference/engines/mcore_engine.py @@ -0,0 +1,91 @@ +from typing import Dict, List + +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.engines.abstract_engine import AbstractEngine +from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.scheduler import Scheduler +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( + SimpleTextGenerationController, +) + + +class MCoreEngine(AbstractEngine): + def __init__( + self, + text_generation_controller: SimpleTextGenerationController, + max_batch_size, + random_seed: int = None, + ): + """The Megatron core backend constructor + + This is the backend that does a simple forward pass on the model. Supports any model that is callable (Accepts the inputs and outputs the tensor) + + Args: + text_generation_controller (SimpleTextGenerationController): A text generation controller that will be used to define how to preprocess prompts, generate outputs and detokenizer the output tokens. + max_batch_size : The maxinum number of requests to process at once + random_seed (int, optional): Use a random seed if you want deterministic results. Defaults to None. + """ + + self.text_generation_controller = text_generation_controller + self.random_seed = random_seed + self.scheduler = Scheduler(max_batch_size=max_batch_size) + + def generate(self, prompts: List[str], common_inference_params: CommonInferenceParams) -> dict: + """The megatron core inference backend generate function + + This backend returns the output generations as a dictionary. It returns the prompt tokens along with the generated tokens, the prompt plus the generated string and the output log probabilities if requested + + Args: + prompts (List[str]): All the prompts as a list of strings + common_inference_params (CommonInferenceParams): The inference parameters + + Returns: + List[InferenceRequest]: The output is list of inference requests containing the generated tokens, texts and log probs if required + """ + # TODO :M core- get rng state tracker + if self.random_seed: + torch.random.manual_seed(self.random_seed) + + for prompt in prompts: + prompt_tokens = self.text_generation_controller.tokenize_prompt(prompt) + self.scheduler.add_request( + prompt=prompt, + prompt_tokens=prompt_tokens, + inference_parameters=common_inference_params, + ) + + self.run_engine() + + result: List[InferenceRequest] = self.scheduler.completed_request_pool.values() + return result + + def run_engine(self): + """Main functionality to run inference + + Runs the engine until there are no requests in the queue. + + Args: + dynamic_generation (bool, optional): Set this to True, if you want to enable dynamic batching. Mainly used with an inference server. Defaults to False. + """ + while self.scheduler.have_requests_pending(): + active_requests: Dict[int, InferenceRequest] = self.scheduler.active_request_pool.copy() + result_dict: Dict[int, InferenceRequest] = ( + self.text_generation_controller.generate_all_output_tokens_static_batch( + active_requests + ) + ) + + self.scheduler.update_requests_pools(result_dict=result_dict) + + # TODO: Later for dynamic batching we will do something like this + """ + if dynamic_batching: + result_dict: Dict[ + int, InferenceRequest + ] = self.text_generation_controller.generate_output_tokens_one_step_dynamic_batch( + active_requests + ) + self.scheduler.update_requests_pools(result_dict=result_dict) + """ diff --git a/megatron/core/inference/inference_request.py b/megatron/core/inference/inference_request.py new file mode 100644 index 0000000..5238414 --- /dev/null +++ b/megatron/core/inference/inference_request.py @@ -0,0 +1,29 @@ +from dataclasses import dataclass +from enum import Enum +from typing import List + +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams + + +# class syntax +class Status(Enum): + WAITING_IN_QUEUE = 1 + ACTIVE_AND_GENERATING_TOKENS = 2 + ACTIVE_BUT_NOT_GENERATING_TOKENS = 3 + COMPLETED = 4 + + +@dataclass +class InferenceRequest: + request_id: str + prompt: str + inference_parameters: CommonInferenceParams + prompt_tokens: List[int] + arrival_time: float + status: Status + generated_text: str = None + generated_tokens: torch.Tensor = None + generated_log_probs: torch.Tensor = None + generated_length: int = 0 diff --git a/megatron/core/inference/model_inference_wrappers/__init__.py b/megatron/core/inference/model_inference_wrappers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py new file mode 100644 index 0000000..50edb84 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py @@ -0,0 +1,233 @@ +import abc +import math +from argparse import Namespace +from typing import Iterable, List, Union + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.communication_utils import ( + recv_from_prev_pipeline_rank_, + send_to_next_pipeline_rank, +) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference_params import InferenceParams +from megatron.core.models.gpt.gpt_model import GPTModel + + +class AbstractModelInferenceWrapper(abc.ABC): + def __init__( + self, + model: Union['LegacyGPTModel', GPTModel], + inference_wrapper_config: InferenceWrapperConfig, + ): + """Constructor for the model inference wrapper + + The wrapper prepares the model for inference, provides the required input data and runs the forward pass. + + Args: + model (Union[GPTModel, LegacyGPTModel]): The actual GPT model (MCore or MLM) + args (Namespace): The commadline arguments that were passed + """ + assert not isinstance( + model, Iterable + ), 'interleaving schedule is not supported for inference' + self.model = model + self.inference_wrapper_config = inference_wrapper_config + self.pipeline_communication_dtype = ( + torch.float + if self.inference_wrapper_config.fp32_residual_connection + else self.inference_wrapper_config.params_dtype + ) + + def prep_model_for_inference(self, prompts_tokens: torch.Tensor): + """A utility function for preparing model for inference + + The function gets called once before the auto regressive inference loop. It puts the model in eval mode , and gets some model and inference data parameters. Extend this to build position ids ,attention mask etc, so that required slices can be extracted during the forward pass. + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len] + + """ + self.model.eval() + + # For TP only model both is_pp_first_stage and _is_pp_last_stage returns True + self.model_is_pipeline_parallel = not ( + parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() + ) + self.prompts_tokens = prompts_tokens + batch_size, max_sequence_length = self.prompts_tokens.shape + self.inference_params = InferenceParams(batch_size, max_sequence_length) + + @abc.abstractmethod + def get_batch_for_context_window(self) -> List: + """Returns the input data for inference + + This function gets called iteratively in the inference loop . It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. + + """ + pass + + def forward_pass_without_pipeline_parallel(self, inference_input: List) -> torch.Tensor: + """Utility to carry out simple forward pass for TP or no model parallel models + + Runs a very simple forward pass for model. Used in the case of models without any parallelism or only tensor parallelism. + + Args: + inference_input (List): A list containg the inputs for the gpt model [tokens, position ids, attention mask] + + Returns: + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] + """ + tokens, position_ids, attention_mask = inference_input + logits = self.model( + tokens, position_ids, attention_mask, inference_params=self.inference_params + ) + logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) + self.inference_params.sequence_len_offset += tokens.size(1) + + return logits + + def _allocate_recv_buffer(self, batch_size, seq_len): + """Receive happens between the layers with size [seq_len, batch_size, hidden_size].""" + recv_size = (seq_len, batch_size, self.inference_wrapper_config.hidden_size) + return torch.empty( + recv_size, dtype=self.pipeline_communication_dtype, device=torch.cuda.current_device() + ) + + def forward_pass_with_pipeline_parallel_small_input_batch( + self, inference_input: List + ) -> torch.Tensor: + """Utility to carry out forward pass for PP models with very small inputs + + If a model is pipeline parallel, yet, the input global batch is very small, we compute a foward pass on the entire global batch, rather than splitting it up into micro batches and doing something more complex as in the forward_pass_with_pipeline_parallel_large_input_batch method + + Args: + inference_input (List): A list containg the inputs for the gpt model [tokens, position ids, attention mask] + + Returns: + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] + """ + tokens, position_ids, attention_mask = inference_input + batch_size, seq_len = tokens.shape + recv_buffer = None + if not parallel_state.is_pipeline_first_stage(): + recv_buffer = self._allocate_recv_buffer(batch_size, seq_len) + recv_from_prev_pipeline_rank_(recv_buffer) + + self.model.set_input_tensor(recv_buffer) + output_tensor = self.model( + tokens, position_ids, attention_mask, inference_params=self.inference_params + ) + + if not parallel_state.is_pipeline_last_stage(): + send_to_next_pipeline_rank(output_tensor.type(dtype=self.pipeline_communication_dtype)) + + self.inference_params.sequence_len_offset += seq_len + + logits = None + if parallel_state.is_pipeline_last_stage(): + logits = output_tensor + logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) + + return logits + + def forward_pass_with_pipeline_parallel_large_input_batch( + self, inference_input: List + ) -> torch.Tensor: + """Utility to carry out forward pass PP models. + + Runs the forward pass for models which are pipeline parallel. This is more complex than forward_pass_with_pipeline_parallel_small_input_batch coz this splits the global batch into small micro batches and runs them through the model. + + Args: + inference_input (List): A list containg the inputs for the gpt model [tokens, position ids, attention mask] + + Returns: + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] + """ + tokens, position_ids, attention_mask = inference_input + micro_batch_size = max( + 1, + self.inference_wrapper_config.inference_batch_times_seqlen_threshold // tokens.size(1), + ) + batch_size, seq_len = tokens.shape + # Round up to account for the last partial micro batch if present + num_micro_batches = math.ceil(batch_size / micro_batch_size) + + logits = None + # Preallocate memory for output logits. + if parallel_state.is_pipeline_last_stage(): + logits = torch.empty( + (batch_size, seq_len, self.inference_wrapper_config.padded_vocab_size), + dtype=torch.float32, + device=torch.cuda.current_device(), + ) + + recv_buffer = None + if not parallel_state.is_pipeline_first_stage(): + recv_buffer = self._allocate_recv_buffer(micro_batch_size, seq_len) + for micro_batch_index in range(num_micro_batches): + start = micro_batch_index * micro_batch_size + end = min(start + micro_batch_size, batch_size) + tokens2use = tokens[start:end, ...] + position_ids2use = position_ids[start:end, ...] + current_micro_batch_size = end - start + + # Need to change recv buffer shape for the last partial microbatch (if exists) + if current_micro_batch_size != micro_batch_size: + recv_buffer = self._allocate_recv_buffer(current_micro_batch_size, seq_len) + + if not parallel_state.is_pipeline_first_stage(): + recv_from_prev_pipeline_rank_(recv_buffer) + + self.model.set_input_tensor(recv_buffer) + output_tensor = self.model( + tokens2use, position_ids2use, attention_mask, inference_params=self.inference_params + ) + + if not parallel_state.is_pipeline_last_stage(): + send_to_next_pipeline_rank(output_tensor) + + self.inference_params.batch_size_offset += current_micro_batch_size + + if parallel_state.is_pipeline_last_stage(): + output_tensor = tensor_parallel.gather_from_tensor_model_parallel_region( + output_tensor + ) + logits[start:end, ...] = output_tensor + + # Once done with all micro batches, we reset batch size offset and seq len offset + self.inference_params.sequence_len_offset += seq_len + self.inference_params.batch_size_offset = 0 + + # NOTE: Only returns the logits on the last pipeline stage + return logits + + def run_one_forward_step(self, inference_input: List) -> torch.Tensor: + """The forward pass of the model for inference + + Appropriate utility is called for the forward pass depending on the type of model parallelism used + + Args: + inference_input (List): A list containg the inputs for the gpt model [tokens, position ids, attention mask] + + Returns: + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]. The logits are returned only in the last pipeline stage for PP models. + """ + if self.model_is_pipeline_parallel: + tokens = inference_input[0] + current_batch_size, seq_len = tokens.shape + # If input batch is large, we need to split into micro batches and run the forward pass + if ( + current_batch_size * seq_len + > self.inference_wrapper_config.inference_batch_times_seqlen_threshold + ): + return self.forward_pass_with_pipeline_parallel_large_input_batch(inference_input) + else: + # If input batch is very small we can do a simple forward pass on the entire global batch + return self.forward_pass_with_pipeline_parallel_small_input_batch(inference_input) + else: + return self.forward_pass_without_pipeline_parallel(inference_input) diff --git a/megatron/core/inference/model_inference_wrappers/gpt/__init__.py b/megatron/core/inference/model_inference_wrappers/gpt/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py new file mode 100644 index 0000000..0e6b9ef --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py @@ -0,0 +1,84 @@ +from argparse import Namespace +from typing import List, Tuple + +import torch + +from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( + AbstractModelInferenceWrapper, +) +from megatron.core.models.gpt import GPTModel + + +class GPTInferenceWrapper(AbstractModelInferenceWrapper): + def __init__(self, model: GPTModel, args: Namespace): + """Constructor for the model inference wrapper + + The wrapper prepares the model for inference, provides the required input data, and runs the forward pass + + Args: + model (GPTModel): The GPT model (MCore or legacy) + args (Namespace): The command line arguments that were passed + """ + super().__init__(model, args) + + def prep_model_for_inference(self, prompts_tokens: torch.Tensor): + """A utility function for preparing model for inference + + This function is called before the forward pass. It puts the model in eval mode, builds position ids, and creates attention masks so that required slices can be extracted during the forward pass. + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len] + """ + + super().prep_model_for_inference(prompts_tokens=prompts_tokens) + self.attention_mask, self.position_ids = self._build_attention_mask_and_position_ids( + prompts_tokens + ) + + def _build_attention_mask_and_position_ids( + self, prompts_tokens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Builds the full attention mask and position ids for the input tokens + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len] + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The attention mask of shape [1, 1, max_seq_len, max_seq_len] and position ids of shape [batch_size, max_seq_len] + """ + seq_length = prompts_tokens.size(1) + attention_mask = torch.tril( + torch.ones((1, seq_length, seq_length), device=prompts_tokens.device) + ).view(1, 1, seq_length, seq_length) + # Convert to boolean + attention_mask = attention_mask < 0.5 + + position_ids = ( + torch.arange(seq_length, dtype=torch.long, device=prompts_tokens.device) + .unsqueeze(0) + .expand_as(prompts_tokens) + ) + + return attention_mask, position_ids + + def get_batch_for_context_window( + self, context_start_position: int, context_end_position: int + ) -> List: + """Returns the inference data given context window + + This function gets called iteratively in a loop . Given the start and end context positions , it extracts the appropriate data. + + Args: + context_start_position (int): Start of the context window. During the first inference step it is mostly 0 + context_end_position (int): End of the context window. During the last inference step it will mostly be the max generated sequence length. + + Returns: + List: A list of inputs that will be used by your model in the forward step + """ + tokens2use = self.prompts_tokens[:, context_start_position:context_end_position] + positions2use = self.position_ids[:, context_start_position:context_end_position] + attention_mask2use = self.attention_mask[ + ..., context_start_position:context_end_position, :context_end_position + ] + data_at_step_idx = [tokens2use, positions2use, attention_mask2use] + return data_at_step_idx diff --git a/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py b/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py new file mode 100644 index 0000000..7677eac --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py @@ -0,0 +1,39 @@ +from dataclasses import dataclass + +import torch + + +@dataclass +class InferenceWrapperConfig: + """Config for the model inference wrapper + + NOTE : All the arguments here are obtained from arguments.py file + """ + + hidden_size: int + """Receive happens between the layers during PP with size [seq_len, batch_size, hidden_size]""" + + params_dtype: torch.dtype + """Can be torch.float or torch.half if --fp16 is used, or torch.bfloat16 if --bf16 is used""" + + inference_batch_times_seqlen_threshold: int + """if batch-size times sequence-length is smaller than this threshold then we will not use pipelining, otherwise we will.""" + + padded_vocab_size: int + """The final padded vocab size (Padded to make it divisible by --make-vocab-size-divisible-by value)""" + + fp32_residual_connection: bool = False + """Move residual connections to fp32. Obtained from arguments.py""" + + def add_attributes(self, attribute_value_pair: dict): + """Utility to add more attributes to inference params + + Use this method to pass in a custom dictonary to add more config to the instance you created. Use as follows + c = InferenceWrapperConfig + c.add_attributes({'precision':'fp32'}) + + Args: + attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values. + """ + for key, value in attribute_value_pair.items(): + setattr(self, key, value) diff --git a/megatron/core/inference/scheduler.py b/megatron/core/inference/scheduler.py new file mode 100644 index 0000000..08d2544 --- /dev/null +++ b/megatron/core/inference/scheduler.py @@ -0,0 +1,116 @@ +import time +import typing +from collections import OrderedDict +from typing import Dict, List + +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.utils import Counter + + +class Scheduler: + def __init__(self, max_batch_size: int): + """Scheduler for handling requests to inference engine + + This class is responsible for handing of all the incomign requests + + Args: + max_batch_size (int): The max batch size that we can pass to the inference engine at a time. + """ + self.max_batch_size = max_batch_size + self.active_request_pool: Dict[int, InferenceRequest] = OrderedDict() + self.waiting_request_pool: Dict[int, InferenceRequest] = OrderedDict() + self.completed_request_pool: Dict[int, InferenceRequest] = OrderedDict() + self.request_counter = Counter() + + def add_request( + self, + prompt: str, + prompt_tokens: torch.Tensor, + inference_parameters: CommonInferenceParams, + arrival_time: float = None, + ): + """Add an incoming request + + This method will add the request to either the active pool or the waiting pool depending on the batch size. + + Args: + prompt (str): Input prompt string + prompt_tokens (torch.Tensor): A torch tensor having the input prompts tokenized + inference_parameters (CommonInferenceParams): The inference parameters + arrival_time (float, optional): The incoming request time. Defaults to None. + """ + request_id = str(next(self.request_counter)) + + if arrival_time is None: + arrival_time = time.time() + + status = ( + Status.ACTIVE_BUT_NOT_GENERATING_TOKENS + if len(self.active_request_pool) < self.max_batch_size + else Status.WAITING_IN_QUEUE + ) + + inference_request = InferenceRequest( + request_id=request_id, + prompt=prompt, + inference_parameters=inference_parameters, + arrival_time=arrival_time, + prompt_tokens=prompt_tokens, + status=status, + ) + + if status == status.ACTIVE_BUT_NOT_GENERATING_TOKENS: + self.active_request_pool[request_id] = inference_request + else: + self.waiting_request_pool[request_id] = inference_request + + def have_requests_pending(self) -> bool: + """Method to check if there are requests pending + + This method returns False only when there are no active requests or waiting requests. + """ + num_requests_pending = len(self.active_request_pool) + len(self.waiting_request_pool) + return num_requests_pending > 0 + + def add_earliest_waiting_request_to_active_pool(self): + """Utility to add the waiting request to active pool + + This method will add the earliest request (FIFO) that is in the waiting request pool to the active request pool. + """ + assert ( + len(self.active_request_pool) < self.max_batch_size + ), "Active request pool is already full. Cant add any more requests" + if len(self.waiting_request_pool) > 0: + ( + earliest_waiting_request_request_id, + earliest_waiting_request, + ) = self.waiting_request_pool.popitem(last=False) + earliest_waiting_request.status = Status.ACTIVE_BUT_NOT_GENERATING_TOKENS + self.active_request_pool[earliest_waiting_request_request_id] = earliest_waiting_request + + def update_requests_pools(self, result_dict: typing.OrderedDict[int, InferenceRequest] = None): + """Update request pool status + + This method will full up the active request pool, if it has less than max batch size elements from the waiting request pool. + If provided with a request dict, it will put the completed requests into the completed request pool and add waiting request into active pool. + + Args: + result (typing.OrderedDict[int, InferenceRequest], optional): The result returned by the engine. A dictionary with keys as the request ids, and values as the requests. Defaults to None + """ + for result_request_id in list(result_dict.keys()): + active_request = self.active_request_pool[result_request_id] + + # If a request has completed put it into the completed request pool. + if active_request.status == Status.COMPLETED: + completed_request = self.active_request_pool.pop(result_request_id) + self.completed_request_pool[result_request_id] = completed_request + + # If the active request pool is not full, add waiting requests in FIFO order + while ( + len(self.active_request_pool) < self.max_batch_size + and len(self.waiting_request_pool) > 0 + ): + self.add_earliest_waiting_request_to_active_pool() diff --git a/megatron/core/inference/text_generation_controllers/__init__.py b/megatron/core/inference/text_generation_controllers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py new file mode 100644 index 0000000..333acc1 --- /dev/null +++ b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py @@ -0,0 +1,352 @@ +from typing import List, OrderedDict, Tuple + +import torch +import torch.nn.functional as F + +from megatron.core import parallel_state +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( + AbstractModelInferenceWrapper, +) + + +class SimpleTextGenerationController: + def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): + """The basic text generation controller + + This class is responsible for tokenizing the input , running the inference, sampling and also detokenizing the output + + Args: + inference_wrapped_model (AbstractModelInferenceWrapper): A model that is wrapped using the specs given in the abstract_model_inference_wrapper.py + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts + """ + self.inference_wrapped_model = inference_wrapped_model + self.tokenizer = tokenizer + + # For models without pipeline parallelism, is_first_stage and is_last_stage returns True + self.model_is_pipeline_parallel = not ( + parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() + ) + + def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility to tokenize the input prompts + + Args: + prompt (str): The input prompt + + Returns: + torch.Tensor: Returns the tokenized prompt + """ + return self.tokenizer.tokenize(prompt) + + def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str: + """Detokenize the output generations + + Args: + prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt tokens plus the generated tokens + + Returns: + str: The detokenized output + """ + tokens = prompt_tokens_with_generated_tokens.cpu().numpy().tolist() + return self.tokenizer.detokenize(tokens) + + def sample_from_logits( + self, + last_token_logits: torch.Tensor, + common_inference_params: CommonInferenceParams, + vocab_size: int = None, + ) -> torch.Tensor: + """Samples the logits to generate outputs + + Given the logits of the last token, this function samples it according to the parameters defined in common_inference_params and returns the samples + + Args: + last_token_logits (torch.Tensor): The last token logits. A tensor of size [batch_size, vocab_size] + common_inference_params (CommonInferenceParams): The paramters to use for inference + vocab_size (int): Obtained from the tokenizer. Defaults to None + + Returns: + torch.Tensor: 1D tensor of the sampled logits with [batch_size] elements + """ + + top_p = common_inference_params.top_p + top_k = common_inference_params.top_k + temperature = common_inference_params.temperature + + assert not (top_k > 0 and top_p > 0), 'Cannot have top-p and top-k both greater than zero' + assert top_p <= 1.0, 'top-p should be in (0,1]' + + def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf.""" + filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits.masked_fill_(filter_, float('-Inf')) + + def modify_logits_for_top_p_filtering(logits, top_p): + """Set the logits for none top-p values to -inf.""" + # First sort and calculate cumulative sum of probabilities. + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Filteration based on the cumulative sum. + filter_ = cumulative_probs > top_p + # This shift by 1 is weird and I cannot justify it. This existed + # in the original implementation: + # https://github.com/ari-holtzman/degen/blob/master/gen.py + # and I guess it is needed so keeping it for now. + filter_[:, 1:] = filter_[:, :-1].clone() + # Make sure we at least have one token to select from. + filter_[..., 0] = 0 + + # Fill in the filtered part + filter_ = filter_.scatter(1, sorted_indices, filter_) + logits.masked_fill_(filter_, float('-Inf')) + + # Greedy sampling + if top_k == 1: + sampled_logits = torch.argmax(last_token_logits, dim=-1) + else: + last_token_logits = last_token_logits.clone() + if temperature != 1.0: + last_token_logits.div_(temperature) + + if top_k > 1: + assert top_k <= last_token_logits.size(1), 'top-k is larger than logit size.' + if vocab_size: + assert top_k < vocab_size, 'top-k is larger than vocab size.' + modify_logits_for_top_k_filtering(last_token_logits, top_k) + + elif top_p > 0.0: + modify_logits_for_top_p_filtering(last_token_logits, top_p) + + # After filtering, we need to recalculate the distribution. + probabilities = last_token_logits.softmax(dim=-1) + sampled_logits = torch.multinomial(probabilities, num_samples=1).view(-1) + + # If vocab size is provided, make sure the samples are in in the range [0, vocab-size). + if vocab_size: + sampled_logits = torch.clamp(sampled_logits, min=0, max=(vocab_size - 1)) + return sampled_logits + + def update_generation_status( + self, + updated_prompts_tokens: torch.Tensor, + generation_started: torch.Tensor, + current_context_end_position: int, + is_generation_done_tensor: torch.Tensor, + generated_sequence_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Checks which prompts have reached an end condition + + We check which prompts have reached an end condition and set the corresponding flags of the is_generation_done_tensor to True. The generated sequence lengths increase as we keep generating, until that prompts hits an end condition. The generation_started tensor determines which prompts have started generating. + + Args: + updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest generated tokens. A tensor of shape [batch_size, max_seq_len] (i.e max_seq_len = max_prompt_len + tokens_to_generate) + generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True indicates the prompt at that index has started generating tokens. + current_context_end_position (int): An integer indicating which position to extract from the prompts tokens to get the latest generated tokens. + is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. True indicates the prompt at that index has reached end condition. + generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. Each value represents the generated sequence lengths for that prompt. + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean is_generation_done_tensor and the generated_sequence_lengths after updating it + """ + latest_samples = updated_prompts_tokens[:, current_context_end_position] + # Make sure we are checking eod criterion only for prompts that have started generating (i.e) We only look at the generated tokenns and not the input tokens. + reached_eod = (latest_samples == self.tokenizer.eod) & generation_started + is_generation_done_tensor = is_generation_done_tensor | reached_eod + # We increment generated sequence lengths when that prompt has not hit the EOD and generation has started + generated_sequence_lengths += ~is_generation_done_tensor & generation_started + + return is_generation_done_tensor, generated_sequence_lengths + + def pad_input_prompt_tokens( + self, + batch_prompt_tokens_list: List[List[int]], + max_prompt_length_in_batch: int, + num_tokens_to_generate: int, + ) -> torch.Tensor: + """Method to pad input prompts + + Given a list of prompts, pad them all to uniform length + + Args: + batch_prompt_tokens_list (List[List[int]]): A list containing the prompt tokens + max_prompt_length_in_batch (int): Maximum of the length of the input prompt tokens + num_tokens_togenerate (int): The number of tokens to generate for each prompt + + Returns: + torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, with extra indices for each tensor padded with mask id. + """ + max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate + + for prompt_tokens in batch_prompt_tokens_list: + padding_size = max_seq_len - len(prompt_tokens) + prompt_tokens.extend([self.tokenizer.eod] * padding_size) + + return torch.tensor(batch_prompt_tokens_list).cuda() + + def generate_output_tokens_dynamic_batch( + self, + active_requests: OrderedDict[int, InferenceRequest], + ) -> OrderedDict[int, InferenceRequest]: + """Utility to generate the output tokens and probabilities for the prompts + + This utility generates the output tokens for a dynamic batch. It will run one forward step at a time, and pass control back to the engine, which will update the request pool and call this method again. + + Args: + active_requests (OrderedDict[int, InferenceRequest]): The input active requests. + + Returns: + OrderedDict[int, InferenceRequest]: The result for each of the incoming requests after running one forward step. + """ + raise Exception("Not implemented yet") + + def generate_all_output_tokens_static_batch( + self, + active_requests: OrderedDict[int, InferenceRequest], + ) -> OrderedDict[int, InferenceRequest]: + """Utility to generate the all the output tokens and probabilities for the prompts . + + This utility generates the output tokens for a static batch. It runs the forward steps till all prompts complete generation, updates the status of these requests to completed, adds the generated result and returns these requests + + Args: + active_requests (OrderedDict[int, InferenceRequest]): The input active requests. + + Returns: + OrderedDict[int, InferenceRequest]: The result for each of the incoming requests + """ + batch_prompt_tokens_list = list( + map(lambda request: request.prompt_tokens, active_requests.values()) + ) + prompt_lengths_in_batch = torch.tensor( + [len(prompt_tokens) for prompt_tokens in batch_prompt_tokens_list] + ).cuda() + max_prompt_length_in_batch = max(prompt_lengths_in_batch) + min_prompt_length_in_batch = min(prompt_lengths_in_batch) + + # For batch inference the inference params are the same for all request + common_inference_params: CommonInferenceParams = list(active_requests.values())[ + 0 + ].inference_parameters + + # max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate + batch_prompt_tokens = self.pad_input_prompt_tokens( + batch_prompt_tokens_list, + max_prompt_length_in_batch=max_prompt_length_in_batch, + num_tokens_to_generate=common_inference_params.num_tokens_to_generate, + ) + batch_size, max_sequence_length = batch_prompt_tokens.shape + + # Pre allocate log probs tensor + output_log_probs = None + if common_inference_params.return_log_probs: + output_log_probs = torch.empty( + (batch_size, max_sequence_length - 1), dtype=torch.float32 + ).cuda() + + # An array to check which of the prompts have reached end of generation condition + is_generation_done_tensor = torch.zeros(batch_size, dtype=torch.bool).cuda() + + # An array to act as a counter to keep track of generated sequence lengths + generated_sequence_lengths = torch.zeros(batch_size).cuda() + + with torch.no_grad(): + self.inference_wrapped_model.prep_model_for_inference( + prompts_tokens=batch_prompt_tokens + ) + + context_start_position = 0 + # Pick the context window that we need to pass through the network. + for context_end_position in range(min_prompt_length_in_batch, max_sequence_length): + + inference_input = self.inference_wrapped_model.get_batch_for_context_window( + context_start_position, context_end_position + ) + + # Returns the final logits of shape [batch_size, context_length, vocab_size] + # Note: This is returned in all TP ranks or last PP stage in PP models + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + if self.model_is_pipeline_parallel: + context_length = context_end_position - context_start_position + logits = broadcast_from_last_pipeline_stage( + [batch_size, context_length, self.tokenizer.vocab_size], + dtype=torch.float32, + tensor=logits, + ) + + # Indicates which of the input prompts have started generating tokens. A 1D boolean tensor with [batch_size] elements (i.e) The shortest prompts will start generating first and so on + generation_started = prompt_lengths_in_batch <= context_end_position + last_token_logits = logits[:, -1, :] + sampled_logits = self.sample_from_logits( + last_token_logits, common_inference_params, self.tokenizer.vocab_size + ) + + # Substitute the sampled logits only for only the prompts that have started generating tokens + batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[ + generation_started + ] + + if common_inference_params.return_log_probs: + log_probs = F.log_softmax(logits, dim=2) + indices = torch.unsqueeze( + batch_prompt_tokens[ + :, (context_start_position + 1) : (context_end_position + 1) + ], + 2, + ) + # Get the log probabilities for only the prompt tokens + output_log_probs[:, context_start_position:context_end_position] = torch.gather( + log_probs, 2, indices + ).squeeze(2) + + context_start_position = context_end_position + + # Check end of generation status for each tensor and update generated sequence lengths + ( + is_generation_done_tensor, + generated_sequence_lengths, + ) = self.update_generation_status( + updated_prompts_tokens=batch_prompt_tokens, + generation_started=generation_started, + current_context_end_position=context_end_position, + is_generation_done_tensor=is_generation_done_tensor, + generated_sequence_lengths=generated_sequence_lengths, + ) + # Boolean flag indicating if all prompts are finished + all_prompts_done = torch.all(is_generation_done_tensor) + if all_prompts_done: + break + + # Include all the generated tokens + batch_prompt_tokens_with_generations = batch_prompt_tokens[:, : (context_end_position + 1)] + if common_inference_params.return_log_probs: + output_log_probs = output_log_probs[:, :context_end_position] + + generated_sequence_lengths[ + generated_sequence_lengths > common_inference_params.num_tokens_to_generate + ] = common_inference_params.num_tokens_to_generate + + for idx, request in enumerate(active_requests.values()): + input_prompt_length = int(prompt_lengths_in_batch[idx]) + # Shorter prompts might have generated more than required tokens. So we trim them down + required_sequence_length = int( + min(generated_sequence_lengths[idx], common_inference_params.num_tokens_to_generate) + ) + # Extract only the generated tokens + required_result_tokens = batch_prompt_tokens_with_generations[ + idx, input_prompt_length : (input_prompt_length + required_sequence_length) + ] + + request.generated_length = required_sequence_length + request.generated_tokens = required_result_tokens + request.generated_log_probs = ( + None + if output_log_probs is None + else output_log_probs[idx, input_prompt_length:required_sequence_length] + ) + request.status = Status.COMPLETED + request.generated_text = self.detokenize_generations(required_result_tokens) + + return active_requests diff --git a/megatron/core/inference/utils.py b/megatron/core/inference/utils.py new file mode 100644 index 0000000..d23808c --- /dev/null +++ b/megatron/core/inference/utils.py @@ -0,0 +1,16 @@ +class Counter: + """A simple counter class + + This class is responsible for assigning request ids to incoming requests + """ + + def __init__(self, start: int = 0) -> None: + self.counter = start + + def __next__(self) -> int: + i = self.counter + self.counter += 1 + return i + + def reset(self) -> None: + self.counter = 0 diff --git a/megatron/core/inference_params.py b/megatron/core/inference_params.py new file mode 100644 index 0000000..4b749a1 --- /dev/null +++ b/megatron/core/inference_params.py @@ -0,0 +1,30 @@ +class InferenceParams: + """Inference parameters that are passed to the main model in order + to efficienly calculate and store the context during inference.""" + + def __init__(self, max_batch_size, max_sequence_length): + self.max_sequence_length = max_sequence_length + self.max_batch_size = max_batch_size + self.sequence_len_offset = 0 + self.batch_size_offset = 0 + self.key_value_memory_dict = {} + + def swap_key_value_dict(self, batch_idx): + "swap between batches" + if len(self.key_value_memory_dict) == 0: + raise ValueError("should not swap when dict in empty") + + for layer_number in self.key_value_memory_dict.keys(): + inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number] + assert ( + len(batch_idx) == inference_key_memory.shape[1] + ) # make sure batch size is the same + new_inference_key_memory = inference_key_memory[:, batch_idx] + new_inference_value_memory = inference_value_memory[:, batch_idx] + self.key_value_memory_dict[layer_number] = ( + new_inference_key_memory, + new_inference_value_memory, + ) + + def __str__(self): + return f"InferenceParams(max_seq_len = {self.max_sequence_length}, max_batch_size = {self.max_batch_size}, sequence_len_offset = {self.sequence_len_offset}, batch_size_offset = {self.batch_size_offset}, key_value_memory_dict = {self.key_value_memory_dict.keys()})" diff --git a/megatron/core/jit.py b/megatron/core/jit.py new file mode 100644 index 0000000..8bb18d3 --- /dev/null +++ b/megatron/core/jit.py @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import torch + +TORCH_MAJOR = int(torch.__version__.split(".")[0]) +TORCH_MINOR = int(torch.__version__.split(".")[1]) + +jit_fuser = torch.jit.script +# nvFuser is deprecated in PyTorch JIT starting from 2.2 +if (TORCH_MAJOR > 2) or (TORCH_MAJOR == 2 and TORCH_MINOR >= 2): + jit_fuser = torch.compile diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py new file mode 100644 index 0000000..6bf7c8e --- /dev/null +++ b/megatron/core/model_parallel_config.py @@ -0,0 +1,324 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass +from typing import Callable, ContextManager, Optional + +import torch + + +@dataclass +class ModelParallelConfig: + """Base configuration for Megatron Core + + The initialization function has an argument for each parameter. + """ + + ################### + # Model parallelism + ################### + tensor_model_parallel_size: int = 1 + """Intra-layer model parallelism. Splits tensors across GPU ranks.""" + + pipeline_model_parallel_size: int = 1 + """Inter-layer model parallelism. Splits transformer layers across GPU ranks.""" + + virtual_pipeline_model_parallel_size: Optional[int] = None + """Interleaved pipeline parallelism is used to improve performance by reducing the pipeline + bubble. Considers a transformer block as a list of smaller transformer (virtual) blocks. + The number of virtual blocks per pipeline model parallel rank is the virtual model parallel + size. See Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM: + arxiv.org/pdf/2104.04473.pdf for more details. + """ + + sequence_parallel: bool = False + """Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms + and dropout sequentially. See Reducing Activation Recomputation in Large Transformer Models + (https://arxiv.org/abs/2205.05198) for more details. + """ + + context_parallel_size: int = 1 + """Splits network input along sequence dimension across GPU ranks.""" + + expert_model_parallel_size: int = 1 + """Distributes Moe Experts across sub data parallel dimension.""" + + moe_extended_tp: bool = False + """Alternative parallelization strategy for expert parallelism. Instead of distributing experts + across expert_model_parallel_size, each expert is sharded along extendended tensor parallel + domain (tensor_model_paralle_size * expert_model_parallel_size). It avoids the load balancing + problem with MOE training. + """ + + ################### + # Initialization + ################### + perform_initialization: bool = True + """If true, weights are initialized. This option can be useful when you know you are going to + load values from a checkpoint. + """ + + use_cpu_initialization: bool = False + """When set to False, we initialize the weights directly on the GPU. CPU initialization is the + same regardless of tensor model parallelism, but GPU initialization is not. Transferring + weights from CPU to GPU can take a significant amount of time for large models. + """ + + ################### + # Training + ################### + fp16: bool = False + """If true, train with fp16 mixed precision training.""" + + bf16: bool = False + """If true, train with bf16 mixed precision training.""" + + params_dtype: torch.dtype = torch.float32 + """dtype used when intializing the weights.""" + + timers: Callable = None + """Timers object to call for various timing functions. See megatron.core.timers.Timers""" + + finalize_model_grads_func: Callable = None + """Function that finalizes gradients on all workers. Could include ensuring that grads are + all-reduced across data parallelism, pipeline parallelism, and sequence parallelism + dimensions. + """ + + grad_scale_func: Callable = None + """If using loss scaling, this function should take the loss and return the scaled loss. If + None, no function is called on the loss. + """ + + no_sync_func: Callable = None + """Function that creates a context that suppresses asynchronous data-parallel communication. If + the model is an instance of core.distributed.DistributedDataParallel, the default is to use + core.distributed.DistributedDataParallel.no_sync. + """ + + grad_sync_func: Callable = None + """Function that launches asynchronous gradient reductions (e.g. distributed optimizer gradient + reduce-scatters). The function should take one argument: an iterable of parameters whose + gradients are to be synchronized. + """ + + param_sync_func: Callable = None + """Function that launches asynchronous parameter synchronizations (e.g. distributed optimizer + parameter all-gathers). The function should take one argument: an iterable of parameters to + be synchronized. + """ + + deterministic_mode: bool = False + """If true, code that has deterministic execution will be chosen. This usually + means slower execution, but is good for debugging and testing. Defaults to False.""" + + enable_autocast: bool = False + """If true runs the forward step function inside torch.autocast context.""" + + autocast_dtype: torch.dtype = None + """dtype to pass to torch.amp.autocast when enabled. If None, is set to pipeline_dtype.""" + + num_microbatches_with_partial_activation_checkpoints: Optional[int] = None + """If int, set the number of microbatches where not all of the layers will be checkpointed and + recomputed. The rest of the microbatches within the window of maximum outstanding + microbatches will recompute all layers (either full recompute or selective recompute). If + None, the checkpoint and recompute will be left up to the forward_step function. + + """ + + ################### + # Optimizations + ################### + gradient_accumulation_fusion: bool = False + """If true, fuses weight gradient accumulation to GEMMs. Requires the custom CUDA extension + fused_weight_gradient_mlp_cuda module. To use gradient_accumulation_fusion you must install + APEX with --cpp_ext and --cuda_ext. For example: "pip install --global-option=\"--cpp_ext\" + --global-option=\"--cuda_ext\" ". Note that the extension requires CUDA>=11. Otherwise, you + must turn off gradient accumulation fusion. + """ + + async_tensor_model_parallel_allreduce: bool = False + """NOTE: Deprecated. This flag is ignored.""" + + use_te_rng_tracker: bool = False + """If true, uses RNG state tracker in TransformerEngine if exists. + """ + + tp_comm_overlap: bool = False + """If true, allows overlapping of Linear layer execution with tensor parallel communication + collectives like AllGather/ReduceScatter. Overlapping is done for the linear layers wherever + possible during the forward and the backward pass. + """ + + tp_comm_bulk_wgrad: bool = True + """If true, allows All-Gather overlap with Bprop activation gradient GEMM. Don't care if + tp_comm_overlap is False. + """ + + tp_comm_bulk_dgrad: bool = True + """If true, allows Reduce-Scatter overlap with Bprop weight gradient GEMM. Don't care if + tp_comm_overlap is False. + """ + + tp_comm_overlap_ag: bool = True + """If true, allows All-Gather overlap with GEMM by pipelining the GEMM and All-Gather. + Don't care if tp_comm_overlap is False. + """ + + tp_comm_overlap_rs: bool = True + """If true, allows Reduce-Scatter overlap with GEMM by pipelining the GEMM and Reduce-Scatter. + Don't care if tp_comm_overlap is False. + """ + + tp_comm_overlap_rs_dgrad: bool = False + """If true, allows Reduce-Scatter overlap with DGRAD GEMM by pipelining the + GEMM and Reduce-Scatter splits. Don't care if tp_comm_overlap is False. + """ + + tp_comm_split_ag: bool = True + """Deprecated from TransformerEngine v1.6.0. + If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather + splits. Don't care if tp_comm_overlap is False. + """ + + tp_comm_atomic_ag: bool = False + """Deprecated from TransformerEngine v1.6.0. + If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather both + done atomically. Don't care if tp_comm_overlap is False. + """ + + tp_comm_split_rs: bool = True + """Deprecated from TransformerEngine v1.6.0. + If true, allows Reduce-Scatter overlap with Fprop GEMM by pipelining the GEMM and + Reduce-Scatter splits. Don't care if tp_comm_overlap is False. + """ + + tp_comm_atomic_rs: bool = False + """Deprecated from TransformerEngine v1.6.0. + If true, allows Reduce-Scatter overlap with Fprop GEMM by pipelining the GEMM and + Reduce-Scatter both done atomically. Don't care if tp_comm_overlap is False. + """ + + cross_entropy_loss_fusion: bool = False + """If this is enabled, the fused cross entropy implementation would be used. + Defaults to False. + """ + + ################### + # Pipeline Parallel + ################### + pipeline_dtype: torch.dtype = None + """dtype used in p2p communication, usually params_dtype""" + + variable_seq_lengths: bool = False + """Support for variable sequence lengths across microbatches. Setting this communicates the size + of tensors during pipeline parallelism communication, because of this extra overhead it + should only be set if the sequence length varies by microbatch within a global batch. + """ + + overlap_p2p_comm: bool = False + """When True some of the peer to peer communication for pipeline parallelism will overlap with + computation. Must be False if batch_p2p_comm is true. + """ + + batch_p2p_comm: bool = True + """Use batch_isend_irecv instead of individual isend/irecv calls. Must be False if + overlap_p2p_comm is True. + """ + + batch_p2p_sync: bool = True + """When using batch_isend_irecv, do a cuda.device.synchronize afterward to work around a bug in + older version of PyTorch. + """ + + use_ring_exchange_p2p: bool = False + """Use custom ring_exchange kernel instead of torch.distributed.batch_isend_irecv(). Requires + custom built torch with torch.distributed.ring_exchange. + """ + + deallocate_pipeline_outputs: bool = False + """If True, output data is deallocated after the tensor is sent to the next pipeline stage. + Helps with saving memory, does nothing when pipeline parallel is not used. + """ + + defer_embedding_wgrad_compute: bool = False + """If true, defers the embedding WGRAD GEMMs while pipeline flush is + taking place enabling us to hide pipeline flush latency. Defaults to False. + """ + + wgrad_deferral_limit: int = 0 + """This value tunes the number of micro-batches for which the embedding weight gradient compute + needs to be deferred to pipeline flush, this argument is invalid if `defer_embedding_wgrad_compute` is False. + Defaults to 0, which means all micro-batches are deferred. + """ + + pipeline_model_parallel_split_rank: Optional[int] = None + """If int, rank where encoder and decoder should be split in cases where the model has both an + encoder and decoder (e.g., T5). Ignored if None. + """ + + ################### + # CPU Offloading + ################### + cpu_offloading: bool = False + """When set to True, all the activations are offloaded to the CPU asynchronously.""" + + cpu_offloading_num_layers: int = 0 + """Tells the number of transformer layers for which activations has to be offloaded.""" + + _cpu_offloading_context: ContextManager = ( + None # Used for internal use only, not to be set by the user. TODO: Need to move to the 'right' place when possible. + ) + """For internal use only, do not set.""" + + cpu_offloading_activations: bool = True + """If True, offloads the activations to CPU.""" + + cpu_offloading_weights: bool = True + """If True, offloads the weights to CPU.""" + + ################### + # Timing + ################### + barrier_with_L1_time: bool = True + """If true, use barrier with level 1 time measurements. It is up to the user to make sure + calling barrier with their timers will not result in hangs. This can happen if for example + the user adds a level 1 timer that is not called by all ranks. + """ + + def __post_init__(self): + """Python dataclass method that is used to modify attributes after initialization. + See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. + """ + if self.sequence_parallel: + if self.tensor_model_parallel_size <= 1: + raise ValueError("Can not use sequence paralllelism without tensor parallelism") + + if self.pipeline_model_parallel_size > 1: + if self.pipeline_dtype is None: + raise ValueError( + "When using pipeline parallelism, pipeline_dtype must be specified" + ) + + if self.autocast_dtype is None: + self.autocast_dtype = self.params_dtype + + if self.defer_embedding_wgrad_compute and self.pipeline_model_parallel_size == 1: + raise ValueError( + "Cannot defer embedding wgrad compute when pipeline model parallel is not used" + ) + + if self.defer_embedding_wgrad_compute and not self.gradient_accumulation_fusion: + raise ValueError( + "Cannot defer embedding wgrad compute when gradient accumulation fusion is not used" + ) + + if self.defer_embedding_wgrad_compute and self.wgrad_deferral_limit < 0: + raise ValueError( + "Wgrad deferral limit should be greater than or equal to 0 when this optimization is enabled!" + ) + + if self.expert_model_parallel_size > 1 and self.tensor_model_parallel_size > 1: + if self.sequence_parallel is False: + raise ValueError( + "When using expert parallelism and tensor parallelism, sequence parallelism must be used" + ) diff --git a/megatron/core/models/T5/__init__.py b/megatron/core/models/T5/__init__.py new file mode 100644 index 0000000..f65859a --- /dev/null +++ b/megatron/core/models/T5/__init__.py @@ -0,0 +1 @@ +from .t5_model import T5Model diff --git a/megatron/core/models/T5/t5_model.py b/megatron/core/models/T5/t5_model.py new file mode 100644 index 0000000..87a5ba8 --- /dev/null +++ b/megatron/core/models/T5/t5_model.py @@ -0,0 +1,449 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import logging +from typing import List, Literal, Optional, Tuple + +import torch +from torch import Tensor + +from megatron.core import InferenceParams, parallel_state, tensor_parallel +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.common.language_module.language_module import LanguageModule +from megatron.core.transformer.enums import AttnMaskType, ModelType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint + + +class T5LMHead(MegatronModule): + """Masked LM head for T5 + + Args: + config (TransformerConfig): transformer config + parallel_output (bool): wether output logits being distributed or not. + vocab_size (int): vocabulary size + pre_process (bool): Include embedding layer + share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are + shared. + """ + + def __init__( + self, + config: TransformerConfig, + parallel_output: bool, + vocab_size: int, + pre_process: bool = True, + share_embeddings_and_output_weights: bool = False, + ): + super(T5LMHead, self).__init__(config=config) + + self.parallel_output = parallel_output + + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + vocab_size, + config=config, + init_method=config.init_method, + bias=share_embeddings_and_output_weights, + skip_bias_add=not share_embeddings_and_output_weights, + gather_output=not self.parallel_output, + skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights, + ) + + def forward(self, hidden_states: Tensor, word_embeddings_weight: Tensor) -> Tensor: + """Forward pass. + + Args: + hidden_states (Tensor): output hidden states from decoder + word_embeddings_weight (Tensor): word embedding weight + + Returns: + Tensor: logits tensor + """ + + logits, _ = self.output_layer(hidden_states, weight=word_embeddings_weight) + return logits + + +class T5Model(LanguageModule): + """T5 Language model. + + Args: + config (TransformerConfig): transformer config + + transformer_encoder_layer_spec (ModuleSpec): transformer layer customization specs for encoder + + transformer_decoder_layer_spec (ModuleSpec): transformer layer customization specs for decoder + + vocab_size (int): vocabulary size + + max_sequence_length (int): maximum size of sequence. This is used for positional embedding + + pre_process (bool): Include embedding layer (used with pipeline parallelism) + post_process (bool): Include an output layer (used with pipeline parallelism) + + fp16_lm_cross_entropy (bool, optional): Defaults to False + + parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks + + share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are + shared. Defaults to False. + + position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope']. + Defaults is 'learned_absolute'. + + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'. + + seq_len_interpolation_factor (float): scale of linearly interpolating RoPE for longer sequences. + The value must be a float larger than 1.0. Defaults to None. + """ + + def __init__( + self, + config: TransformerConfig, + transformer_encoder_layer_spec: ModuleSpec, + transformer_decoder_layer_spec: ModuleSpec, + vocab_size: int, + max_sequence_length: int, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', + rotary_percent: float = 1.0, + seq_len_interpolation_factor: Optional[float] = None, + ): + + super(T5Model, self).__init__(config=config) + + self.config: TransformerConfig = config + self.transformer_encoder_layer_spec: ModuleSpec = transformer_encoder_layer_spec + self.transformer_decoder_layer_spec: ModuleSpec = transformer_decoder_layer_spec + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.pre_process = pre_process + self.post_process = post_process + self.add_encoder = True + self.add_decoder = True + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + + # megatron core pipelining currently depends on model type + self.model_type = ModelType.encoder_and_decoder + + # Embeddings. + if self.pre_process: + self.embedding = LanguageModelEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + position_embedding_type=self.position_embedding_type, + ) + + # Rotary Position Embeddings + if self.position_embedding_type == 'rope': + self.rotary_pos_emb = RotaryEmbedding( + kv_channels=self.config.kv_channels, + rotary_percent=rotary_percent, + rotary_interleaved=self.config.rotary_interleaved, + seq_len_interpolation_factor=seq_len_interpolation_factor, + use_cpu_initialization=self.config.use_cpu_initialization, + ) + + # Transformer encoder + encoder_spec, decoder_spec = ( + self.transformer_encoder_layer_spec, + self.transformer_decoder_layer_spec, + ) + self.encoder = TransformerBlock( + config=self.config, + spec=encoder_spec, + pre_process=self.pre_process, + post_process=self.post_process, + ) + # Transformer decoder + self.decoder = TransformerBlock( + config=self.config, + spec=decoder_spec, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + # Output + if post_process: + self.lm_head = T5LMHead( + config, + parallel_output, + self.vocab_size, + self.pre_process, + self.share_embeddings_and_output_weights, + ) + self.output_layer = self.lm_head.output_layer + + if self.pre_process or self.post_process: + self.setup_embeddings_and_output_layer() + + def forward( + self, + encoder_input_ids: Tensor, + decoder_input_ids: Tensor, + encoder_attn_mask: Tensor, + decoder_attn_mask: Tensor, + encoder_decoder_attn_mask: Tensor, + lm_labels: Tensor = None, + encoder_hidden_states: Tensor = None, + output_encoder_hidden_only: bool = False, + inference_params: InferenceParams = None, + ) -> Tensor: + """Forward pass. + + Args: + encoder_input_ids (Tensor): input ids for encoder + decoder_input_ids (Tensor): input ids for decoder + encoder_attn_mask (Tensor): self-attention mask for encoder + decoder_attn_mask (Tensor): self-attention mask for decoder + encoder_decoder_attn_mask (Tensor): cross-attention mask between encoder and decoder + lm_labels (Tensor): labels for decoder output + inference_params (InferenceParams): relevant arguments for inferencing + + Returns: + Tensor: loss tensor + """ + + ( + encoder_attn_mask, + decoder_attn_mask, + encoder_decoder_attn_mask, + ) = t5_extended_attention_mask( + [encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask] + ) + + ## Encoder forward + if encoder_hidden_states is None: + # Encoder position ids + encoder_position_ids = t5_position_ids(encoder_input_ids) + + # Encoder embedding. + if self.pre_process: + encoder_input = self.embedding( + input_ids=encoder_input_ids, position_ids=encoder_position_ids + ) + else: + # intermediate stage of pipeline + encoder_input = None + + # Rotary positional embeddings + rotary_pos_emb = None + if self.position_embedding_type == 'rope': + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( + inference_params, self.encoder, encoder_input, self.config + ) + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run encoder. + encoder_hidden_states = self.encoder( + hidden_states=encoder_input, + attention_mask=encoder_attn_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + # Return encoder hiddenstates if output_encoder_hidden_only is True + if output_encoder_hidden_only: + return encoder_hidden_states + + ## Decoder forward + # Decoder position ids + decoder_position_ids = t5_position_ids(decoder_input_ids) + + # Decoder embedding. + if self.pre_process: + decoder_input = self.embedding( + input_ids=decoder_input_ids, position_ids=decoder_position_ids + ) + else: + # intermediate stage of pipeline + decoder_input = None ### should it take encoder_hidden_states + + # Rotary positional embeddings + rotary_pos_emb = None + if self.position_embedding_type == 'rope': + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( + inference_params, self.decoder, decoder_input, self.config + ) + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run decoder. + decoder_hidden_states = self.decoder( + hidden_states=decoder_input, + attention_mask=decoder_attn_mask, + context=encoder_hidden_states, + context_mask=encoder_decoder_attn_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + # Return if not post_process + if not self.post_process: + return decoder_hidden_states + + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() + logits = self.lm_head(decoder_hidden_states, word_embeddings_weight=output_weight) + + if lm_labels is None: + # [s b h] => [b s h] + return logits.transpose(0, 1).contiguous() + + loss = self.compute_language_model_loss(lm_labels, logits) + + return loss + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + if self.add_encoder and self.add_decoder: + assert ( + len(input_tensor) == 1 + ), 'input_tensor should only be length 1 for stage with both encoder and decoder' + self.encoder.set_input_tensor(input_tensor[0]) + elif self.add_encoder: + assert ( + len(input_tensor) == 1 + ), 'input_tensor should only be length 1 for stage with only encoder' + self.encoder.set_input_tensor(input_tensor[0]) + elif self.add_decoder: + if len(input_tensor) == 2: + self.decoder.set_input_tensor(input_tensor[0]) + self.encoder_hidden_state = input_tensor[1] + elif len(input_tensor) == 1: + self.decoder.set_input_tensor(None) + self.encoder_hidden_state = input_tensor[0] + else: + raise Exception('input_tensor must have either length 1 or 2') + else: + raise Exception('Stage must have at least either encoder or decoder') + + def shared_embedding_or_output_weight(self) -> Tensor: + """Function to share the input embeddings and output logit weights.""" + + if self.pre_process: + return self.embedding.word_embeddings.weight + elif self.post_process: + return self.lm_head.output_layer.weight + return None + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[dict] = None + ) -> ShardedStateDict: + assert not sharded_offsets, "Unexpected sharded offsets" + sharded_state_dict = {} + + if self.pre_process: + embedding_prefix = f'{prefix}embedding.' + embedding_sharded_state_dict = self.embedding.sharded_state_dict( + prefix=embedding_prefix, metadata=metadata + ) + sharded_state_dict.update(embedding_sharded_state_dict) + + encoder_prefix = f'{prefix}encoder.' + encoder_sharded_state_dict = self.encoder.sharded_state_dict( + prefix=encoder_prefix, metadata=metadata + ) + sharded_state_dict.update(encoder_sharded_state_dict) + + decoder_prefix = f'{prefix}decoder.' + decoder_sharded_state_dict = self.decoder.sharded_state_dict( + prefix=decoder_prefix, metadata=metadata + ) + sharded_state_dict.update(decoder_sharded_state_dict) + + if self.post_process: + output_layer_prefix = f'{prefix}output_layer.' + output_layer_weight_key = f'{output_layer_prefix}weight' + output_layer_bias_key = f'{output_layer_prefix}bias' + if self.share_embeddings_and_output_weights: + if not self.pre_process: + # when sharing embeddings with last stage, we need to use the weights from the first stage + # on pipeline first rank, word embeddings are saved to {prefix}embedding.word_embeddings.weight + tensor = self.shared_embedding_or_output_weight() + first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' + dp_rank = parallel_state.get_data_parallel_rank() + dp_size = parallel_state.get_data_parallel_world_size() + last_stage_word_emb_replica_id = ( + dp_rank + dp_size + ) # copy of first stage embedding + + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=tensor, + key=first_stage_word_emb_key, + replica_id=last_stage_word_emb_replica_id, + allow_shape_mismatch=True, + ) + + sharded_state_dict[output_layer_weight_key] = sharded_output_layer_tensor + # output_layer.weight is shared, but we still need to process output_layer.bias + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=self.lm_head.output_layer.bias, + key=output_layer_bias_key, + allow_shape_mismatch=True, + ) + sharded_state_dict[output_layer_bias_key] = sharded_output_layer_tensor + else: + output_layer_state_dict = self.output_layer.state_dict( + prefix=output_layer_prefix, keep_vars=True + ) + output_layer_tensor = output_layer_state_dict[output_layer_weight_key] + # independent output layer + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=output_layer_tensor, + key=output_layer_weight_key, + replica_id=parallel_state.get_data_parallel_rank(), + allow_shape_mismatch=True, + ) + + sharded_state_dict[output_layer_weight_key] = sharded_output_layer_tensor + + return sharded_state_dict + + +def t5_extended_attention_mask(attention_mask_list: List[Tensor]) -> List[Tensor]: + def attn_mask_postprocess(attn_mask): + # [b, 1, s, s] + extended_attention_mask = attn_mask.unsqueeze(1) + return extended_attention_mask + + return [ + (attn_mask_postprocess(attn_mask) if attn_mask is not None else None) + for attn_mask in attention_mask_list + ] + + +def t5_position_ids(token_ids: Tensor) -> Tensor: + """Calculate position ids from token ids + Args: + token_ids (Tensor): input tokens + + Returns: + Tensor: position ids + """ + seq_length = token_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=token_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(token_ids) + + return position_ids diff --git a/megatron/core/models/T5/t5_spec.py b/megatron/core/models/T5/t5_spec.py new file mode 100644 index 0000000..e837285 --- /dev/null +++ b/megatron/core/models/T5/t5_spec.py @@ -0,0 +1,253 @@ +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import ( + CrossAttention, + CrossAttentionSubmodules, + SelfAttention, + SelfAttentionSubmodules, +) +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_block import ( + TransformerBlockSubmodules, + get_num_layers_to_build, +) +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +try: + from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TENorm, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +try: + import apex + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + + +def encoder_model_with_transformer_engine_default_spec() -> ModuleSpec: + """T5 encoder TE spec (uses Transformer Engine components).""" + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.padding}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + mlp_bda=get_bias_dropout_add, + ), + ) + + +def decoder_model_with_transformer_engine_default_spec() -> ModuleSpec: + """T5 decoder TE spec (uses Transformer Engine components).""" + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_cross_attn_layernorm=TENorm, + cross_attention=ModuleSpec( + module=CrossAttention, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + cross_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + mlp_bda=get_bias_dropout_add, + ), + ) + + +def encoder_model_with_local_spec() -> ModuleSpec: + """T5 encoder local spec (uses Megatron-Core components).""" + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.padding}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=ColumnParallelLinear, + linear_fc2=RowParallelLinear, + ), + ), + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ) + + +def decoder_model_with_local_spec() -> ModuleSpec: + """T5 decoder local spec (uses Megatron-Core components).""" + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_cross_attn_layernorm=LNImpl, + cross_attention=ModuleSpec( + module=CrossAttention, + submodules=CrossAttentionSubmodules( + linear_q=ColumnParallelLinear, + linear_kv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + ), + ), + cross_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=ColumnParallelLinear, + linear_fc2=RowParallelLinear, + ), + ), + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ) + + +def get_t5_encoder_with_transformer_engine_block_spec( + num_layers: int, +) -> TransformerBlockSubmodules: + """T5 encoder block spec for Transformer Engine + + Args: + config (TransformerConfig): config, containing number of layers for encoder + """ + + layer_spec = encoder_model_with_transformer_engine_default_spec() + block_spec = TransformerBlockSubmodules([layer_spec] * num_layers, layer_norm=TENorm) + return block_spec + + +def get_t5_decoder_with_transformer_engine_block_spec( + num_layers: int, +) -> TransformerBlockSubmodules: + """T5 decoder block spec for Transformer Engine + + Args: + config (TransformerConfig): config, containing number of layers for decoder + """ + + layer_spec = decoder_model_with_transformer_engine_default_spec() + block_spec = TransformerBlockSubmodules([layer_spec] * num_layers, layer_norm=TENorm) + return block_spec + + +def get_t5_encoder_with_local_block_spec(num_layers: int) -> TransformerBlockSubmodules: + """T5 encoder block spec for local (uses Megatron-Core components) + + Args: + num_layers (int): number of encoder layers + """ + + layer_spec = encoder_model_with_local_spec() + block_spec = TransformerBlockSubmodules([layer_spec] * num_layers, layer_norm=TENorm) + return block_spec + + +def get_t5_decoder_with_local_block_spec(num_layers: int) -> TransformerBlockSubmodules: + """T5 decoder block spec for local (uses Megatron-Core components) + + Args: + num_layers (int): number of decoder layers + """ + + layer_spec = decoder_model_with_local_spec() + block_spec = TransformerBlockSubmodules([layer_spec] * num_layers, layer_norm=TENorm) + return block_spec diff --git a/megatron/core/models/__init__.py b/megatron/core/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/models/bert/__init__.py b/megatron/core/models/bert/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/models/bert/bert_layer_specs.py b/megatron/core/models/bert/bert_layer_specs.py new file mode 100644 index 0000000..fefe922 --- /dev/null +++ b/megatron/core/models/bert/bert_layer_specs.py @@ -0,0 +1,95 @@ +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +try: + from megatron.core.transformer.custom_layers.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +try: + import apex + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + +# Use this spec to use lower level Transformer Engine modules (required for fp8 training) +bert_layer_with_transformer_engine_spec = ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.padding}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + mlp_bda=get_bias_dropout_add, + ), +) + +# Use this spec for an implementation using only modules in megatron core +bert_layer_local_spec = ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.padding}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=ColumnParallelLinear, + linear_fc2=RowParallelLinear, + ), + ), + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), +) diff --git a/megatron/core/models/bert/bert_lm_head.py b/megatron/core/models/bert/bert_lm_head.py new file mode 100644 index 0000000..548c046 --- /dev/null +++ b/megatron/core/models/bert/bert_lm_head.py @@ -0,0 +1,59 @@ +import torch +from torch import Tensor + +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import get_linear_layer + +try: + import apex + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + + +class BertLMHead(MegatronModule): + """Masked LM head for Bert. + + Args: + hidden_size: hidden size + config (TransformerConfig): TransformerConfig object + """ + + def __init__( + self, + hidden_size: int, + config: TransformerConfig, + ): + super().__init__(config=config) + + # TODO: Should switch this to TE ? + self.dense = get_linear_layer( + hidden_size, hidden_size, config.init_method, config.perform_initialization + ) + + setattr(self.dense.weight, 'sequence_parallel', config.sequence_parallel) + setattr(self.dense.bias, 'sequence_parallel', config.sequence_parallel) + + self.layer_norm = LNImpl( + config=config, + hidden_size=hidden_size, + eps=config.layernorm_epsilon, + ) + + self.gelu = torch.nn.functional.gelu + + def forward(self, hidden_states: Tensor) -> Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.gelu(hidden_states) + hidden_states = self.layer_norm(hidden_states) + return hidden_states diff --git a/megatron/core/models/bert/bert_model.py b/megatron/core/models/bert/bert_model.py new file mode 100644 index 0000000..6f40cdc --- /dev/null +++ b/megatron/core/models/bert/bert_model.py @@ -0,0 +1,284 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import os +from collections import OrderedDict +from typing import Dict, Literal, Optional + +import torch +from torch import Tensor + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.models.bert.bert_lm_head import BertLMHead +from megatron.core.models.bert.pooler import Pooler +from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.common.language_module.language_module import LanguageModule +from megatron.core.transformer.enums import AttnMaskType, ModelType +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import get_linear_layer +from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint + + +class BertModel(LanguageModule): + """Transformer language model. + + Args: + config (TransformerConfig): transformer config + num_tokentypes (int) : Set to 2 when args.bert_binary_head is True, and 0 otherwise. Defaults to 0. + transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers + vocab_size (int): vocabulary size + max_sequence_length (int): maximum size of sequence. This is used for positional embedding + pre_process (bool): Include embedding layer (used with pipeline parallelism) + post_process (bool): Include an output layer (used with pipeline parallelism) + parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks + share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are shared. Defaults to False. + position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope']. + Defaults is 'learned_absolute'. + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'. + """ + + def __init__( + self, + config: TransformerConfig, + num_tokentypes: int, + transformer_layer_spec: ModuleSpec, + vocab_size: int, + max_sequence_length: int, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', + rotary_percent: float = 1.0, + seq_len_interpolation_factor: Optional[float] = None, + add_binary_head=True, + return_embeddings=False, + ): + super(BertModel, self).__init__(config=config) + + if return_embeddings: + assert self.post_process and self.add_binary_head + + assert ( + os.getenv('NVTE_ALLOW_NONDETERMINISTIC_ALGO') == '0' + or os.getenv('NVTE_FLASH_ATTN') == '0' + ), "Bert currently does not support flash attention. Please set env variable NVTE_FLASH_ATTN=0 or set NVTE_ALLOW_NONDETERMINISTIC_ALGO=0" + + self.config: TransformerConfig = config + self.transformer_layer_spec: ModuleSpec = transformer_layer_spec + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + self.add_binary_head = add_binary_head + self.return_embeddings = return_embeddings + + # megatron core pipelining currently depends on model type + self.model_type = ModelType.encoder_or_decoder + + # Embeddings. + if self.pre_process: + self.embedding = LanguageModelEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + position_embedding_type=position_embedding_type, + num_tokentypes=num_tokentypes, + ) + + if self.position_embedding_type == 'rope': + self.rotary_pos_emb = RotaryEmbedding( + kv_channels=self.config.kv_channels, + rotary_percent=rotary_percent, + rotary_interleaved=self.config.rotary_interleaved, + seq_len_interpolation_factor=seq_len_interpolation_factor, + use_cpu_initialization=self.config.use_cpu_initialization, + ) + + # Transformer. + self.encoder = TransformerBlock( + config=self.config, + spec=self.transformer_layer_spec, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + # Output + if post_process: + # TODO: Make sure you are passing in the mpu_vocab_size properly + self.lm_head = BertLMHead( + config.hidden_size, + config, + ) + + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + self.vocab_size, + config=config, + init_method=config.init_method, + bias=True, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights, + ) + + self.binary_head = None + if self.add_binary_head: + # TODO: Shoudl switch this to TE ? + self.binary_head = get_linear_layer( + config.hidden_size, 2, config.init_method, config.perform_initialization + ) + + self.pooler = Pooler( + config.hidden_size, config.init_method, config, config.sequence_parallel + ) + + if self.pre_process or self.post_process: + self.setup_embeddings_and_output_layer() + + def bert_extended_attention_mask(self, attention_mask: Tensor) -> Tensor: + """Creates the extended attention mask + + Converts the attention mask of dimension [batch size, 1, seq len] to [batch size, 1, seq len, seq len] and makes it binary + + Args: + attention_mask (Tensor): The input attention mask + + Returns: + Tensor: The extended binary attention mask + """ + # We create a 3D attention mask from a 2D tensor mask. + # [b, 1, s] + attention_mask_b1s = attention_mask.unsqueeze(1) + # [b, s, 1] + attention_mask_bs1 = attention_mask.unsqueeze(2) + # [b, s, s] + attention_mask_bss = attention_mask_b1s * attention_mask_bs1 + # [b, 1, s, s] + extended_attention_mask = attention_mask_bss.unsqueeze(1) + + # Convert attention mask to binary: + extended_attention_mask = extended_attention_mask < 0.5 + + return extended_attention_mask + + def bert_position_ids(self, token_ids): + # Create position ids + seq_length = token_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=token_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(token_ids) + + return position_ids + + def set_input_tensor(self, input_tensor: Tensor) -> None: + """Sets input tensor to the model. + + See megatron.model.transformer.set_input_tensor() + + Args: + input_tensor (Tensor): Sets the input tensor for the model. + """ + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt/bert' + self.encoder.set_input_tensor(input_tensor[0]) + + def forward( + self, + input_ids: Tensor, + attention_mask: Tensor, + tokentype_ids: Tensor = None, + lm_labels: Tensor = None, + inference_params=None, + ): + """Forward function of BERT model + + Forward function of the BERT Model This function passes the input tensors + through the embedding layer, and then the encoder and finally into the post + processing layer (optional). + + It either returns the Loss values if labels are given or the final hidden units + """ + extended_attention_mask = self.bert_extended_attention_mask(attention_mask) + + if parallel_state.is_pipeline_first_stage(): + input_ids = input_ids + position_ids = self.bert_position_ids(input_ids) + else: + position_ids = None + input_ids = None + + # Encoder embedding. + if self.pre_process: + encoder_input = self.embedding( + input_ids=input_ids, position_ids=position_ids, tokentype_ids=tokentype_ids + ) + else: + # intermediate stage of pipeline + # encoder will get hidden_states from encoder.input_tensor + encoder_input = None + + # Rotary positional embeddings (Why not move this into BERT/GPTEmberdding ?) + rotary_pos_emb = None + if self.position_embedding_type == 'rope': + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( + inference_params, self.encoder, encoder_input, self.config + ) + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run encoder. + hidden_states = self.encoder( + hidden_states=encoder_input, + attention_mask=extended_attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + if not self.post_process: + return hidden_states + + if self.add_binary_head: + pooled_output = self.pooler(hidden_states, 0) + + if self.return_embeddings: + embeddings = torch.transpose(hidden_states, 0, 1) + masks = torch.sum(attention_mask, dim=1) + # Collect masked embeddings. + output = torch.zeros( + size=(embeddings.shape[0], embeddings.shape[2]), + dtype=torch.float32, + device=torch.cuda.current_device(), + ) + for i, (embedding, mask) in enumerate(zip(embeddings, masks)): + output[i, :] = torch.mean(embedding[1 : mask - 1], dim=0) + return output + + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() + + hidden_states_after_lm_head = self.lm_head(hidden_states=hidden_states) + logits, _ = self.output_layer(hidden_states_after_lm_head, weight=output_weight) + + binary_logits = None + if self.binary_head is not None: + binary_logits = self.binary_head(pooled_output) + + if lm_labels is None: + # [s b h] => [b s h] + return logits.transpose(0, 1).contiguous(), binary_logits + + loss = self.compute_language_model_loss(lm_labels, logits) + + return loss, binary_logits diff --git a/megatron/core/models/bert/pooler.py b/megatron/core/models/bert/pooler.py new file mode 100644 index 0000000..c144d8c --- /dev/null +++ b/megatron/core/models/bert/pooler.py @@ -0,0 +1,51 @@ +import torch +from torch import Tensor + +from megatron.core import tensor_parallel +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import get_linear_layer + + +class Pooler(MegatronModule): + """Pooler layer. + + Pool hidden states of a specific token (for example start of the + sequence) and add a linear transformation followed by a tanh. + + Args: + hidden_size (int): The hidden size_ + init_method (callable): weight initialization method for the linear layer. bias is set to zero. + config (TransformerConfig): The transformer configuration + sequence_parallel (bool): Using squence parallel ? Defaults to False + """ + + def __init__( + self, + hidden_size: int, + init_method: callable, + config: TransformerConfig, + sequence_parallel: bool = False, + ): + super(Pooler, self).__init__(config) + # TODO: Shoudl switch this to TE ? + self.dense = get_linear_layer( + hidden_size, hidden_size, init_method, config.perform_initialization + ) + self.sequence_parallel = sequence_parallel + + def forward(self, hidden_states: Tensor, sequence_index=0): + # hidden_states: [s, b, h] + # sequence_index: index of the token to pool. + + # gather data along sequence dimensions + # same pooler is run on all tensor parallel nodes + if self.sequence_parallel: + hidden_states = tensor_parallel.gather_from_sequence_parallel_region( + hidden_states, tensor_parallel_output_grad=False + ) + + pooled = hidden_states[sequence_index, :, :] + pooled = self.dense(pooled) + pooled = torch.tanh(pooled) + return pooled diff --git a/megatron/core/models/common/__init__.py b/megatron/core/models/common/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/models/common/embeddings/__init__.py b/megatron/core/models/common/embeddings/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/models/common/embeddings/language_model_embedding.py b/megatron/core/models/common/embeddings/language_model_embedding.py new file mode 100644 index 0000000..bc1a2de --- /dev/null +++ b/megatron/core/models/common/embeddings/language_model_embedding.py @@ -0,0 +1,137 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from typing import Literal + +import torch +from torch import Tensor + +from megatron.core import tensor_parallel +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig + + +class LanguageModelEmbedding(MegatronModule): + """Language model embeddings. + + Args: + config (TransformerConfig): config object with all necessary configs for TransformerBlock + vocab_size (int): vocabulary size + max_sequence_length (int): maximum size of sequence. This + is used for positional embedding + add_position_embedding (bool): Add a position embedding. + embedding_dropout_prob (float): dropout probability for embeddings + num_tokentypes (int): Set to 0 without binary head, and 2 with a binary head . Defaults to 0. + """ + + def __init__( + self, + config: TransformerConfig, + vocab_size: int, + max_sequence_length: int, + position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute', + num_tokentypes: int = 0, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + self.vocab_size: int = vocab_size + self.max_sequence_length: int = max_sequence_length + self.add_position_embedding: bool = position_embedding_type == 'learned_absolute' + self.num_tokentypes = num_tokentypes + self.reduce_scatter_embeddings = ( + (not self.add_position_embedding) + and self.num_tokentypes <= 0 + and self.config.sequence_parallel + ) + + # Word embeddings (parallel). + self.word_embeddings = tensor_parallel.VocabParallelEmbedding( + num_embeddings=self.vocab_size, + embedding_dim=self.config.hidden_size, + init_method=self.config.init_method, + reduce_scatter_embeddings=self.reduce_scatter_embeddings, + config=self.config, + ) + + # Position embedding (serial). + if self.add_position_embedding: + self.position_embeddings = torch.nn.Embedding( + self.max_sequence_length, self.config.hidden_size + ) + + # Initialize the position embeddings. + if self.config.perform_initialization: + self.config.init_method(self.position_embeddings.weight) + + if self.num_tokentypes > 0: + self.tokentype_embeddings = torch.nn.Embedding( + self.num_tokentypes, self.config.hidden_size + ) + # Initialize the token-type embeddings. + if self.config.perform_initialization: + self.config.init_method(self.tokentype_embeddings.weight) + else: + self.tokentype_embeddings = None + + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(self.config.hidden_dropout) + + def zero_parameters(self): + """Zero out all parameters in embedding.""" + self.word_embeddings.weight.data.fill_(0) + self.word_embeddings.weight.shared = True + self.position_embeddings.weight.data.fill_(0) + self.position_embeddings.weight.shared = True + if self.num_tokentypes > 0: + self.tokentype_embeddings.weight.data.fill_(0) + self.tokentype_embeddings.weight.shared = True + + def forward(self, input_ids: Tensor, position_ids: Tensor, tokentype_ids: int = None) -> Tensor: + """Forward pass of the embedding module. + + Args: + input_ids (Tensor): The input tokens + position_ids (Tensor): The position id's used to calculate position embeddings + tokentype_ids (int): The token type ids. Used when args.bert_binary_head is set to True. Defaults to None + + Returns: + Tensor: The output embeddings + """ + word_embeddings = self.word_embeddings(input_ids) + if self.add_position_embedding: + position_embeddings = self.position_embeddings(position_ids) + embeddings = word_embeddings + position_embeddings + else: + embeddings = word_embeddings + + if not self.reduce_scatter_embeddings: + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + + if tokentype_ids is not None: + assert self.tokentype_embeddings is not None + # [b s h] -> [s b h] (So that it can be added with embeddings) + tokentype_embedding = self.tokentype_embeddings(tokentype_ids).permute(1, 0, 2) + embeddings = embeddings + tokentype_embedding + else: + assert self.tokentype_embeddings is None + + # If the input flag for fp32 residual connection is set, convert for float. + if self.config.fp32_residual_connection: + embeddings = embeddings.float() + + # Dropout. + if self.config.sequence_parallel: + if not self.reduce_scatter_embeddings: + embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings) + # `scatter_to_sequence_parallel_region` returns a view, which prevents + # the original tensor from being garbage collected. Clone to facilitate GC. + # Has a small runtime cost (~0.5%). + if self.config.clone_scatter_output_in_embedding: + embeddings = embeddings.clone() + with tensor_parallel.get_cuda_rng_tracker().fork(): + embeddings = self.embedding_dropout(embeddings) + else: + embeddings = self.embedding_dropout(embeddings) + + return embeddings diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py new file mode 100644 index 0000000..f89d790 --- /dev/null +++ b/megatron/core/models/common/embeddings/rotary_pos_embedding.py @@ -0,0 +1,255 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from megatron.core.transformer.transformer_config import TransformerConfig + from megatron.core.transformer.transformer_block import TransformerBlock + +import logging + +import torch +from torch import Tensor, nn + +from megatron.core import parallel_state + +logger = logging.getLogger(__name__) + +try: + from apex.transformer.functional import ( + fused_apply_rotary_pos_emb, + fused_apply_rotary_pos_emb_thd, + ) + + HAVE_APPLY_ROPE_FUSION = True +except: + HAVE_APPLY_ROPE_FUSION = False + + +__all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb'] + + +def get_pos_emb_on_this_cp_rank(pos_emb, seq_dim): + cp_size = parallel_state.get_context_parallel_world_size() + cp_rank = parallel_state.get_context_parallel_rank() + cp_idx = torch.tensor( + [cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True + ).cuda(non_blocking=True) + pos_emb = pos_emb.view( + *pos_emb.shape[:seq_dim], 2 * cp_size, -1, *pos_emb.shape[(seq_dim + 1) :] + ) + pos_emb = pos_emb.index_select(seq_dim, cp_idx) + pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :]) + return pos_emb + + +class RotaryEmbedding(nn.Module): + """Rotary Embedding for language model. + + Args: + kv_channels (int): Projection weights dimension in multi-head attention. Obtained from transformer config + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE for longer sequences. The value must be a float larger than 1.0. Defaults to None + rotary_base (int, optional): Base period for rotary position embeddings. Defaults to 10000. + use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly on the GPU. Defaults to False + """ + + def __init__( + self, + kv_channels: int, + rotary_percent: float, + rotary_interleaved: bool = False, + seq_len_interpolation_factor: float = None, + rotary_base: int = 10000, + use_cpu_initialization: bool = False, + ) -> None: + super().__init__() + + dim = kv_channels + if rotary_percent < 1.0: + dim = int(dim * rotary_percent) + self.rotary_interleaved = rotary_interleaved + + self.seq_len_interpolation_factor = seq_len_interpolation_factor + device = 'cpu' if use_cpu_initialization else torch.cuda.current_device() + self.inv_freq = 1.0 / ( + rotary_base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim) + ) + + def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: + """Forward pass of RoPE embedding. + + Args: + max_seq_len (int): Maximum size of sequence + offset (int, optional): _description_. Defaults to 0. + + Returns: + Tensor: Embeddings after applying RoPE. + """ + if self.inv_freq.device.type == 'cpu': + # move `inv_freq` to GPU once at the first micro-batch forward pass + self.inv_freq = self.inv_freq.to(device=torch.cuda.current_device()) + seq = ( + torch.arange(max_seq_len, device=self.inv_freq.device, dtype=self.inv_freq.dtype) + + offset + ) + + if self.seq_len_interpolation_factor is not None: + seq *= 1 / self.seq_len_interpolation_factor + + freqs = torch.outer(seq, self.inv_freq) + # first part even vector components, second part odd vector components, + # 2 * dim in dimension size + if not self.rotary_interleaved: + emb = torch.cat((freqs, freqs), dim=-1) + else: + emb = torch.stack((freqs.view(-1, 1), freqs.view(-1, 1)), dim=-1).view( + freqs.shape[0], -1 + ) + # emb [seq_length, .., dim] + emb = emb[:, None, None, :] + if parallel_state.get_context_parallel_world_size() > 1: + # slice rotary_pos_emb along sequence dimension and select the parition of the current CP rank + emb = get_pos_emb_on_this_cp_rank(emb, 0) + return emb + + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + state_dict.pop(f'{prefix}inv_freq', None) + return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + def get_rotary_seq_len( + self, + inference_params, + transformer: TransformerBlock, + transformer_input: Tensor, + transformer_config: TransformerConfig, + ) -> float: + """Function to get the rotary sequence length. + + Args: + inference_params : Used during Inference time + transformer (TransformerBlock): The transformer block (decoder/encoder) used by the model + transformer_input (Tensor): _description_ + transformer_config (TransformerConfig): Transformer config used by the model + + Returns: + float: The rotary sequence length + """ + if inference_params is not None: + rotary_seq_len = inference_params.max_sequence_length + else: + if transformer.input_tensor is not None: + rotary_seq_len = transformer.input_tensor.size(0) + else: + rotary_seq_len = transformer_input.size(0) + + if transformer_config.sequence_parallel: + rotary_seq_len *= transformer_config.tensor_model_parallel_size + + rotary_seq_len *= transformer_config.context_parallel_size + + return rotary_seq_len + + +def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor: + """Change sign so the last dimension becomes [-odd, +even] + + Args: + x (Tensor): Input tensor + + Returns: + Tensor: Tensor rotated half + """ + if not rotary_interleaved: + x1, x2 = torch.chunk(x, 2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + else: + x1 = x[:, :, :, ::2] + x2 = x[:, :, :, 1::2] + x_new = torch.stack((-x2, x1), dim=-1) + return x_new.view(x_new.shape[0], x_new.shape[1], x_new.shape[2], -1) + + +def apply_rotary_pos_emb_bshd(t: Tensor, freqs: Tensor, rotary_interleaved: bool = False) -> Tensor: + """Apply rotary positional embedding to input tensor T. + + check https://kexue.fm/archives/8265 for detailed formulas + + Args: + t (Tensor): Input tensor T is of shape [seq_length, ... , dim] + freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim] + + Returns: + Tensor: The input tensor after applying RoPE + """ + rot_dim = freqs.shape[-1] + + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t, t_pass = t[..., :rot_dim], t[..., rot_dim:] + + # first part is cosine component + # second part is sine component, need to change signs with _rotate_half method + cos_ = torch.cos(freqs).to(t.dtype) + sin_ = torch.sin(freqs).to(t.dtype) + + t = (t * cos_) + (_rotate_half(t, rotary_interleaved) * sin_) + return torch.cat((t, t_pass), dim=-1) + + +def apply_rotary_pos_emb_thd( + t: Tensor, cu_seqlens: Tensor, freqs: Tensor, rotary_interleaved: bool = False +) -> Tensor: + """A baseline implementation of applying RoPE for `thd` format. + + Args: + t (Tensor): Input tensor T is of shape [t, h, d] + cu_seqlens(Tensor): Cumulative sum of sequence lengths in a batch for `t`, + with shape [b + 1] and dtype torch.int32. + freqs (Tensor): Rotary Positional embedding tensor freq is of shape [max_s, 1, 1, d] + + Returns: + Tensor: Shape [t, h, d]. The input tensor after applying RoPE. + """ + + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return torch.cat( + [ + apply_rotary_pos_emb_bshd(x.unsqueeze(1), freqs[: x.size(0)]) + for x in torch.split(t, seqlens) + ] + ).squeeze(1) + + +def apply_rotary_pos_emb( + t: Tensor, + freqs: Tensor, + config: TransformerConfig, + cu_seqlens: Optional[Tensor] = None, +): + """ + Reroute to the appropriate apply_rotary_pos_emb function depending on + fused/unfused kernels, or bshd (conventional) / thd (packed seq) format + """ + if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION: + # setting apply_rope_fusion in config to False so that subsequent queries to this config also return False + config.apply_rope_fusion = False + if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False): + logger.warning( + "Setting apply_rope_fusion to false because its implementation" + " is not included in Apex. Try upgrading to the latest version" + ) + apply_rotary_pos_emb.printed_fused_warning = True + if config.apply_rope_fusion: + if cu_seqlens is None: + return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) + else: + return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) + else: + if cu_seqlens is None: + return apply_rotary_pos_emb_bshd(t, freqs, rotary_interleaved=config.rotary_interleaved) + else: + return apply_rotary_pos_emb_thd( + t, cu_seqlens, freqs, rotary_interleaved=config.rotary_interleaved + ) diff --git a/megatron/core/models/common/language_module/__init__.py b/megatron/core/models/common/language_module/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py new file mode 100644 index 0000000..fcd683c --- /dev/null +++ b/megatron/core/models/common/language_module/language_module.py @@ -0,0 +1,204 @@ +import logging +from typing import Optional, Tuple + +import torch +from torch import Tensor + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint + + +class LanguageModule(MegatronModule): + """Base language module that has common helper functions used across GPT, BERT etc. + + Args: + config (TransformerConfig): Input transformer config for the model + """ + + def __init__(self, config: TransformerConfig) -> None: + super().__init__(config=config) + + def compute_language_model_loss(self, labels: Tensor, logits: Tensor) -> Tensor: + """Computes the language model loss (Cross entropy across vocabulary) + + Args: + labels (Tensor): The labels of dimension [batch size, seq length] + logits (Tensor): The final logits returned by the output layer of the transformer model + + Returns: + Tensor: Loss tensor of dimensions [batch size, sequence_length] + """ + # [b s] => [s b] + labels = labels.transpose(0, 1).contiguous() + if self.config.cross_entropy_loss_fusion: + loss = fused_vocab_parallel_cross_entropy(logits, labels) + else: + loss = tensor_parallel.vocab_parallel_cross_entropy(logits, labels) + + # [s b] => [b, s] + loss = loss.transpose(0, 1).contiguous() + return loss + + def setup_embeddings_and_output_layer(self) -> None: + """Sets up embedding layer in first stage and output layer in last stage. + + This function initalizes word embeddings in the final stage when we are + using pipeline parallelism and sharing word embeddings, and sets up param + attributes on the embedding and output layers. + """ + + # Set `is_embedding_or_output_parameter` attribute. + if self.pre_process: + self.embedding.word_embeddings.weight.is_embedding_or_output_parameter = True + if self.post_process and self.output_layer.weight is not None: + self.output_layer.weight.is_embedding_or_output_parameter = True + + if not self.share_embeddings_and_output_weights: + return + + if self.pre_process and self.post_process: + # Zero out wgrad if sharing embeddings between two layers on same + # pipeline stage to make sure grad accumulation into main_grad is + # correct and does not include garbage values (e.g., from torch.empty). + self.shared_embedding_or_output_weight().zero_out_wgrad = True + return + + if self.pre_process and not self.post_process: + assert parallel_state.is_pipeline_first_stage() + self.shared_embedding_or_output_weight().shared_embedding = True + + if self.post_process and not self.pre_process: + assert not parallel_state.is_pipeline_first_stage() + # set word_embeddings weights to 0 here, then copy first + # stage's weights using all_reduce below. + self.output_layer.weight.data.fill_(0) + self.output_layer.weight.shared = True + self.output_layer.weight.shared_embedding = True + + # Parameters are shared between the word embeddings layers, and the + # heads at the end of the model. In a pipelined setup with more than + # one stage, the initial embedding layer and the head are on different + # workers, so we do the following: + # 1. Create a second copy of word_embeddings on the last stage, with + # initial parameters of 0.0. + # 2. Do an all-reduce between the first and last stage to ensure that + # the two copies of word_embeddings start off with the same + # parameter values. + # 3. In the training loop, before an all-reduce between the grads of + # the two word_embeddings layers to ensure that every applied weight + # update is the same on both stages. + + # Ensure that first and last stages have the same initial parameter + # values. + if torch.distributed.is_initialized(): + if parallel_state.is_rank_in_embedding_group(): + weight = self.shared_embedding_or_output_weight() + weight.data = weight.data.cuda() + torch.distributed.all_reduce( + weight.data, group=parallel_state.get_embedding_group() + ) + + elif not getattr(LanguageModule, "embedding_warning_printed", False): + logging.getLogger(__name__).warning( + "Distributed processes aren't initialized, so the output layer " + "is not initialized with weights from the word embeddings. " + "If you are just manipulating a model this is fine, but " + "this needs to be handled manually. If you are training " + "something is definitely wrong." + ) + LanguageModule.embedding_warning_printed = True + + def shared_embedding_or_output_weight(self) -> Tensor: + """Gets the emedding weight or output logit weights when share embedding and output weights set to True. + + Returns: + Tensor: During pre processing it returns the input embeddings weight while during post processing it returns the final output layers weight + """ + if self.pre_process: + return self.embedding.word_embeddings.weight + elif self.post_process: + return self.output_layer.weight + return None + + def sharded_state_dict( + self, + prefix: str = '', + sharded_offsets: Tuple[Tuple[int, int, int]] = (), + metadata: Optional[dict] = None, + ) -> ShardedStateDict: + """ Sharded state dict implementation that handles the output layer weights tying. + + Args: + prefix (str): Module name prefix. + sharded_offsets (tuple): PP related offsets, expected to be empty at this module level. + metadata (Optional[Dict]): metadata controlling sharded state dict creation. + + Returns: + ShardedStateDict: sharded state dict for the LanguageModel + """ + assert not sharded_offsets, "Unexpected sharded offsets" + sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata) + + first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' + output_layer_weight_key = f'{prefix}output_layer.weight' + output_layer_bias_key = f'{prefix}output_layer.bias' + + if self.share_embeddings_and_output_weights: + self.tie_embeddings_and_output_weights_state_dict( + sharded_state_dict, output_layer_weight_key, first_stage_word_emb_key + ) + elif self.post_process: + # Make sure the output layer follows the embeddings padding logic + sharded_state_dict[output_layer_weight_key].allow_shape_mismatch = True + + # Regardless of sharing the output weights with embeddings, we must handle the bias padding + if self.post_process and output_layer_bias_key in sharded_state_dict: + sharded_state_dict[output_layer_bias_key].allow_shape_mismatch = True + + return sharded_state_dict + + def tie_embeddings_and_output_weights_state_dict( + self, + sharded_state_dict: ShardedStateDict, + output_layer_weight_key: str, + first_stage_word_emb_key: str, + ) -> None: + """Ties the embedding and output weights in a given sharded state dict. + + Args: + sharded_state_dict (ShardedStateDict): state dict with the weight to tie + output_layer_weight_key (str): key of the output layer weight in the state dict. + This entry will be replaced with a tied version + first_stage_word_emb_key (str): this must be the same as the + ShardedTensor.key of the first stage word embeddings. + + Returns: None, acts in-place + """ + if not self.post_process: + # No output layer + assert output_layer_weight_key not in sharded_state_dict, sharded_state_dict.keys() + return + + if self.pre_process: + # Output layer is equivalent to the embedding already + return + + # Replace the default output layer with a one sharing the weights with the embedding + del sharded_state_dict[output_layer_weight_key] + tensor = self.shared_embedding_or_output_weight() + last_stage_word_emb_replica_id = ( + 1, # copy of first stage embedding + 0, + parallel_state.get_data_parallel_rank(with_context_parallel=True), + ) + + sharded_state_dict[output_layer_weight_key] = make_tp_sharded_tensor_for_checkpoint( + tensor=tensor, + key=first_stage_word_emb_key, + replica_id=last_stage_word_emb_replica_id, + allow_shape_mismatch=True, + ) diff --git a/megatron/core/models/common/vision_module/__init__.py b/megatron/core/models/common/vision_module/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/models/common/vision_module/vision_module.py b/megatron/core/models/common/vision_module/vision_module.py new file mode 100644 index 0000000..5dc5187 --- /dev/null +++ b/megatron/core/models/common/vision_module/vision_module.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Megatron Vision Module.""" + +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig + + +# Note: This is only a stub at the moment. This will be expanded in follow-up changes. +class VisionModule(MegatronModule): + """Base vision module that has common helper functions used across CLIP, ViT, etc. + + Args: + config (TransformerConfig): Input transformer config for the model + """ + + def __init__(self, config: TransformerConfig) -> None: + super().__init__(config=config) diff --git a/megatron/core/models/gpt/__init__.py b/megatron/core/models/gpt/__init__.py new file mode 100644 index 0000000..2d5eb86 --- /dev/null +++ b/megatron/core/models/gpt/__init__.py @@ -0,0 +1 @@ +from .gpt_model import GPTModel diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py new file mode 100644 index 0000000..726b6fb --- /dev/null +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -0,0 +1,141 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.moe.moe_layer import MoELayer +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_block import TransformerBlockSubmodules +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +try: + from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelGroupedLinear, + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TENorm, + TERowParallelGroupedLinear, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +try: + import apex + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + + +# Use this spec to use lower level Transformer Engine modules (required for fp8 training) +def get_gpt_layer_with_transformer_engine_spec( + num_experts: int = None, moe_grouped_gemm: bool = False, qk_layernorm: bool = False +) -> ModuleSpec: + mlp = _get_mlp_module_spec( + use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm + ) + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + # TENorm significantly harms convergence when used + # for QKLayerNorm; we instead use the Apex implementation. + q_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + k_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + + +# Use this spec for an implementation using only modules in megatron core +def get_gpt_layer_local_spec( + num_experts: int = None, moe_grouped_gemm: bool = False, qk_layernorm: bool = False +) -> ModuleSpec: + mlp = _get_mlp_module_spec( + use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm + ) + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=LNImpl if qk_layernorm else IdentityOp, + k_layernorm=LNImpl if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ) + + +# Helper function to get module spec for MLP/MoE +def _get_mlp_module_spec( + use_te: bool = True, num_experts: int = None, moe_grouped_gemm: bool = False +) -> ModuleSpec: + if num_experts is None: + # Dense MLP w/ or w/o TE modules. + return ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, + ), + ) + else: + # Mixture of experts with modules in megatron core. + if use_te and moe_grouped_gemm: + linear_fc1 = TEColumnParallelGroupedLinear + linear_fc2 = TERowParallelGroupedLinear + else: + linear_fc1 = ColumnParallelLinear + linear_fc2 = RowParallelLinear + + use_te_grouped_gemm = use_te and TEColumnParallelGroupedLinear is not None + + return ModuleSpec( + module=MoELayer, + submodules=( + MLPSubmodules(linear_fc1=linear_fc1, linear_fc2=linear_fc2) + if not moe_grouped_gemm or use_te_grouped_gemm + else None + ), + ) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py new file mode 100644 index 0000000..bf372e0 --- /dev/null +++ b/megatron/core/models/gpt/gpt_model.py @@ -0,0 +1,240 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import logging +from typing import Dict, Literal, Optional, Tuple, Union + +import torch +from torch import Tensor + +from megatron.core import InferenceParams, parallel_state, tensor_parallel +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.common.language_module.language_module import LanguageModule +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.transformer.enums import AttnMaskType, ModelType +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint + + +class GPTModel(LanguageModule): + """GPT Transformer language model. + + Args: + config (TransformerConfig): Transformer config + transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers + vocab_size (int): Vocabulary size + max_sequence_length (int): maximum size of sequence. This is used for positional embedding + pre_process (bool, optional): Include embedding layer (used with pipeline parallelism). Defaults to True. + post_process (bool, optional): Include an output layer (used with pipeline parallelism). Defaults to True. + fp16_lm_cross_entropy (bool, optional): Defaults to False. + parallel_output (bool, optional): Do not gather the outputs, keep them split across tensor parallel ranks. Defaults to True. + share_embeddings_and_output_weights (bool, optional): When True, input embeddings and output logit weights are shared. Defaults to False. + position_embedding_type (Literal[learned_absolute,rope], optional): Position embedding type.. Defaults to 'learned_absolute'. + rotary_percent (float, optional): Percent of rotary dimension to use for rotary position embeddings. Ignored unless position_embedding_type is 'rope'. Defaults to 1.0. + rotary_base (int, optional): Base period for rotary position embeddings. Ignored unless position_embedding_type is 'rope'. Defaults to 10000. + seq_len_interpolation_factor (Optional[float], optional): scale of linearly interpolating RoPE for longer sequences. The value must be a float larger than 1.0. Defaults to None. + """ + + def __init__( + self, + config: TransformerConfig, + transformer_layer_spec: ModuleSpec, + vocab_size: int, + max_sequence_length: int, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute', + rotary_percent: float = 1.0, + rotary_base: int = 10000, + seq_len_interpolation_factor: Optional[float] = None, + ) -> None: + super().__init__(config=config) + + self.transformer_layer_spec: ModuleSpec = transformer_layer_spec + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + + # megatron core pipelining currently depends on model type + # TODO: remove this dependency ? + self.model_type = ModelType.encoder_or_decoder + + # These 2 attributes are needed for TensorRT-LLM export. + self.max_position_embeddings = max_sequence_length + self.rotary_percent = rotary_percent + + if self.pre_process: + self.embedding = LanguageModelEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + position_embedding_type=position_embedding_type, + ) + + if self.position_embedding_type == 'rope': + self.rotary_pos_emb = RotaryEmbedding( + kv_channels=self.config.kv_channels, + rotary_percent=rotary_percent, + rotary_interleaved=self.config.rotary_interleaved, + seq_len_interpolation_factor=seq_len_interpolation_factor, + rotary_base=rotary_base, + use_cpu_initialization=self.config.use_cpu_initialization, + ) + + # Transformer. + self.decoder = TransformerBlock( + config=self.config, + spec=transformer_layer_spec, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + # Output + if post_process: + if self.config.defer_embedding_wgrad_compute: + # The embedding activation buffer preserves a reference to the input activations + # of the final embedding projection layer GEMM. It will hold the activations for + # all the micro-batches of a global batch for the last pipeline stage. Once we are + # done with all the back props for all the microbatches for the last pipeline stage, + # it will be in the pipeline flush stage. During this pipeline flush we use the + # input activations stored in embedding activation buffer and gradient outputs stored + # in gradient buffer to calculate the weight gradients for the embedding final linear layer. + self.embedding_activation_buffer = [] + self.grad_output_buffer = [] + else: + self.embedding_activation_buffer = None + self.grad_output_buffer = None + + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + self.vocab_size, + config=config, + init_method=config.init_method, + bias=False, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=self.pre_process + and self.share_embeddings_and_output_weights, + embedding_activation_buffer=self.embedding_activation_buffer, + grad_output_buffer=self.grad_output_buffer, + ) + + if self.pre_process or self.post_process: + self.setup_embeddings_and_output_layer() + + def set_input_tensor(self, input_tensor: Tensor) -> None: + """Sets input tensor to the model. + + See megatron.model.transformer.set_input_tensor() + + Args: + input_tensor (Tensor): Sets the input tensor for the model. + """ + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt/bert' + self.decoder.set_input_tensor(input_tensor[0]) + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + decoder_input: Tensor = None, + labels: Tensor = None, + inference_params: InferenceParams = None, + packed_seq_params: PackedSeqParams = None, + extra_block_kwargs: dict = None, + ) -> Tensor: + """Forward function of the GPT Model This function passes the input tensors + through the embedding layer, and then the decoeder and finally into the post + processing layer (optional). + + It either returns the Loss values if labels are given or the final hidden units + """ + # If decoder_input is provided (not None), then input_ids and position_ids are ignored. + # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. + + # Decoder embedding. + if decoder_input is not None: + pass + elif self.pre_process: + decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) + else: + # intermediate stage of pipeline + # decoder will get hidden_states from encoder.input_tensor + decoder_input = None + + # Rotary positional embeddings (embedding is None for PP intermediate devices) + rotary_pos_emb = None + if self.position_embedding_type == 'rope': + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( + inference_params, self.decoder, decoder_input, self.config + ) + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run decoder. + hidden_states = self.decoder( + hidden_states=decoder_input, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + packed_seq_params=packed_seq_params, + **(extra_block_kwargs or {}), + ) + + if not self.post_process: + return hidden_states + + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() + logits, _ = self.output_layer(hidden_states, weight=output_weight) + + if labels is None: + # [s b h] => [b s h] + return logits.transpose(0, 1).contiguous() + + loss = self.compute_language_model_loss(labels, logits) + + return loss + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[Dict] = None + ) -> ShardedStateDict: + """Sharded state dict implementation for GPTModel backward-compatibility (removing extra state). + + Args: + prefix (str): Module name prefix. + sharded_offsets (tuple): PP related offsets, expected to be empty at this module level. + metadata (Optional[Dict]): metadata controlling sharded state dict creation. + + Returns: + ShardedStateDict: sharded state dict for the GPTModel + """ + sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata) + output_layer_extra_state_key = f'{prefix}output_layer._extra_state' + + # Old GPT checkpoints only stored the output layer weight key. So we remove the _extra_state key + # but check that it doesn't contain any data anyway + output_extra_state = sharded_state_dict.pop(output_layer_extra_state_key, None) + assert not ( + output_extra_state and output_extra_state.data + ), f'Expected output layer extra state to be empty, got: {output_extra_state}' + + return sharded_state_dict diff --git a/megatron/core/models/mamba/__init__.py b/megatron/core/models/mamba/__init__.py new file mode 100644 index 0000000..f09944d --- /dev/null +++ b/megatron/core/models/mamba/__init__.py @@ -0,0 +1 @@ +from .mamba_model import MambaModel diff --git a/megatron/core/models/mamba/mamba_layer_specs.py b/megatron/core/models/mamba/mamba_layer_specs.py new file mode 100644 index 0000000..91224bf --- /dev/null +++ b/megatron/core/models/mamba/mamba_layer_specs.py @@ -0,0 +1,69 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules +from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules +from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +mamba_stack_spec = ModuleSpec( + module=MambaStack, + submodules=MambaStackSubmodules( + mamba_layer=ModuleSpec( + module=MambaLayer, + submodules=MambaLayerSubmodules( + mixer=ModuleSpec( + module=MambaMixer, + submodules=MambaMixerSubmodules( + in_proj=TELayerNormColumnParallelLinear, + out_proj=TERowParallelLinear, + ), + ), + mamba_bda=get_bias_dropout_add, + ), + ), + # Started with spec from gpt_layer_specs.py (with MLP removed) + # Using the TE spec because we had problems getting the non-TE spec + # working + attention_layer=ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + self_attn_bda=get_bias_dropout_add, + ), + ), + # Started with spec from gpt_layer_specs.py + # Using the TE spec because we had problems getting the non-TE spec + # working + mlp_layer=ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + mlp_bda=get_bias_dropout_add, + ), + ), + ), +) diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py new file mode 100644 index 0000000..50c4b87 --- /dev/null +++ b/megatron/core/models/mamba/mamba_model.py @@ -0,0 +1,210 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from typing import Literal, Optional + +from torch import Tensor + +from megatron.core import InferenceParams, tensor_parallel +from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.common.language_module.language_module import LanguageModule +from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + + +class MambaModel(LanguageModule): + """Mamba language model. + + Args: + config (TransformerConfig): Transformer config + mamba_stack_spec (ModuleSpec): Specifies the modules to use for the various layer types + vocab_size (int): Vocabulary size + max_sequence_length (int): maximum size of sequence. This is used for positional embedding + pre_process (bool, optional): Include embedding layer (used with pipeline parallelism). Defaults to True. + mamba_ssm_ngroups (int, optional): Specifies the number of groups to use. The default value is 8, as in the NVIDIA Mamba2 (pure and hybrid) 8b. However, in the original Mamba2 paper, the checkpoints use a setting of 1. Defaults to 8. + hybrid_attention_ratio (float, optional): The target ratio of attention layers to total layers + hybrid_mlp_ratio (float, optional): The target ratio of mlp layers to total layers + hybrid_override_pattern (str, optional): The hybrid layer pattern to override with + post_process (bool, optional): Include an output layer (used with pipeline parallelism). Defaults to True. + fp16_lm_cross_entropy (bool, optional): Defaults to False. + parallel_output (bool, optional): Do not gather the outputs, keep them split across tensor parallel ranks. Defaults to True. + share_embeddings_and_output_weights (bool, optional): When True, input embeddings and output logit weights are shared. Defaults to False. + position_embedding_type (Literal[learned_absolute,rope,none], optional): Position embedding type. Defaults to 'none'. + rotary_percent (float, optional): Percent of rotary dimension to use for rotary position embeddings. Ignored unless position_embedding_type is 'rope'. Defaults to 1.0. + rotary_base (int, optional): Base period for rotary position embeddings. Ignored unless position_embedding_type is 'rope'. Defaults to 10000. + seq_len_interpolation_factor (Optional[float], optional): scale of linearly interpolating RoPE for longer sequences. The value must be a float larger than 1.0. Defaults to None. + """ + + def __init__( + self, + config: TransformerConfig, + mamba_stack_spec: ModuleSpec, + vocab_size: int, + max_sequence_length: int, + mamba_ssm_ngroups: int = 8, + pre_process: bool = True, + hybrid_attention_ratio: float = 0.0, + hybrid_mlp_ratio: float = 0.0, + hybrid_override_pattern: str = None, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + # Mamba with no attention has no need for position embeddings, so none is default + position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'none', + rotary_percent: float = 1.0, + rotary_base: int = 10000, + seq_len_interpolation_factor: Optional[float] = None, + ) -> None: + super().__init__(config=config) + + self.mamba_stack_spec: ModuleSpec = mamba_stack_spec + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.mamba_ssm_ngroups = mamba_ssm_ngroups + self.pre_process = pre_process + self.hybrid_attention_ratio = hybrid_attention_ratio + self.hybrid_mlp_ratio = hybrid_mlp_ratio + self.hybrid_override_pattern = hybrid_override_pattern + self.post_process = post_process + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + + # megatron core pipelining currently depends on model type + # TODO: remove this dependency ? + self.model_type = ModelType.encoder_or_decoder + + if self.pre_process: + self.embedding = LanguageModelEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + position_embedding_type=position_embedding_type, + ) + + if self.position_embedding_type == 'rope': + self.rotary_pos_emb = RotaryEmbedding( + kv_channels=self.config.kv_channels, + rotary_percent=rotary_percent, + seq_len_interpolation_factor=seq_len_interpolation_factor, + rotary_base=rotary_base, + use_cpu_initialization=self.config.use_cpu_initialization, + ) + + self.decoder = build_module( + mamba_stack_spec, + self.config, + mamba_ssm_ngroups=self.mamba_ssm_ngroups, + pre_process=self.pre_process, + hybrid_attention_ratio=self.hybrid_attention_ratio, + hybrid_mlp_ratio=self.hybrid_mlp_ratio, + hybrid_override_pattern=self.hybrid_override_pattern, + post_process=self.post_process, + dtype=config.params_dtype, + ) + + # Output + if post_process: + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + self.vocab_size, + config=config, + init_method=config.init_method, + bias=False, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=self.pre_process + and self.share_embeddings_and_output_weights, + ) + + if self.pre_process or self.post_process: + self.setup_embeddings_and_output_layer() + + def set_input_tensor(self, input_tensor: Tensor) -> None: + """Sets input tensor to the model. + + See megatron.model.transformer.set_input_tensor() + + Args: + input_tensor (Tensor): Sets the input tensor for the model. + """ + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt/bert' + self.decoder.set_input_tensor(input_tensor[0]) + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + decoder_input: Tensor = None, + labels: Tensor = None, + inference_params: InferenceParams = None, + ) -> Tensor: + """Forward function of the Mamba model. This function passes the input tensors + through the embedding layer, and then the decoder and finally into the post + processing layer (optional). + + It either returns the Loss values if labels are given or the final hidden units + """ + # If decoder_input is provided (not None), then input_ids and position_ids are ignored. + # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. + + # Decoder embedding. + if decoder_input is not None: + pass + elif self.pre_process: + decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) + else: + # intermediate stage of pipeline + # decoder will get hidden_states from encoder.input_tensor + decoder_input = None + + rotary_pos_emb = None + if self.position_embedding_type == 'rope': + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( + inference_params, self.decoder, decoder_input, self.config + ) + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # The following assert will currently fail when running inference. + # Commented out for now. + # TODO (duncan/rwaleffe): (1) confirm that the externally-generated + # attention mask is not needed and is ignored by the model in + # inference mode, (2) reduce the size of the externally-generated + # attention mask to prevent CPU OOM (as we did for training), (3) + # force the attention mask passed to the model in inference mode to + # be None, so this assert will succeed. + # assert attention_mask is None, "The attention mask is ignored and should be set to None" + + # Run decoder. + hidden_states = self.decoder( + hidden_states=decoder_input, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + if not self.post_process: + return hidden_states + + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() + logits, _ = self.output_layer(hidden_states, weight=output_weight) + + if labels is None: + # [s b h] => [b s h] + return logits.transpose(0, 1).contiguous() + + loss = self.compute_language_model_loss(labels, logits) + + return loss diff --git a/megatron/core/models/multimodal/__init__.py b/megatron/core/models/multimodal/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py new file mode 100644 index 0000000..17ca173 --- /dev/null +++ b/megatron/core/models/multimodal/llava_model.py @@ -0,0 +1,237 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import logging +from collections import namedtuple +from functools import partial +from typing import List + +import torch + +from megatron.core import InferenceParams, parallel_state +from megatron.core.models.gpt import GPTModel +from megatron.core.models.vision.clip_vit_model import CLIPViTModel +from megatron.core.models.vision.multimodal_projector import MultimodalProjector +from megatron.core.transformer import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_config import TransformerConfig + + +# Note: This is under development and may be missing features. +class LLaVAModel(MegatronModule): + """LLaVA multi-modal model. + + Args: + language_transformer_config (TransformerConfig): Transformer config for the language model. + language_transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers of the language model. + language_vocab_size (int): Language model vocabulary size. + language_max_sequence_length (int): Language model maximum sequence length. This is used for positional embedding. + vision_transformer_config (TransformerConfig): Transformer config for the vision model. + vision_transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers of the vision model. + drop_vision_class_token (bool): Drop vision class token(s) before input to the language model. + vision_projection_config (TransformerConfig): Config for the projection from vision model outputs to language model inputs. + vision_projection_layer_spec (ModuleSpec): Specifies the module to use for the vision projection. + vision_projection_type (str): Type of the vision projection to use. Default is a 2-layer MLP. + allow_missing_vision_projection_checkpoint (bool): Allow vision projection weights to be missing when loading a checkpoint. Default False. + parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks. This is typically True for training and False for inference. + language_position_embedding_type (str): Position embedding type to use in the language model. Default learned absolute. + language_rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings in the language model. Defaults to 1.0. + img_embedding_idx (int): Index in the language_embeddings tensor where image_embeddings should be inserted. Defaults to 0. + """ + + def __init__( + self, + language_transformer_config: TransformerConfig, + language_transformer_layer_spec: ModuleSpec, + language_vocab_size: int, + language_max_sequence_length: int, + vision_transformer_config: TransformerConfig, + vision_transformer_layer_spec: ModuleSpec, + drop_vision_class_token: bool, + vision_projection_config: TransformerConfig, + vision_projection_layer_spec: ModuleSpec, + vision_projection_type: str = "mlp", + allow_missing_vision_projection_checkpoint: bool = False, + parallel_output: bool = True, + language_position_embedding_type: str = 'learned_absolute', + language_rotary_percent: float = 1.0, + language_rotary_base: int = 10000, + img_embedding_idx: int = 0, + ) -> None: + super().__init__(config=language_transformer_config) + + logging.getLogger(__name__).warning( + "LLaVA model is under development and may be missing features." + ) + + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + raise NotImplementedError("pipeline parallelism is not supported in this model yet.") + + self.language_model = GPTModel( + config=language_transformer_config, + transformer_layer_spec=language_transformer_layer_spec, + vocab_size=language_vocab_size, + max_sequence_length=language_max_sequence_length, + parallel_output=parallel_output, + position_embedding_type=language_position_embedding_type, + rotary_percent=language_rotary_percent, + rotary_base=language_rotary_base, + ) + + self.vision_model = CLIPViTModel(vision_transformer_config, vision_transformer_layer_spec) + self._drop_vision_class_token = drop_vision_class_token + + # Map (intermediate) vision model outputs to the language model input dimension. + self.vision_projection = MultimodalProjector( + vision_projection_config, + vision_projection_layer_spec, + vision_projection_type, + vision_transformer_config.hidden_size, # input size to the projection. + ) + + # This allows ignoring missing weights for the vision projection during checkpoint loading. + # This should be disabled by default but can be enabled if your checkpoint contains pretrained + # vision and language models but not the projection from vision model outputs to language model inputs. + if allow_missing_vision_projection_checkpoint: + vision_projection_param_names = [ + f"vision_projection.{name}" for name in self.vision_projection.state_dict().keys() + ] + self.vision_projection.register_load_state_dict_post_hook( + partial(_load_state_dict_hook_ignore_param_names, vision_projection_param_names) + ) + + self.img_embedding_idx = img_embedding_idx + + def set_input_tensor(self, input_tensor: torch.Tensor) -> None: + """Sets input tensor to the model. + + NOTE: Pipeline parallelism is not supported in this model yet. This is just a placeholder implementation. + + Args: + input_tensor (Tensor): Sets the input tensor for the model. + """ + self.vision_model.set_input_tensor(input_tensor) + + def freeze( + self, freeze_language_model: bool, freeze_vision_model: bool, freeze_vision_projection: bool + ): + """Freeze model modules. + + Make specific modules non-trainable by setting requires_grad to False for the module's parameters. + + Args: + freeze_language_model (bool): Freeze the language model module. + freeze_vision_model (bool): Freeze the vision model module. + freeze_vision_projection (bool): Freeze the vision projection module. + """ + modules = [] + if freeze_language_model: + modules.append(self.language_model) + if freeze_vision_model: + modules.append(self.vision_model) + if freeze_vision_projection: + modules.append(self.vision_projection) + + for module in modules: + for param in module.parameters(): + param.requires_grad = False + + def forward( + self, + images: torch.Tensor, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + attention_mask: torch.Tensor, + labels: torch.Tensor = None, + inference_params: InferenceParams = None, + ) -> torch.Tensor: + """Forward function of the LLaVA model. + + Args: + images (torch.Tensor): input image of shape [batch, img_h, img_w]. + input_ids (torch.Tensor): input text ids [batch, text_seq_len]. + position_ids (torch.Tensor): input text position ids [batch, text_seq_len]. + attention_mask (torch.Tensor): attention mask for the language model [batch, 1, combined_seq_len, combined_seq_len]. + labels (torch.Tensor): Optional target text labels [batch, combined_seq_len]. + inference_params (InferenceParams): Inference-time parameters including KV cache. + + Returns: + output (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape [b, s, vocab_size]. + """ + + language_embeddings = self.language_model.embedding( + input_ids=input_ids, position_ids=position_ids + ) # [text_seq_len, b, h_language] + + # If running inference, we can skip image token computation if they were computed already earlier for this sample. + if ( + inference_params is not None + and "image_tokens_count" in inference_params.key_value_memory_dict + ): + combined_embeddings = language_embeddings + else: + image_embeddings = self.vision_model(images) # [b, img_seq_len, h_vision] + + if self._drop_vision_class_token: + image_embeddings = image_embeddings[:, self.vision_model.class_token_len :, :] + + image_embeddings = image_embeddings.permute(1, 0, 2) # [img_seq_len, b, h_vision] + + # map vision model output size to language model input size. + image_embeddings = self.vision_projection( + image_embeddings + ) # [img_seq_len, b, h_vision] + + # If running inference, the language model KV cache will be updated for image token positions. + # Here we store the image tokens sequence length, which can be used as an offset to the KV cache later. + if inference_params is not None: + inference_params.key_value_memory_dict["image_tokens_count"] = ( + image_embeddings.shape[0] + ) + + combined_embeddings = torch.cat( + [ + language_embeddings[: self.img_embedding_idx], + image_embeddings, + language_embeddings[self.img_embedding_idx :], + ], + dim=0, + ) # [combined_seq_len, b, h_language] + + # Embedding is computed above so we can discard input and position ids. + input_ids = None + position_ids = None + + # Note: This returns loss if labels are provided, otherwise logits. + output = self.language_model( + input_ids, + position_ids, + attention_mask, + decoder_input=combined_embeddings, + labels=labels, + inference_params=inference_params, + ) + + return output + + +def _load_state_dict_hook_ignore_param_names( + param_names: List[str], module: torch.nn.Module, incompatible_keys: namedtuple +): + """Hook to ignore missing keys during checkpoint loading. + + By default, this should not be used to avoid accidentally missing weights in checkpoint loading. + + Example use case: Use this for the vision projection if you want to load a checkpoint that contains vision and language model weights + but not the vision projection weights. + + Args: + param_names (list of str): Parameter names allowed to be missing when calling load_state_dict. + module (torch.nn.Module): The torch module this hook applies to. Unused here but required by the torch API. + incompatible_keys (namedtuple): Namedtuple with fields missing_keys and unexpected_keys, which collect the missing and unexpected + keys when calling load_state_dict on this torch module, respectively. + """ + for param_name in param_names: + if param_name in incompatible_keys.missing_keys: + logging.getLogger(__name__).warning( + f"{param_name} being removed from incompatible_keys.missing_keys in LlavaModel" + ) + incompatible_keys.missing_keys.remove(param_name) diff --git a/megatron/core/models/retro/__init__.py b/megatron/core/models/retro/__init__.py new file mode 100644 index 0000000..ea7cea6 --- /dev/null +++ b/megatron/core/models/retro/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Exports: + + - RetroConfig: configuration dataclass for RetroModel. + - RetroModel: The Retro model. + - get_retro_decoder_block_spec: Get spec for Retro decoder transformer block. +""" + +from .config import RetroConfig +from .decoder_spec import get_retro_decoder_block_spec +from .model import RetroModel diff --git a/megatron/core/models/retro/base_attention.py b/megatron/core/models/retro/base_attention.py new file mode 100644 index 0000000..741f712 --- /dev/null +++ b/megatron/core/models/retro/base_attention.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Base class for decoder and encoder attention modules.""" + +from megatron.core.models.retro.config import RetroConfig +from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule + + +class BaseRetroCrossAttention(MegatronModule): + + """Base class for Retro cross attention, for both encoder & decoder layers. + + This class collects the retro arguments below (i.e., num neighbors, chunk + length, and retrieve length) for use in Retro's custom cross attention + operators. + + Args: + config (RetroConfig): Retro config. + submodules (CrossAttentionSubmodules): Cross attention submodules. + layer_number (int): Layer number within transformer block. + attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding'). + """ + + def __init__( + self, + config: RetroConfig, + submodules: CrossAttentionSubmodules, + layer_number: int = 1, + attn_mask_type: AttnMaskType = AttnMaskType.padding, + ): + super().__init__(config=config) + + self.attn = CrossAttention( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + ) + + self.retro_num_neighbors = config.retro_num_neighbors + self.retro_chunk_length = config.retro_chunk_length + self.retro_retrieved_length = config.retro_retrieved_length diff --git a/megatron/core/models/retro/config.py b/megatron/core/models/retro/config.py new file mode 100644 index 0000000..b9a5eb9 --- /dev/null +++ b/megatron/core/models/retro/config.py @@ -0,0 +1,87 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Configuration dataclass for a RetroModel.""" + +import os +import types +from dataclasses import dataclass +from importlib.metadata import version + +from pkg_resources import packaging + +from megatron.core.transformer import TransformerConfig + + +@dataclass +class RetroConfig(TransformerConfig): + """Configuration object for Retro models. """ + + # Retro. + retro_project_dir: str = None + """Retro project directory, which contains the preprocessed data for for pretraining. This + directory is built during preprocessing (see tools/retro/README.md), and contains + subdirectories for the chunk database and pretraining neighbors. + """ + + retro_block_size: int = None + """Number of records to load per data file, as saved during preprocessing. Block processing is + used for efficient data preprocessing. + """ + + retro_chunk_length: int = None + """Chunk length used for performing chunked- cross-attention (CCA).""" + + retro_encoder_num_layers: int = 2 + """Number of layers to use for the retrieval encoder.""" + + retro_encoder_hidden_dropout: float = 0.1 + """Hidden dropout for retrieval encoder.""" + + retro_encoder_attention_dropout: float = 0.1 + """Attention dropout for retrieval encoder.""" + + retro_neighbor_dirs: dict = None + """Directory names of saved neighbor id files for train, valid, and test datasets.""" + + retro_num_neighbors: int = 2 + """Number of neighbors to retrieve during pretraining.""" + + retro_num_retrieved_chunks: int = 2 + """Number of chunks to retrieve from the retrieval database.""" + + retro_retrieved_length: int = None + """Cached value of retro_num_retrieved_chunks * retro_chunk_length (i.e., the total number of + retrieved tokens; neighbor + continuation). + """ + + retro_split_preprocessing: str = None + """Data split used during data preprocessing.""" + + retro_verify_neighbor_count: bool = True + """Verify that len(GPT dataset) == len(saved neighbors).""" + + def __post_init__(self) -> None: + """Validate Retro config.""" + + super().__post_init__() + + # Validate Transformer Engine version. + te_version = packaging.version.Version(version("transformer-engine")) + if te_version >= packaging.version.Version("1.3"): + try: + assert os.getenv("NVTE_FLASH_ATTN") == "0" + assert os.getenv("NVTE_FUSED_ATTN") == "0" + except Exception as e: + raise Exception( + "When using Transformer Engine >= 1.3, environment vars NVTE_FLASH_ATTN and NVTE_FUSED_ATTN most both be defined and set to '0'. Currently, NVTE_FLASH_ATTN == %s, NVTE_FUSED_ATTN == %s." + % ( + os.getenv("NVTE_FLASH_ATTN", "[unset]"), + os.getenv("NVTE_FUSED_ATTN", "[unset]"), + ) + ) + + # Preprocessing split should be defined. + assert self.retro_split_preprocessing is not None + + # Pre-compute retrieved length. + self.retro_retrieved_length = self.retro_num_retrieved_chunks * self.retro_chunk_length diff --git a/megatron/core/models/retro/decoder_attention.py b/megatron/core/models/retro/decoder_attention.py new file mode 100644 index 0000000..f459163 --- /dev/null +++ b/megatron/core/models/retro/decoder_attention.py @@ -0,0 +1,309 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Retro's cross attention modules for the decoder block.""" + +from functools import partial +from typing import Callable + +import numpy as np +import torch +from torch import Tensor + +from megatron.core import InferenceParams +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.models.retro.base_attention import BaseRetroCrossAttention +from megatron.core.models.retro.config import RetroConfig +from megatron.core.models.retro.utils import get_all_true_mask +from megatron.core.transformer import ModuleSpec +from megatron.core.transformer.attention import CrossAttentionSubmodules +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_block import TransformerBlock + + +class RetroDecoderCrossAttention(BaseRetroCrossAttention): + + """Retro decoder's chunked cross attention operator. + + See this paper for more details: https://arxiv.org/abs/2112.04426. + Neighboring chunks retrieved from the chunk database are used here for + chunked-cross attention. + + ** Note about 'encoder_block_spec' ** + + Retro is an encoder-decoder model that uses its encoder for encoding + neighboring chunks that are retrieved from a chunk database. These + encoded neighbors are then used in the decoder stack for performing + chunked-cross attention (see paper link above). + + In contrast to the T5 model, the encoder and decoder are computationally + intertwined, since the input to the encoder is the output of the self- + attention of the first decoder layer. As such, the encoder block itself + is instantiated within the first Retro decoder layer, in order to receive + the self-attention's output. (Note, that only the first decoder layer + instantiates an encoder block, and the remaining decoder layers use the + encoder output from the first decoder layer.) + + Args: + config (RetroConfig): Retro config. + submodules (CrossAttentionSubmodules): Cross attention submodules. + layer_number (int): Layer number within transformer block. + attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding'). + encoder_block_spec (ModuleSpec): The first Retro decoder layer is provided with a transformer block spec to construct the neighbor encoder. + """ + + def __init__( + self, + config: RetroConfig, + submodules: CrossAttentionSubmodules, + layer_number: int = 1, + attn_mask_type: AttnMaskType = AttnMaskType.padding, + encoder_block_spec: ModuleSpec = None, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + ) + + if encoder_block_spec: + self.encoder = TransformerBlock( + config=config, spec=encoder_block_spec, pre_process=True, post_process=False, + ) + # self._encoder_key = 'encoder' # ... necessary? + else: + self.encoder = None + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + key_value_states: Tensor = None, + inference_params: InferenceParams = None, + # rotary_pos_emb: Tensor = None, # ... unsupported for retro. + ) -> dict: + """Cross attention for Retro decoder. + + Notation: + ns : Sequence length. + bs : Batch size. + d : Hidden size. + l : Number of chunks per sample (i.e., seq_length/chunk_length). + m : Number of tokens per chunk. + k : Number of neighbors. + r : Number of retrieved tokens (neighbors + continuation). + + Args: + hidden_states (Tensor): Transformer layer hidden states. + attention_mask (Tensor): Attention mask. + key_value_states (Tensor): Neighbor embeddings if first decoder layer, else encoder output. + inference_params (InferenceParams): Inference params. + + Returns: + A dict consisting of the attention output and context, along with other scalars necessary for performing the downstream bias-dropout-add. + """ + + # hidden_states: [ ns, bs, d ] + # key_value_states: [ r, k*bs*l, d ] + + ns, bs, d = hidden_states.shape + l = int(np.ceil(ns / self.retro_chunk_length)) + + # Retrieve neighbors. + if self.encoder: + + # Sequence length remainder. + first_ns = ns % self.retro_chunk_length + + # Case 1: Sequence length not divisible by chunk length. + if first_ns > 0: + + # Split sequence into first partial chunk & remaining chunks. + first_chunk, rest_chunk = hidden_states[:first_ns], hidden_states[first_ns:] + + # Pad partial chunk with zeros. + first_chunk = torch.nn.functional.pad( + first_chunk, (0, 0, 0, 0, 0, self.retro_chunk_length - first_ns), 'constant', 0, + ) + + # Concatenate padded chunk with remaining chunks. + chunked_output = torch.cat((first_chunk, rest_chunk), dim=0) # [ l*m, bs, d ] + + # Case 2: Sequence length is divisible by chunk length. + else: + chunked_output = hidden_states # [ l*m, bs, d ] + + # Chunk & permute hidden states. + # - hidden_states: [ l*m, bs, d ] + # - chunked_output: [ m, bs*l, d ] + chunked_output = ( + chunked_output.reshape(l, self.retro_chunk_length, bs, d) + .permute(1, 2, 0, 3) + .reshape(self.retro_chunk_length, bs * l, d) + .contiguous() + ) + + # flash attn: [ b, h, sq, sk ] + # fused attn: [ b, 1, 1, sq ] + chunked_output_mask = get_all_true_mask( + size=(1, 1, chunked_output.shape[0], key_value_states.shape[0]), + device=chunked_output.device, + ) + + # Encode neighbors. (Note: 'key_value_states' re-assigned here.) + key_value_states = self.encoder( + hidden_states=key_value_states, + attention_mask=attention_mask, + context=chunked_output, + context_mask=chunked_output_mask, + inference_params=inference_params, + ) # [ r, k*bs*l, d ] + key_value_states = key_value_states.reshape( + self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d + ) # [ r*k, bs*l, d ] + + # Attend starting at last token of first chunk. + pad = (ns - 1) % self.retro_chunk_length + attending_chunks = hidden_states[pad:] + + # Pad attending tokens to sequence length. + padded_chunks = torch.nn.functional.pad( + attending_chunks, (0, 0, 0, 0, 0, self.retro_chunk_length - 1), 'constant', 0, + ) + + # Permute attending chunks. + # - padded_chunks: [ l*m, bs, d ] + # - padded_chunked_output: [ m, bs*l, d ] (matches 'chunked_output' above) + padded_chunked_output = padded_chunks.reshape(l, self.retro_chunk_length, bs, d).permute( + 1, 2, 0, 3 + ) + padded_chunked_output = padded_chunked_output.reshape( + self.retro_chunk_length, bs * l, d + ).contiguous() + + # flash attn: [ b, h, sq, sk ] + # fused attn: [ b, 1, 1, sq ] + padded_chunked_output_mask = get_all_true_mask( + size=(1, 1, padded_chunked_output.shape[0], key_value_states.shape[0]), + device=padded_chunked_output.device, + ) + + # Attend to encoded neighbors. + attention_output, attention_bias = self.attn( + hidden_states=padded_chunked_output, + attention_mask=padded_chunked_output_mask, + key_value_states=key_value_states, + ) + + # Return dimensions for bias-dropout step. + return { + "ns": ns, + "bs": bs, + "d": d, + "l": l, + "pad": pad, + "attention_output": attention_output, # [ m, bs*l, d ] + "attention_bias": attention_bias, # [ d ] + "context": key_value_states, # [ r*k, bs*l, d ] + } + + +class RetroDecoderBiasDropoutAdd(MegatronModule): + + """Retro decoder's bias-dropout-add operator. + + This operator takes care of reshaping and permuting the output from the + chunk dimension to the sequence dimension. + + Args: + config (RetroConfig): Retro config. + """ + + def __init__( + self, config: RetroConfig, + ): + super().__init__(config=config) + self.retro_chunk_length = config.retro_chunk_length + + @classmethod + def _forward( + cls, + x_with_bias: dict, + residual: Tensor, + prob: float, + retro_chunk_length: int, + bias_dropout_add: Callable, + ) -> Tensor: + """Per-chunk bias-dropout-add. + + Args: + x_with_bias (dict): Attention output and bias, along with other Retro relevant parameters. + residual (Tensor): Transformer layer residual. + prob (float): Dropout probability. + retro_chunk_length (int): Retro chunk length (e.g., 64). + bias_dropout_add (Callable): Bias-dropout-add function. + + Returns: + Output of bias-dropout-add. + """ + + # Extract input dict. + ns = x_with_bias["ns"] + bs = x_with_bias["bs"] + d = x_with_bias["d"] + l = x_with_bias["l"] + pad = x_with_bias["pad"] + attention_output = x_with_bias["attention_output"] # [ m, bs*l, d ] + attention_bias = x_with_bias["attention_bias"] # [ d ] + + # Re-enable torch grad to enable fused optimization. + with torch.enable_grad(): + + # Bias-dropout-add. + x = bias_dropout_add( + ( + attention_output, + None if attention_bias is None else attention_bias.expand_as(attention_output), + ), + torch.zeros_like(attention_output), + prob, + ) + + # Permute chunks back to sequence dimension. + # 1. [ m, bs*l, d ] + # 2. [ m, bs, l, d ] + # 3. [ l, m, bs, d ] + # 4. [ m*l, bs, d ] == [ ns, bs, d ] + x = ( + x.reshape(retro_chunk_length, bs, l, d) + .permute(2, 0, 1, 3) + .reshape(retro_chunk_length * l, bs, d) + ) + + # Prepend zeros for non-attending tokens. + x = torch.nn.functional.pad(x, (0, 0, 0, 0, pad, 0), 'constant', 0,)[ + :ns + ] # [ ns, bs, d ] + + # Add residual. [ ns, bs, d ] + x = x + residual + + # Output. [ ns, bs, d ] + return x + + def forward(self, training: bool, fused: bool) -> partial: + """Retro decoder bias-dropout-add. + + Args: + training (bool): If training, then apply dropout. + fused (bool): Fuse bias-dropout-add. + + Returns: + The partial function for performing bias-dropout-add. + """ + return partial( + self._forward, + retro_chunk_length=self.retro_chunk_length, + bias_dropout_add=get_bias_dropout_add(training, fused), + ) diff --git a/megatron/core/models/retro/decoder_spec.py b/megatron/core/models/retro/decoder_spec.py new file mode 100644 index 0000000..0c16ccc --- /dev/null +++ b/megatron/core/models/retro/decoder_spec.py @@ -0,0 +1,184 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Specs for Retro decoder.""" + +import typing + +from megatron.core import parallel_state +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) +from megatron.core.models.retro.config import RetroConfig +from megatron.core.models.retro.decoder_attention import ( + RetroDecoderBiasDropoutAdd, + RetroDecoderCrossAttention, +) +from megatron.core.models.retro.encoder_spec import get_retro_encoder_block_spec +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer import ModuleSpec +from megatron.core.transformer.attention import CrossAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.transformer_block import ( + TransformerBlockSubmodules, + get_num_layers_to_build, +) + +try: + import apex + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + +try: + from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TENorm, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + + +def get_retro_decoder_layer_te_spec( + encoder_block_spec: typing.Union[ModuleSpec, TransformerBlockSubmodules, None] = None +) -> ModuleSpec: + """Retro decoder TE spec (uses Transformer Engine components). + + A Retro decoder layer uses custom attention and bias-dropout-add operators + to perform chunked-cross attention. Additionally, the first Retro decoder + layer instantiates an entire encoder transformer block. As such, the decoder + cross attention module takes an optional encoder block spec, which is only + provided for the first Retro decoder layer. + + Args: + encoder_block_spec (ModuleSpec): Retro encoder block spec, to be provided for the first Retro decoder layer. + + Returns: + A module spec with Transformer Engine modules. + """ + spec = get_gpt_layer_with_transformer_engine_spec() + spec.submodules.pre_cross_attn_layernorm = TENorm + spec.submodules.cross_attention = ModuleSpec( + module=RetroDecoderCrossAttention, + params={ + "encoder_block_spec": encoder_block_spec, + }, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ) + spec.submodules.cross_attn_bda = ModuleSpec(module=RetroDecoderBiasDropoutAdd) + return spec + + +def get_retro_decoder_layer_local_spec( + encoder_block_spec: typing.Optional[ModuleSpec] = None, +) -> ModuleSpec: + """Retro decoder local spec (uses Megatron-Core components). + + A Retro decoder layer uses custom attention and bias-dropout-add operators + to perform chunked-cross attention. Additionally, the first Retro decoder + layer instantiates an entire encoder transformer block. As such, the decoder + cross attention module takes an optional encoder block spec, which is only + provided for the first Retro decoder layer. + + Args: + encoder_block_spec (ModuleSpec): Retro encoder block spec, to be provided for the first Retro decoder layer. + + Returns: + A module spec with local modules. + """ + spec = get_gpt_layer_local_spec() + spec.submodules.pre_cross_attn_layernorm = LNImpl + spec.submodules.cross_attention = ModuleSpec( + module=RetroDecoderCrossAttention, + params={ + "encoder_block_spec": encoder_block_spec, + }, + submodules=CrossAttentionSubmodules( + linear_q=ColumnParallelLinear, + linear_kv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + ), + ) + spec.submodules.cross_attn_bda = ModuleSpec(module=RetroDecoderBiasDropoutAdd) + return spec + + +def get_retro_decoder_block_spec( + config: RetroConfig, use_transformer_engine: bool +) -> TransformerBlockSubmodules: + """Retro decoder block spec. + + Retro decoder block implementation details: + - The retro decoder block consists of interleaved GPT layers and customized Retro decoder layers. + - The Retro decoder layers are spaced three layers apart, and start on layer 6 or 9 (depending on the total number of layers). + - The first decoder layer instantiates an encoder block, and it therefore passes in an encoder_block_spec. + + Args: + config (RetroConfig): Retro config. + use_transformer_engine (bool): If True, use Transformer Engine (instead of local modules. + + Returns: + Transformer block submodules for the given spec. + """ + + # Num layers. + assert ( + parallel_state.get_pipeline_model_parallel_world_size() == 1 + ), "retro does not currently support pipeline parallelism." + assert ( + parallel_state.get_virtual_pipeline_model_parallel_world_size() is None + ), "retro does not currently support virtual pipeline parallelism." + num_layers = get_num_layers_to_build(config) + + # Retro layer numbers. + retro_layer_start = 6 if num_layers <= 15 else 9 + retro_layer_numbers = list(range(retro_layer_start, num_layers + 1, 3)) + + # Layer specs. + gpt_layer_spec = ( + get_gpt_layer_with_transformer_engine_spec() + if use_transformer_engine + else get_gpt_layer_local_spec() + ) + get_retro_decoder_layer_spec = ( + get_retro_decoder_layer_te_spec + if use_transformer_engine + else get_retro_decoder_layer_local_spec + ) + retro_layer_spec = get_retro_decoder_layer_spec() + retro_layer_spec_with_retriever = get_retro_decoder_layer_spec( + get_retro_encoder_block_spec(config, use_transformer_engine) + ) + + layer_specs = [] + for layer_number in range(1, num_layers + 1): + if layer_number == retro_layer_numbers[0]: + layer_specs.append(retro_layer_spec_with_retriever) + elif layer_number in retro_layer_numbers: + layer_specs.append(retro_layer_spec) + else: + layer_specs.append(gpt_layer_spec) + + # Block spec. + block_spec = TransformerBlockSubmodules(layer_specs=layer_specs) + + return block_spec diff --git a/megatron/core/models/retro/encoder_attention.py b/megatron/core/models/retro/encoder_attention.py new file mode 100644 index 0000000..a2226c0 --- /dev/null +++ b/megatron/core/models/retro/encoder_attention.py @@ -0,0 +1,233 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Retro's cross attention modules for the encoder block.""" + +from functools import partial +from typing import Callable, List, Optional, Tuple, Type + +import torch +from torch import Tensor + +from megatron.core import InferenceParams +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.models.retro.base_attention import BaseRetroCrossAttention +from megatron.core.models.retro.config import RetroConfig +from megatron.core.models.retro.utils import get_all_true_mask +from megatron.core.transformer.module import MegatronModule + + +class RetroEncoderCrossAttention(BaseRetroCrossAttention): + + """Retro encoder's cross attention operator. + + See this paper for more details: https://arxiv.org/abs/2112.04426. + Neighboring chunks are retrieved from the chunk database, encoded, and + used by the decoder layers for chunked cross attention. + + Args: + config (RetroConfig): Retro config. + submodules (CrossAttentionSubmodules): Cross attention submodules. + layer_number (int): Layer number within transformer block. + attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding'). + """ + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + key_value_states: Tensor = None, + inference_params: InferenceParams = None, + # rotary_pos_emb: Tensor = None, # unsupported for retro. + ) -> List[Tuple[Tensor, Optional[Tensor], Tensor]]: + """Cross attention for Retro encoder. + + Notation: + ns : Sequence length. + bs : Batch size. + d : Hidden size. + l : Number of chunks per sample (i.e., seq_length/chunk_length). + k : Number of neighbors. + r : Number of retrieved tokens (neighbors + continuation). + + Args: + hidden_states (Tensor): Transformer layer hidden states. + attention_mask (Tensor): Attention mask. + key_value_states (Tensor): Neighbor embeddings. + inference_params (InferenceParams): Inference params. + + Returns: + List of tuples, where each tuple is (attention_output, attention_bias, residual). + """ + + # Input shape. [ r, bs*l*k, d ] + ns, bs, d = hidden_states.shape + + # Reshape sequence into neighboring chunks. + # - hidden_states: [ r, bs*l*k, d ] + # - chunked_outputs: [ r, bs*l, k, d ] + chunked_outputs = hidden_states.reshape( + self.retro_retrieved_length, -1, self.retro_num_neighbors, d + ) + + # flash attn: [ b, h, sq, sk ] + # fused attn: [ b, 1, 1, sq ] + chunked_output_mask = get_all_true_mask( + size=(1, 1, chunked_outputs.shape[0], key_value_states.shape[0]), + device=chunked_outputs.device, + ) + + # Per-chunk attention. + attention_output_tuples = [] + for k in range(self.retro_num_neighbors): + + # Attend to current neighboring chunks. + # - chunked_output: [ r, bs*l, d ] + # - key_value_states: [ m, bs*l, d ] + # - attention_output: [ r, bs*l, d ] + # - attention_bias: [ d ] + chunked_output = chunked_outputs[:, :, k].contiguous() + attention_output, attention_bias = self.attn( + hidden_states=chunked_output, # Q (neighbor embedding) + attention_mask=chunked_output_mask, + key_value_states=key_value_states, # K, V (hidden act) + ) + + # Residual connection. [ r, bs*l, d ] + residual = chunked_output + + # Collect tensors. + attention_output_tuples.append((attention_output, attention_bias, residual,)) + + # Output. (List[Tuple[( [ r, bs*l, d ], [ d ] )]]) + return attention_output_tuples + + +class RetroEncoderBiasDropoutAdd(MegatronModule): + + """Retro encoder's bias-dropout-add operator. + + This operator applies bias-dropout-add individually on each neighboring + chunk that is retrieved from the chunk database. + + Args: + config (RetroConfig): Retro config. + """ + + def __init__( + self, config: RetroConfig, + ): + super().__init__(config=config) + self.retro_num_neighbors = config.retro_num_neighbors + + @classmethod + def _forward( + cls, + x_with_bias: List[Tuple[Tensor, Optional[Tensor], Tensor]], + residual: Tensor, + prob: float, + retro_num_neighbors: int, + bias_dropout_add: Callable, + ) -> Tensor: + """Per-chunk bias-dropout-add. + + Args: + x_with_bias (dict): Attention output and bias tuple. + residual (Tensor): Transformer layer residual. + prob (float): Dropout probability. + retro_num_neighbors (int): Number of retrieved neighbor chunks (e.g., 2). + bias_dropout_add (Callable): Bias-dropout-add function. + + Returns: + Output of bias-dropout-add. + """ + + # Re-enable torch grad to enable fused optimization. + with torch.enable_grad(): + + # Per-neighbor bias-dropout-add. + # - attention_output: [ r, bs*l, d ] + # - attention_bias: [ d ] + # - residual: [ r, bs*l, d ] + # - output: [ r, bs*l, d ] + outputs = [ + bias_dropout_add( + ( + attention_output, + None if attention_bias is None else attention_bias.expand_as(residual), + ), + residual, + prob, + ) + for attention_output, attention_bias, residual in x_with_bias + ] + + # Concatenate outputs (to shape [r, k*bs*l, d]; see notation above). + r, _, d = outputs[0].shape + output = torch.stack(outputs, dim=1).reshape(r, -1, d) + + # Output. [ r, k*bs*l, d ] + return output + + def forward(self, training: bool, fused: bool) -> partial: + """Retro decoder bias-dropout-add. + + Args: + training (bool): If training, then apply dropout. + fused (bool): Fuse bias-dropout-add. + + Returns: + A partial function for performing bias-dropout-add. + """ + return partial( + self._forward, + retro_num_neighbors=self.retro_num_neighbors, + bias_dropout_add=get_bias_dropout_add(training, fused), + ) + + +class RetroEncoderLayerNorm(MegatronModule): + + """Retro encoder's layernorm operator. + + This operator applies layernorm individually on each neighboring chunk that + is retrieved from the chunk database, and then concatenates the chunks into + a single tensor. + + Args: + config (RetroConfig): Retro config. + submodules (Type): Layer norm class. (Named 'submodules' to fit external interface.) + """ + + def __init__( + self, config: RetroConfig, submodules: Type, **kwargs: dict, + ): + super().__init__(config=config) + norm_class = submodules + self.norm = norm_class(config=config, **kwargs) + self.retro_num_neighbors = config.retro_num_neighbors + + def forward(self, input: Tensor) -> Tensor: + """Per-chunk layer norm. + + Args: + input (Tensor): Input chunks, concatenated into a single tensor. + + Returns: + Output of the layer norm. + """ + + # Input shape: [ r, k*bs*l, d ]. (see notation above in attention module) + + # Split input into 'num_neighbors' tensors. + chunk_size = input.shape[1] // self.retro_num_neighbors + inputs = torch.split(input, chunk_size, dim=1) + + # Norm. + outputs = [self.norm(inp.contiguous()) for inp in inputs] + + # Concatenate layer norms (to shape [r, k*bs*l, d]; see notation above). + r, _, d = inputs[0].shape + output = torch.stack(outputs, dim=1).reshape(r, -1, d) + + # Output. [ r, k*bs*l, d ] + return output diff --git a/megatron/core/models/retro/encoder_spec.py b/megatron/core/models/retro/encoder_spec.py new file mode 100644 index 0000000..ac0eb15 --- /dev/null +++ b/megatron/core/models/retro/encoder_spec.py @@ -0,0 +1,186 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Specs for Retro encoder.""" + +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) +from megatron.core.models.retro.config import RetroConfig +from megatron.core.models.retro.encoder_attention import ( + RetroEncoderBiasDropoutAdd, + RetroEncoderCrossAttention, + RetroEncoderLayerNorm, +) +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer import ModuleSpec +from megatron.core.transformer.attention import CrossAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.transformer_block import TransformerBlockSubmodules + +try: + from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TENorm, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +try: + import apex + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + + +def get_retro_encoder_layer_te_spec() -> ModuleSpec: + """Retro encoder TE spec (uses Transformer Engine components). + + A Retro encoder layer uses custom attention, bias-dropout-add, and layernorm + operators to encode neighboring chunks that are retrieved from the chunk + database. Each operator is responsible for iterating the retrieved chunks + and processing them individually. + + Returns: + A module spec if Transformer Engine modules. + """ + spec = get_gpt_layer_with_transformer_engine_spec() + spec.submodules.pre_cross_attn_layernorm = TENorm + spec.submodules.cross_attention = ModuleSpec( + module=RetroEncoderCrossAttention, + params={ + "attn_mask_type": AttnMaskType.padding, + }, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ) + spec.submodules.cross_attn_bda = ModuleSpec(module=RetroEncoderBiasDropoutAdd) + spec.submodules.pre_mlp_layernorm = ModuleSpec( + module=RetroEncoderLayerNorm, + submodules=TENorm, + ) + spec.submodules.mlp = ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ) + return spec + + +def get_retro_encoder_layer_local_spec() -> ModuleSpec: + """Retro encoder local spec (uses Megatron-Core components). + + A Retro encoder layer uses custom attention, bias-dropout-add, and layernorm + operators to encode neighboring chunks that are retrieved from the chunk + database. Each operator is responsible for iterating the retrieved chunks + and processing them individually. + + Returns: + A module spec if local modules. + """ + spec = get_gpt_layer_local_spec() + spec.submodules.pre_cross_attn_layernorm = LNImpl + spec.submodules.cross_attention = ModuleSpec( + module=RetroEncoderCrossAttention, + params={ + "attn_mask_type": AttnMaskType.padding, + }, + submodules=CrossAttentionSubmodules( + linear_q=ColumnParallelLinear, + linear_kv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + ), + ) + spec.submodules.cross_attn_bda = ModuleSpec(module=RetroEncoderBiasDropoutAdd) + spec.submodules.pre_mlp_layernorm = ModuleSpec( + module=RetroEncoderLayerNorm, + submodules=LNImpl, + ) + spec.submodules.mlp = ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=ColumnParallelLinear, + linear_fc2=RowParallelLinear, + ), + ) + spec.submodules.sharded_state_dict_keys_map = { + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + } # pre_mlp_layernorm doesn't need remapping + return spec + + +def get_retro_encoder_block_spec( + config: RetroConfig, use_transformer_engine: bool +) -> TransformerBlockSubmodules: + """Retro encoder block spec. + + The retro encoder block consists of one customized Retro encoder layer + (layer 1), and all of the following layers are standard GPT layers. + + Args: + config (RetroConfig): Retro config. + use_transformer_engine (bool): If True, use Transformer Engine (instead of local modules). + + Returns: + Transformer block submodules for the given spec. + """ + + # Num layers. + num_layers = config.retro_encoder_num_layers + retro_layer_numbers = [1] + + # Layer specs. + gpt_layer_spec = ( + get_gpt_layer_with_transformer_engine_spec() + if use_transformer_engine + else get_gpt_layer_local_spec() + ) + get_retro_encoder_layer_spec = ( + get_retro_encoder_layer_te_spec + if use_transformer_engine + else get_retro_encoder_layer_local_spec + ) + retro_layer_spec = get_retro_encoder_layer_spec() + for spec in (gpt_layer_spec, retro_layer_spec): + spec.params["hidden_dropout"] = config.retro_encoder_hidden_dropout + spec.submodules.self_attention.params["attn_mask_type"] = AttnMaskType.padding + spec.submodules.self_attention.submodules.core_attention = ModuleSpec( + module=TEDotProductAttention if use_transformer_engine else DotProductAttention, + params={ + "attention_dropout": config.retro_encoder_attention_dropout, + }, + ) + + layer_specs = [] + for layer_number in range(1, num_layers + 1): + if layer_number in retro_layer_numbers: + layer_specs.append(retro_layer_spec) + else: + layer_specs.append(gpt_layer_spec) + + # Block spec. + block_spec = TransformerBlockSubmodules(layer_specs=layer_specs) + + return block_spec diff --git a/megatron/core/models/retro/model.py b/megatron/core/models/retro/model.py new file mode 100644 index 0000000..32c6d26 --- /dev/null +++ b/megatron/core/models/retro/model.py @@ -0,0 +1,100 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Retro Model.""" +from typing import Dict, Optional + +from torch import Tensor + +from megatron.core import InferenceParams +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.models.gpt import GPTModel + + +class RetroModel(GPTModel): + + """Retro Model. + + A Retro model mostly re-uses the GPTModel interface, with the only difference + being the embedding of the 'context' this is used by Retro for processing + neighbor tokens. This embedded context is then forwarded to the Transformer + Block. + """ + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + context_input_ids: Tensor = None, + context_position_ids: Tensor = None, + context_mask: Tensor = None, + decoder_input: Tensor = None, + labels: Tensor = None, + inference_params: InferenceParams = None, + ) -> Tensor: + """RetroModel forward method. + + Foward input tokens & mask, along with neighbor tokens & mask, through + the Retro model.. + + Args: + input_ids (Tensor): Input token IDs. + position_ids (Tensor): Input position IDs. + attention_mask (Tensor): Input attention mask. + context_input_ids (Tensor): Context (i.e., neighbor) token IDs. + context_position_ids (Tensor): Context (i.e., neighbor) position IDs. + context_mask (Tensor): Context (i.e., neighbor) attention mask. + decoder_input (Tensor): When using pipeline parallelism, input_ids and position_ids will only be used on the first stage, and for all other stages decoder_input will be provided via communication from the previous stage. + labels (Tensor): The labels of dimension [batch size, seq length]. + inference_params (InferenceParams): Parameters for inference. + + Returns: + Output tensor of forward pass. + """ + + # Argument shapes: + # Notation: + # ns : Sequence length. + # bs : Batch size. + # d : Hidden size. + # l : Number of chunks per sample (i.e., seq_length/chunk_length). + # k : Number of neighbors. + # r : Number of retrieved tokens (neighbors + continuation). + # - input_ids: [ bs, ns ] + # - context_ids: [ k*bs*l, r ] + # - context: [ r, k*bs*l, d ] + # - output: [ ns, bs, d ] + + # Context embedding (e.g., for Retro neighbor tokens). + if context_input_ids is not None: + context = self.embedding(context_input_ids, context_position_ids) + else: + context = None + + # Call GPTModel.forward, and pass in embedded context. + return super().forward( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + decoder_input=decoder_input, + labels=labels, + inference_params=inference_params, + extra_block_kwargs={"context": context, "context_mask": context_mask,}, + ) + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[Dict] = None + ) -> ShardedStateDict: + """Get sharded state dict. + + Args: + prefix (str): Module name prefix. + sharded_offsets (tuple): Offsets of local shard within global tensor. + metadata (Optional[Dict]): Shard metadata. + + Returns: + A ? + """ + metadata = metadata or {} + metadata['non_homogeneous_layers'] = True + return super().sharded_state_dict(prefix, sharded_offsets, metadata) diff --git a/megatron/core/models/retro/utils.py b/megatron/core/models/retro/utils.py new file mode 100644 index 0000000..7d83c5d --- /dev/null +++ b/megatron/core/models/retro/utils.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import os + +import torch + + +def get_config_path(project_dir: str) -> str: + """Config copy stored within retro project dir.""" + return os.path.join(project_dir, "config.json") + + +def get_gpt_data_dir(project_dir: str) -> str: + """Get project-relative directory of GPT bin/idx datasets.""" + return os.path.join(project_dir, "data") + + +# ** Note ** : Retro's compatibility between cross attention and Flash/Fused +# Attention is currently a work in progress. We default to returning None for +# now. +# def get_all_true_mask(size, device): +# return torch.full(size=size, fill_value=True, dtype=torch.bool, device=device) +def get_all_true_mask(size, device): + return None diff --git a/megatron/core/models/vision/__init__.py b/megatron/core/models/vision/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/models/vision/clip_vit_model.py b/megatron/core/models/vision/clip_vit_model.py new file mode 100644 index 0000000..84be735 --- /dev/null +++ b/megatron/core/models/vision/clip_vit_model.py @@ -0,0 +1,139 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from typing import Optional, Union + +import torch + +from megatron.core.models.common.vision_module.vision_module import VisionModule +from megatron.core.transformer.custom_layers.transformer_engine import TENorm +from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig + + +# Note: This is under development and is missing features like position embedding interpolation. +class CLIPViTModel(VisionModule): + """CLIP ViT vision model. + + Args: + transformer_config (TransformerConfig): Transformer config. + transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers. + ln_pre_impl (ModuleSpec or type): Specifies the layer norm type to use for ln_pre. + patch_dim (int): Image patch size. + img_h (int): Input image height. + img_w (int): Input image width. + add_class_token (bool, optional): Include a class token. Defaults to True. + class_token_len (int): Class token length. Defaults to 1 but 8 may be faster. + """ + + def __init__( + self, + transformer_config: TransformerConfig, + transformer_layer_spec: ModuleSpec, + ln_pre_impl: Union[ModuleSpec, type] = TENorm, + patch_dim: int = 14, + img_h: int = 336, + img_w: int = 336, + add_class_token: bool = True, + class_token_len: int = 1, + ) -> None: + super().__init__(config=transformer_config) + + self.visual_hidden_size = transformer_config.hidden_size + self.patch_dim = patch_dim + self.img_h = img_h + self.img_w = img_w + assert self.img_h % self.patch_dim == 0 + assert self.img_w % self.patch_dim == 0 + self.num_patches_per_dim_h = self.img_h // self.patch_dim + self.num_patches_per_dim_w = self.img_w // self.patch_dim + self.num_patches = self.num_patches_per_dim_h * self.num_patches_per_dim_w + + self.add_class_token = add_class_token + self.class_token_len = class_token_len + + self.seq_length = self.num_patches + (self.class_token_len if self.add_class_token else 0) + + self.conv1 = torch.nn.Conv2d( + in_channels=3, + out_channels=self.visual_hidden_size, + kernel_size=self.patch_dim, + stride=self.patch_dim, + bias=False, + ) + + self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda() + + self.position_embeddings = torch.nn.Embedding(self.seq_length, self.visual_hidden_size) + + self.add_class_token = add_class_token + if self.add_class_token: + self.class_token = torch.nn.Parameter( + torch.randn(1, self.class_token_len, self.visual_hidden_size) + ) + + self.ln_pre = build_module( + ln_pre_impl, + config=transformer_config, + hidden_size=self.visual_hidden_size, + eps=transformer_config.layernorm_epsilon, + ) + + self.model_type = ModelType.encoder_or_decoder + + # Transformer layers. + # TODO: Follow-up changes will make pre and post_process configurable. They are needed for supporting pipeline parallelism. + # Note: a final layer norm and/or linear layer present in some implementations are omitted here. They can be added separately where needed. + self.decoder = TransformerBlock( + config=transformer_config, + spec=transformer_layer_spec, + pre_process=True, + post_process=False, + ) + + def set_input_tensor(self, input_tensor: torch.Tensor) -> None: + """Sets input tensor to the model. + + Args: + input_tensor (Tensor): Sets the input tensor for the model. + """ + self.decoder.set_input_tensor(input_tensor) + + def forward( + self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """Forward function of the CLIP ViT Model. This function passes the input tensors + through the embedding layer and then the transformer. + + Args: + x (torch.Tensor): input data of shape [batch, img_h, img_w] + attention_mask (torch.Tensor with dtype=bool): Attention mask to use. If none, all ones. + + Returns: + x (torch.Tensor): output after final transformer block of shape [b, s, h]. + """ + x = self.conv1(x) # shape = [batch, hidden_size, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # [batch, hidden_size, grid ** 2] + x = x.permute(0, 2, 1) # [batch, grid ** 2, hidden_size] + + if self.add_class_token: + class_token = self.class_token.expand( + x.shape[0], -1, -1 + ) # [batch, class_token_len, hidden_size] + x = torch.cat( + [class_token, x], dim=1 + ) # [batch, grid ** 2 + class_token_len, hidden_size] + + x = x + self.position_embeddings(self.position_ids) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # [b, s, h] -> [s, b, h] + if attention_mask is None: + attention_mask = torch.ones(1, 1, x.shape[0], x.shape[0]).cuda() # [1, 1, s, s] + attention_mask = attention_mask < 0.5 # to bool + x = self.decoder(x.contiguous(), attention_mask) + x = x.permute(1, 0, 2) # [s, b, h] -> [b, s, h] + x = x.contiguous() + + return x diff --git a/megatron/core/models/vision/multimodal_projector.py b/megatron/core/models/vision/multimodal_projector.py new file mode 100644 index 0000000..84cb24c --- /dev/null +++ b/megatron/core/models/vision/multimodal_projector.py @@ -0,0 +1,58 @@ +from megatron.core import tensor_parallel +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + + +class MultimodalProjector(MegatronModule): + """ + MultimodalProjector will take the encoded input with input_size hidden state and project + it into the hidden size of the language model for multimodal training. When projector is + type affine linear_fc1 from submodules is used. + + Args: + transformer_config (TransformerConfig): Transformer config + submodules (MLPSubmodules): Specifies MLP submodules for mlp type projector + projector_type (str): Projector type + input_size (int): Input size from feature encoder + """ + + def __init__( + self, + config: TransformerConfig, + submodules: MLPSubmodules, + projector_type: str, + input_size: int, + ): + super().__init__(config=config) + self.projector_type = projector_type + + assert submodules is not None, "MLPSubmodules must be provided" + + if self.projector_type == "mlp": + self.encoder = MLP(config=config, submodules=submodules, input_size=input_size) + elif self.projector_type == "affine": + self.encoder = build_module( + submodules.linear_fc1, + input_size, + config.hidden_size, + config=config, + init_method=config.init_method, + gather_output=True, + bias=config.add_bias_linear, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name=None, + ) + else: + raise Exception(f"Unsupported multimodal projection type {self.projector_type}") + + def forward(self, hidden_states): + # Run encoder. + encoder_output, encoder_output_bias = self.encoder(hidden_states) + + if encoder_output_bias is not None: + encoder_output = encoder_output + encoder_output_bias + + return encoder_output diff --git a/megatron/core/models/vision/vit_layer_specs.py b/megatron/core/models/vision/vit_layer_specs.py new file mode 100644 index 0000000..cfc9f05 --- /dev/null +++ b/megatron/core/models/vision/vit_layer_specs.py @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + + +# Use this spec to use lower level Transformer Engine modules (required for fp8 training) +def get_vit_layer_with_transformer_engine_spec() -> ModuleSpec: + mlp = _get_mlp_module_spec(use_te=True) + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={ + "attn_mask_type": AttnMaskType.causal + }, # TODO: This should be no_mask when CI is upgraded + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + + +# Helper function to get module spec for MLP/MoE +def _get_mlp_module_spec(use_te: bool = True,) -> ModuleSpec: + # Dense MLP w/ or w/o TE modules. + return ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, + ), + ) diff --git a/megatron/core/num_microbatches_calculator.py b/megatron/core/num_microbatches_calculator.py new file mode 100644 index 0000000..a5997df --- /dev/null +++ b/megatron/core/num_microbatches_calculator.py @@ -0,0 +1,301 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron Core number of micro-batches calculators.""" + +import logging +from abc import ABC, abstractmethod +from typing import List, Optional, Union + +import torch.distributed + +logger = logging.getLogger(__name__) + +# TODO: global_var merge into mcore? +_GLOBAL_NUM_MICROBATCHES_CALCULATOR: Union[ + 'ConstantNumMicroBatchesCalculator', 'RampupBatchsizeNumMicroBatchesCalculator' +] = None + + +def get_num_microbatches() -> int: + """Get number of micro-batches.""" + return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get() + + +def get_current_global_batch_size() -> int: + """Get current global batch size.""" + return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get_current_global_batch_size() + + +def get_micro_batch_size(): + return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.micro_batch_size + + +def reconfigure_microbatch_calculator( + rank: int, + rampup_batch_size: Optional[List[int]], + global_batch_size: int, + micro_batch_size: int, + data_parallel_size: int, +) -> None: + """Updates the microbatch calculator. Useful in cases where the + number of microbatches varies throughout a training. For example, + when the number of microbatches differs from training and validation. + + Args: + rank (int): Rank of the GPU, only rank 0 will log the information. + rampup_batch_size (Optional[List[int]]): Rampup batch size, should be in format of [start_global_batch_size, batch_size_increment, ramup_samples]. + global_batch_size (int): Global batch size for the model. + micro_batch_size (int): Micro batch size at initialization. + data_parallel_size (int): Data parallel size. + """ + global _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + rank, rampup_batch_size, global_batch_size, micro_batch_size, data_parallel_size + ) + + +def update_num_microbatches( + consumed_samples: int, consistency_check: Optional[bool] = True +) -> None: + """Update number of micro-batches. + + Args: + consumed_samples (int): Number of samples consumed. + consistency_check (bool, optional): Option to check current schedule's consistency. Defaults to True. + """ + _GLOBAL_NUM_MICROBATCHES_CALCULATOR.update(consumed_samples, consistency_check) + + +def init_num_microbatches_calculator( + rank: int, + rampup_batch_size: Optional[List[int]], + global_batch_size: int, + micro_batch_size: int, + data_parallel_size: int, +) -> None: + """Initialize number of micro-batches calculator. + + Args: + rank (int): Rank of the GPU, only rank 0 will log the information. + rampup_batch_size (Optional[List[int]]): Rampup batch size. + global_batch_size (int): Global batch size for the model. + micro_batch_size (int): Micro batch size at initialization. + data_parallel_size (int): Data parallel size. + """ + global _GLOBAL_NUM_MICROBATCHES_CALCULATOR + assert ( + _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None + ), 'num microbatches calculator is already initialized.' + + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + rank, rampup_batch_size, global_batch_size, micro_batch_size, data_parallel_size + ) + + +def build_num_microbatches_calculator( + rank: int, + rampup_batch_size: Optional[List[int]], + global_batch_size: int, + micro_batch_size: int, + data_parallel_size: int, +) -> Union['ConstantNumMicroBatchesCalculator', 'RampupBatchsizeNumMicroBatchesCalculator']: + """Build number of micro-batches calculator. + + Args: + rank (int): Rank of the GPU, only rank 0 will log the information. + rampup_batch_size (Optional[List[int]]): Rampup batch size, should be in format of [start_global_batch_size, batch_size_increment, ramup_samples]. + global_batch_size (int): Global batch size for the model. + micro_batch_size (int): Micro batch size at initialization. + data_parallel_size (int): Data parallel size. + """ + + # Constant num micro-batches. + if rampup_batch_size is None: + num_microbatches_calculator = ConstantNumMicroBatchesCalculator( + global_batch_size, micro_batch_size, data_parallel_size + ) + if rank == 0: + logger.info( + f'setting number of micro-batches to constant {num_microbatches_calculator.get()}' + ) + # Batch size ramp up num micro-batches. + else: + assert len(rampup_batch_size) == 3, ( + 'expected the following ' + 'format: --rampup-batch-size ' + ' ' + ) + start_global_batch_size = int(rampup_batch_size[0]) + batch_size_increment = int(rampup_batch_size[1]) + ramup_samples = int(rampup_batch_size[2]) + if rank == 0: + logger.info( + f'will use batch size rampup starting from global batch size {start_global_batch_size} to global batch size {global_batch_size} with batch size increments {batch_size_increment} over {ramup_samples} samples.' + ) + num_microbatches_calculator = RampupBatchsizeNumMicroBatchesCalculator( + global_batch_size, + micro_batch_size, + data_parallel_size, + start_global_batch_size, + batch_size_increment, + ramup_samples, + ) + + return num_microbatches_calculator + + +class NumMicroBatchesCalculator(ABC): + """Base class for number of micro-batches calculator.""" + + def __init__(self) -> None: + self.num_micro_batches = None + self.current_global_batch_size = None + + def get(self) -> int: + """Get number of micro-batches.""" + return self.num_micro_batches + + def get_current_global_batch_size(self) -> int: + """Get current global batch size.""" + return self.current_global_batch_size + + @abstractmethod + def update(self, consumed_samples, consistency_check) -> None: + pass + + +class ConstantNumMicroBatchesCalculator(NumMicroBatchesCalculator): + """Calculator of number of micro-batches with constant global batch size. + + Args: + global_batch_size (int): Global batch size. + micro_batch_size (int): Micro batch size. + data_parallel_size (int): Data parallel size. + """ + + def __init__( + self, global_batch_size: int, micro_batch_size: int, data_parallel_size: int + ) -> None: + + micro_batch_times_data_parallel = micro_batch_size * data_parallel_size + assert global_batch_size % micro_batch_times_data_parallel == 0, ( + 'global batch size ({}) is not divisible by micro batch size ({})' + ' times data parallel size ({})'.format( + global_batch_size, micro_batch_size, data_parallel_size + ) + ) + + self.num_micro_batches = global_batch_size // micro_batch_times_data_parallel + assert ( + self.num_micro_batches >= 1 + ), 'number of micro-batches should be at least 1, got {}.'.format(self.num_micro_batches) + + self.current_global_batch_size = global_batch_size + self.micro_batch_size = micro_batch_size + + def update(self, consumed_samples, consistency_check) -> None: + pass + + +class RampupBatchsizeNumMicroBatchesCalculator(NumMicroBatchesCalculator): + """Calculator of number of micro-batches with ramp up global batch size. + Over + steps = (global-batch-size - start-batch-size) / batch_size_increment + increment batch size from start-batch-size to global-batch-size using + rampup-samples / steps + samples. + + Args: + global_batch_size (int): Global batch size post rampup. + micro_batch_size (int): Micro batch size. + data_parallel_size (int): Data parallel size. + start_global_batch_size (int): Global batch size to start with. + batch_size_increment (int): Global batch size increments. + ramup_samples (int): Number of samples to use ramp up global + batch size from `start_global_batch_size` to `global_batch_size`. + """ + + def __init__( + self, + global_batch_size: int, + micro_batch_size: int, + data_parallel_size: int, + start_global_batch_size: int, + batch_size_increment: int, + ramup_samples: int, + ) -> None: + assert global_batch_size > 0, 'global batch size should be positive, got {}.'.format( + global_batch_size + ) + assert start_global_batch_size > 0, 'start batch size should be positive, got {}.'.format( + start_global_batch_size + ) + assert batch_size_increment > 0, 'batch size increment should be positive, got {}.'.format( + batch_size_increment + ) + assert ramup_samples >= 0, 'ramp-up samples should be non-negative, got {}.'.format( + ramup_samples + ) + + self.global_batch_size = global_batch_size + self.micro_batch_size = micro_batch_size + self.data_parallel_size = data_parallel_size + self.start_global_batch_size = start_global_batch_size + self.batch_size_increment = batch_size_increment + self.ramup_samples = ramup_samples + + self.micro_batch_times_data_parallel_size = self.micro_batch_size * self.data_parallel_size + assert self.micro_batch_times_data_parallel_size > 0 + + diff_batch_size = self.global_batch_size - self.start_global_batch_size + assert ( + diff_batch_size >= 0 + ), 'expected global batch size to be greater than or equal to start batch size, got {} and {}.'.format( + self.global_batch_size, self.start_global_batch_size + ) + assert diff_batch_size % batch_size_increment == 0, ( + 'expected ' + 'global batch size interval ({}) to be divisible by global batch ' + 'size increment ({})'.format(diff_batch_size, batch_size_increment) + ) + + num_increments = diff_batch_size // self.batch_size_increment + self.rampup_samples_per_increment = self.ramup_samples / num_increments + + # Initialize number of microbatches. + self.update(0, False) + + def update(self, consumed_samples: int, consistency_check: bool) -> None: + """Update number of micro-batches. + + Args: + consumed_samples (int): Number of samples consumed. + consistency_check (bool): Option to check current schedule's consistency. + """ + + # Update current global batch size. + if consumed_samples > self.ramup_samples: + self.current_global_batch_size = self.global_batch_size + else: + steps = int(consumed_samples / self.rampup_samples_per_increment) + self.current_global_batch_size = ( + self.start_global_batch_size + steps * self.batch_size_increment + ) + assert self.current_global_batch_size <= self.global_batch_size + + # Check consistency of the current global batch size. + if consistency_check: + assert ( + self.current_global_batch_size % self.micro_batch_times_data_parallel_size == 0 + ), ( + 'current global ' + 'batch size ({}) is not divisible by micro-batch-size ({}) times' + 'data parallel size ({})'.format( + self.current_global_batch_size, self.micro_batch_size, self.data_parallel_size + ) + ) + + self.num_micro_batches = ( + self.current_global_batch_size // self.micro_batch_times_data_parallel_size + ) diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py new file mode 100644 index 0000000..04bffc8 --- /dev/null +++ b/megatron/core/optimizer/__init__.py @@ -0,0 +1,371 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import logging +from typing import Callable, Dict, List, Optional + +import torch + +try: + from transformer_engine.pytorch.optimizers import FusedAdam as Adam + from transformer_engine.pytorch.optimizers import FusedSGD as SGD +except ImportError: + try: + from apex.optimizers import FusedAdam as Adam + from apex.optimizers import FusedSGD as SGD + except ImportError: + import warnings + + warnings.warn( + f'Transformer Engine and Apex are not installed. Falling back to Torch optimizers.' + ) + + ## apex's FusedAdam is a drop-in replacement for torch's AdamW + ## see https://github.com/NVIDIA/apex/blob/7b73b12361068a10b0f44844534613f252a5ea75/apex/optimizers/fused_adam.py#L16 + from torch.optim import AdamW as Adam, SGD + +from megatron.core import mpu + +from ..distributed import ParamAndGradBuffer +from ..transformer.module import MegatronModule +from ..utils import log_single_rank +from .distrib_optimizer import DistributedOptimizer +from .grad_scaler import ConstantGradScaler, DynamicGradScaler +from .optimizer import ( + ChainedOptimizer, + Float16OptimizerWithFloat16Params, + FP32Optimizer, + MegatronOptimizer, +) +from .optimizer_config import OptimizerConfig + +logger = logging.getLogger(__name__) + + +def _get_param_groups( + model_chunks: List[MegatronModule], + no_weight_decay_cond: Callable, + scale_lr_cond: Callable, + lr_mult: float, + use_decoupled_learning_rate: bool, +) -> List[Dict]: + """Create parameter groups for optimizer. + + Creates parameter groups based on weight decay condition (regularized vs + non regularized), learning rate scale condition (lr vs lr_mult * lr), + and whether it is expert parameters. scale_lr_cond is used during finetuning + where head of the network requires a scaled version of the base learning rate. + + Args: + model_chunks (List[MegatronModule]): model chunks to create parameter + groups for. + no_weight_decay_cond (func): function to determine whether a parameter + should not perform weight decay. + scale_lr_cond (func): function to determine whether a parameter + should have a scaled learning rate. + lr_mult (float): learning rate multiplier for parameters that + satisfy scale_lr_cond. + use_decoupled_learning_rate (bool): true if using decoupled learning rate. + + Returns: + List of parameter groups. + """ + + # Map (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr) to params. + params_map = {} + for model_chunk in model_chunks: + for name, param in model_chunk.named_parameters(): + if not param.requires_grad: + continue + + is_expert_parallel = not getattr(param, 'allreduce', True) + + if no_weight_decay_cond is not None: + no_wd = no_weight_decay_cond(name, param) + else: + # Do not regularize biases and norm parameters. + no_wd = name.endswith(".bias") or len(param.shape) == 1 + + if scale_lr_cond is not None: + scale_lr = scale_lr_cond(name, param) + else: + scale_lr = False + + if not no_wd and not scale_lr: + wd_mult, _lr_mult = 1.0, 1.0 + elif not no_wd and scale_lr: + wd_mult, _lr_mult = 1.0, lr_mult + elif no_wd and not scale_lr: + wd_mult, _lr_mult = 0.0, 1.0 + else: + wd_mult, _lr_mult = 0.0, lr_mult + + is_decoupled_lr = False + # For input/embedding and output layer: embedding.word_embeddings.weight / output_layer.weight. + if use_decoupled_learning_rate and getattr( + param, 'is_embedding_or_output_parameter', False + ): + is_decoupled_lr = True + + key = (wd_mult, _lr_mult, is_expert_parallel, is_decoupled_lr) + if key not in params_map: + params_map[key] = [] + params_map[key].append(param) + + param_groups = [] + for (wd_mult, _lr_mult, is_expert_parallel, is_decoupled_lr), params in params_map.items(): + assert len(params) > 0 + param_groups.append( + { + 'params': params, + 'wd_mult': wd_mult, + 'lr_mult': _lr_mult, + 'is_expert_parallel': is_expert_parallel, + 'is_decoupled_lr': is_decoupled_lr, + } + ) + + return param_groups + + +def _update_min_and_max_lr_in_param_groups( + param_groups: List[Dict], + lr: float, + min_lr: float, + decoupled_lr: Optional[float], + decoupled_min_lr: Optional[float], +) -> List[Dict]: + """ + Updates `max_lr` and `min_lr` values in each parameter group, and returns new list. + By default, each group will use `lr` / `min_lr` as `max_lr` / `min_lr`. + If `decoupled_lr` is provided, then `decoupled_lr` / `decoupled_min_lr` will be used + as `max_lr` / `min_lr` for the input and output layer. + + Args: + param_groups (List): parameter groups whose 'max_lr' and `min_lr` fields need to + be adjusted. + lr (float): learning rate. + min_lr (float): minimum learning rate. + decoupled_lr (Optional[float]): optional decoupled learning rate. + decoupled_min_lr (Optional[float]): optional decoupled minimum learning rate. + + Returns: + List of adjusted parameter groups. + """ + + if decoupled_min_lr is None: + decoupled_min_lr = min_lr + + for param_group in param_groups: + if param_group['is_decoupled_lr']: + assert decoupled_lr is not None + param_group['max_lr'] = decoupled_lr + param_group['min_lr'] = decoupled_min_lr + else: + param_group['max_lr'] = lr + param_group['min_lr'] = min_lr + return param_groups + + +def _get_megatron_optimizer_based_on_param_groups( + config: OptimizerConfig, + param_groups: List, + per_model_buffers: Optional[Dict[int, List[ParamAndGradBuffer]]] = None, + model_parallel_group: Optional[torch.distributed.ProcessGroup] = None, + data_parallel_group: Optional[torch.distributed.ProcessGroup] = None, + data_parallel_group_gloo: Optional[torch.distributed.ProcessGroup] = None, + data_parallel_group_idx: Optional[int] = None, +) -> MegatronOptimizer: + """Get Megatron optimizer based on parameter groups. + + Args: + config (OptimizerConfig): optimizer configuration object. + param_groups (list): list of parameter groups. + per_model_buffers (dict, optional): buffers for distributed optimizer. Defaults to None. + data_parallel_group (torch.distributed.ProcessGroup, optional): data-parallel group for + distributed optimizer. Defaults to None. + data_parallel_group_gloo (torch.distributed.ProcessGroup, optional): gloo data-parallel + group for distributed optimizer. Defaults to None. + data_parallel_group_idx (int, optional): data-parallel group index for distributed + optimizer. Defaults to None. + + Returns: + Instance of MegatronOptimizer. + """ + if config.optimizer == 'adam': + optimizer = Adam( + param_groups, + lr=config.lr, + weight_decay=config.weight_decay, + betas=(config.adam_beta1, config.adam_beta2), + eps=config.adam_eps, + ) + + def init_state_fn(opt): + for group in opt.param_groups: + for p in group['params']: + if len(opt.state[p]) == 0: + opt.state[p]['exp_avg'] = torch.zeros_like(p.data) + opt.state[p]['exp_avg_sq'] = torch.zeros_like(p.data) + + elif config.optimizer == 'sgd': + optimizer = SGD( + param_groups, + lr=config.lr, + weight_decay=config.weight_decay, + momentum=config.sgd_momentum, + ) + init_state_fn = None + else: + raise Exception('{} optimizer is not supported.'.format(config.optimizer)) + + # Mixed precision optimizer. + # - Note: both the Float16Optimizer and the DistributedOptimizer inherit + # from the MixedPrecisionOptimizer, which manages any optimizer where + # the model params and main params are distinct. + if config.fp16 or config.bf16 or config.use_distributed_optimizer: + + # Grad scaler: + # if loss-scale is provided, instantiate the constant scaler. + # if we are using fp16 and loss-scale is not present, use a + # dynamic scaler. + # otherwise we are running in bf16 with no loss-scale so + # leave it as None. + grad_scaler = None + + # Constant loss scale. + if config.loss_scale: + grad_scaler = ConstantGradScaler(config.loss_scale) + + # Dynamic loss scale. + else: + if config.fp16: + grad_scaler = DynamicGradScaler( + initial_scale=config.initial_loss_scale, + min_scale=config.min_loss_scale, + growth_factor=2.0, + backoff_factor=0.5, + growth_interval=config.loss_scale_window, + hysteresis=config.hysteresis, + ) + + optimizer_args = [ + optimizer, + config, + grad_scaler, + init_state_fn, + ] + if config.use_distributed_optimizer: + optimizer = DistributedOptimizer( + *optimizer_args, + per_model_buffers=per_model_buffers, + data_parallel_group=data_parallel_group, + data_parallel_group_gloo=data_parallel_group_gloo, + data_parallel_group_idx=data_parallel_group_idx, + ) + else: + optimizer = Float16OptimizerWithFloat16Params(*optimizer_args) + setattr(optimizer, 'model_parallel_group', model_parallel_group) + else: + # FP32 optimizer. + optimizer = FP32Optimizer( + optimizer, + config, + init_state_fn, + ) + setattr(optimizer, 'model_parallel_group', model_parallel_group) + + return optimizer + + +def get_megatron_optimizer( + config: OptimizerConfig, + model_chunks: List[MegatronModule], + no_weight_decay_cond: Optional[Callable] = None, + scale_lr_cond: Optional[Callable] = None, + lr_mult: float = 1.0, +) -> MegatronOptimizer: + """Retrieve the Megatron optimizer for model chunks. + + We use separate optimizers for expert parameters and non-expert parameters. + + Args: + config (OptimizerConfig): optimizer configuration object. + model_chunks (List[MegatronModule]): model chunks to get optimizer for. + no_weight_decay_cond (func, optional): function to determine whether a parameter + should not perform weight decay. Defaults to None. + scale_lr_cond (func, optional): function to determine whether a parameter + should have a scaled learning rate. Defaults to None. + lr_mult (float, optional): learning rate multiplier for parameters that + satisfy scale_lr_cond. Defaults to 1.0. + + Returns: + Instance of MegatronOptimizer. + """ + + log_single_rank(logger, logging.INFO, f'Setting up optimizer with config {config}') + + # Collect param groups. + param_groups = _get_param_groups( + model_chunks, + no_weight_decay_cond, + scale_lr_cond, + lr_mult, + use_decoupled_learning_rate=config.decoupled_lr is not None, + ) + param_groups = _update_min_and_max_lr_in_param_groups( + param_groups, + lr=config.lr, + min_lr=config.min_lr, + decoupled_lr=config.decoupled_lr, + decoupled_min_lr=config.decoupled_min_lr, + ) + + # Collect grad buffers for distributed optimizer. + per_model_buffers = {} + per_model_ep_buffers = {} + for model_idx, model_chunk in enumerate(model_chunks): + if hasattr(model_chunk, 'buffers'): + per_model_buffers[model_idx] = model_chunk.buffers + per_model_ep_buffers[model_idx] = model_chunk.expert_parallel_buffers + + # Split param groups into dense and MoE params (since data-parallel groups for MoE + # parameters can be different with expert parallelism). + dense_param_groups = list(filter(lambda g: not g['is_expert_parallel'], param_groups)) + moe_param_groups = list(filter(lambda g: g['is_expert_parallel'], param_groups)) + + # Create optimizers. + model_parallel_rank = torch.distributed.get_rank(mpu.get_model_parallel_group()) + optimizers = [ + _get_megatron_optimizer_based_on_param_groups( + config, + param_groups=dense_param_groups, + per_model_buffers=per_model_buffers, + model_parallel_group=mpu.get_model_parallel_group(), + data_parallel_group=mpu.get_data_parallel_group(with_context_parallel=True), + data_parallel_group_gloo=mpu.get_data_parallel_group_gloo(with_context_parallel=True), + data_parallel_group_idx=model_parallel_rank, + ) + ] + if len(moe_param_groups) > 0: + model_parallel_world_size = torch.distributed.get_world_size(mpu.get_model_parallel_group()) + expert_parallel_rank = mpu.get_expert_model_parallel_rank() + optimizers.append( + _get_megatron_optimizer_based_on_param_groups( + config, + param_groups=moe_param_groups, + per_model_buffers=per_model_ep_buffers, + model_parallel_group=mpu.get_model_parallel_group(with_expert_parallel=True), + data_parallel_group=mpu.get_data_modulo_expert_parallel_group( + with_context_parallel=True + ), + data_parallel_group_gloo=mpu.get_data_modulo_expert_parallel_group_gloo( + with_context_parallel=True + ), + data_parallel_group_idx=expert_parallel_rank * model_parallel_world_size + + model_parallel_rank, + ) + ) + + if len(optimizers) == 1: + return optimizers[0] + + return ChainedOptimizer(optimizers) diff --git a/megatron/core/optimizer/clip_grads.py b/megatron/core/optimizer/clip_grads.py new file mode 100644 index 0000000..708ccd0 --- /dev/null +++ b/megatron/core/optimizer/clip_grads.py @@ -0,0 +1,193 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Gradient clipping.""" + +import os +from typing import List, Optional, Union + +import torch +from torch import inf + +try: + from transformer_engine.pytorch.optimizers import ( + multi_tensor_applier, + multi_tensor_l2norm, + multi_tensor_scale, + ) + + l2_norm_impl = multi_tensor_l2norm + multi_tensor_scale_impl = multi_tensor_scale +except ImportError: + try: + import amp_C + from apex.multi_tensor_apply import multi_tensor_applier + + l2_norm_impl = amp_C.multi_tensor_l2norm + multi_tensor_scale_impl = amp_C.multi_tensor_scale + except ImportError: + import warnings + + warnings.warn( + f'Transformer Engine and Apex are not installed. ' + 'Falling back to local implementations of multi_tensor_applier, ' + 'multi_tensor_l2norm, and multi_tensor_scale' + ) + + from megatron.core.utils import ( + local_multi_tensor_applier, + local_multi_tensor_l2_norm, + local_multi_tensor_scale, + ) + + multi_tensor_applier = local_multi_tensor_applier + l2_norm_impl = local_multi_tensor_l2_norm + multi_tensor_scale_impl = local_multi_tensor_scale + + +from ..tensor_parallel import param_is_not_tensor_parallel_duplicate +from ..transformer.module import param_is_not_shared + + +def get_grad_norm_fp32( + grads_for_norm: Union[List[torch.Tensor], torch.Tensor], + norm_type: Union[int, float] = 2, + model_parallel_group: Optional[torch.distributed.ProcessGroup] = None, +) -> float: + """Calculate the norm of gradients in fp32. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. + + Arguments: + grads_for_norm (Iterable[Tensor] or Tensor): an iterable of Tensors or a single + Tensor that will be used for calculating the grad norm. + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + model_parallel_group (group): given the nature of the distributed + optimizer, this is passed as an argument. + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + + if isinstance(grads_for_norm, torch.Tensor): + grads_for_norm = [grads_for_norm] + + # Norm parameters. + norm_type = float(norm_type) + total_norm = 0.0 + + # Calculate norm. + if norm_type == inf: + total_norm = max(grad.abs().max() for grad in grads_for_norm) + total_norm_cuda = torch.tensor([float(total_norm)], dtype=torch.float, device='cuda') + # Take max across all model-parallel GPUs. + torch.distributed.all_reduce( + total_norm_cuda, op=torch.distributed.ReduceOp.MAX, group=model_parallel_group + ) + total_norm = total_norm_cuda[0].item() + + else: + if norm_type == 2.0: + dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') + # Use apex's multi-tensor applier for efficiency reasons. + # Multi-tensor applier takes a function and a list of list + # and performs the operation on that list all in one kernel. + if grads_for_norm: + grad_norm, _ = multi_tensor_applier( + l2_norm_impl, + dummy_overflow_buf, + [grads_for_norm], + False, # no per-parameter norm + ) + else: + grad_norm = torch.tensor([0], dtype=torch.float, device='cuda') + # Since we will be summing across data parallel groups, + # we need the pow(norm-type). + total_norm = grad_norm**norm_type + + else: + for grad in grads_for_norm: + grad_norm = torch.norm(grad, norm_type) + total_norm += grad_norm**norm_type + + # Sum across all model-parallel GPUs. + torch.distributed.all_reduce( + total_norm, op=torch.distributed.ReduceOp.SUM, group=model_parallel_group + ) + total_norm = total_norm.item() ** (1.0 / norm_type) + + return total_norm + + +def clip_grad_by_total_norm_fp32( + parameters: Union[List[torch.Tensor], torch.Tensor], + max_norm: Union[int, float], + total_norm: float, +): + """Clips gradient of an iterable of parameters in fp32 by total norm. + + Note that the gradients are modified in place. + + Args: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized. + max_norm (float or int): max norm of the gradients. + total_norm (float): total norm of the gradients. + """ + # Grads. + grads = [] + for param in parameters: + if param.grad is not None: + assert param.grad.type() == 'torch.cuda.FloatTensor' + grads.append(param.grad.detach()) + + # Scale. + clip_coeff = max_norm / (total_norm + 1.0e-6) + if clip_coeff < 1.0: + dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') + multi_tensor_applier( + multi_tensor_scale_impl, dummy_overflow_buf, [grads, grads], clip_coeff + ) + + +def count_zeros_fp32( + parameters: Union[List[torch.Tensor], torch.Tensor], + model_parallel_group: torch.distributed.ProcessGroup, +) -> float: + """Counts the number of zeros in gradients associated with the passed-in list of + parameters. + + Args: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have the number of zeros in its corresponding + gradient counted. + model_parallel_group (torch.distributed.ProcessGroup, optional): model-parallel + group over which grad norm needs to be aggregated. + """ + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + + # Filter parameters based on: + # - grad should not be none + # - parameter should not be shared + # - should not be a replica due to tensor model parallelism + total_num_zeros = torch.tensor([0.0], dtype=torch.float, device='cuda') + for param in parameters: + grad_not_none = param.grad is not None + is_not_shared = param_is_not_shared(param) + is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) + if grad_not_none and is_not_shared and is_not_tp_duplicate: + grad = param.grad.detach() + num_zeros = grad.numel() - torch.count_nonzero(grad) + total_num_zeros = num_zeros + total_num_zeros + + # Sum across all model-parallel GPUs. + torch.distributed.all_reduce( + total_num_zeros, op=torch.distributed.ReduceOp.SUM, group=model_parallel_group + ) + + total_num_zeros = total_num_zeros.item() + + return total_num_zeros diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py new file mode 100644 index 0000000..d31cbf1 --- /dev/null +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -0,0 +1,1616 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Megatron distributed optimizer.""" + + +import itertools +from dataclasses import replace +from logging import getLogger +from typing import Callable, Dict, List, Optional, Tuple + +import torch + +HAVE_APEX_OR_TE = True +try: + from transformer_engine.pytorch.optimizers import FusedAdam as Adam +except ImportError: + try: + from apex.optimizers import FusedAdam as Adam + except ImportError: + HAVE_APEX_OR_TE = False + +from .. import parallel_state, tensor_parallel +from ..dist_checkpointing import ShardedTensor +from ..dist_checkpointing.dict_utils import nested_values +from ..dist_checkpointing.mapping import ( + LocalNonpersistentObject, + ShardedObject, + ShardedStateDict, + ShardedTensorFactory, +) +from ..dist_checkpointing.optimizer import get_param_id_to_sharded_param_map +from ..dist_checkpointing.utils import extract_sharded_tensors_and_factories +from ..distributed import ParamAndGradBuffer, shard_buffer +from .grad_scaler import MegatronGradScaler +from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper +from .optimizer_config import OptimizerConfig + +logger = getLogger(__name__) + + +class Range: + """ + A range represents a start and end points for indexing a shard + from a full tensor. + """ + + def __init__(self, start: int, end: int): + self.start = start + self.end = end + self.size = end - start + + def normalize(self, start: int = 0): + return Range(start, start + self.size) + + def __str__(self): + return "%d,%d [%d]" % (self.start, self.end, self.size) + + def __len__(self): + return self.end - self.start + + +class DistributedOptimizer(MixedPrecisionOptimizer): + @classmethod + def _build_model_gbuf_param_range_map( + cls, + param_world_index_map: Dict[torch.nn.Parameter, Tuple], + gbuf_world_range: Range, + bucket_offset: int, + ): + """ + Build mapping from param reference to grad buffer shard ranges. + + This method builds a mapping from parameter references to grad + buffer shard ranges, specific to each data-parallel (DP) rank's + set of 'owned' parameters. Each grad buffer (padded to be an even + multiple of DP-world-size) is conceptually divided into DP-world-size + contiguous regions, where each DP rank 'owns' a contiguous regions. + Ownership in this sense means DP rank is responsible for reducing + the relevant subset of grads, and updating the relevant subset of + params. + + This conceptual partitioning of the grad buffer does NOT respect + parameter boundaries, and as such it is assumed that each created + range references a shard (or subset) of the full parameter. It is + easiest to think of each DP rank as operating (i.e., reducing, + gathering) purely on views into the grad buffer, for all model-to- + main & main-to-model operations. + + This method creates four ranges: + - The param's range within the entire grad buffer (i.e., world index). + - The param's range within the relevant grad bucket's buffer. + - The param's range within the DP rank's local view of the grad buffer. + - The param's range within itself (i.e., its shard). + """ + + # Param range map. + param_range_map = {} + for param, param_world_indexes in param_world_index_map.items(): + + # Param range. + param_world_start, param_world_end, _ = param_world_indexes + param_local_start = max(0, param_world_start - gbuf_world_range.start) + param_local_end = min(gbuf_world_range.size, param_world_end - gbuf_world_range.start) + + # Add param, if within local gbuf range. + if param_local_end > param_local_start: + param_local_range = Range(param_local_start, param_local_end) + param_world_range = param_local_range.normalize( + param_local_start + gbuf_world_range.start + ) + param_world_range_in_bucket = Range( + param_world_range.start - bucket_offset, param_world_range.end - bucket_offset + ) + sub_param_start = max(0, gbuf_world_range.start - param_world_start) + sub_param_range = param_local_range.normalize(sub_param_start) + param_range_map[param] = { + "gbuf_world": param_world_range, + "gbuf_world_in_bucket": param_world_range_in_bucket, + "gbuf_local": param_local_range, + "param": sub_param_range, + } + + return param_range_map + + @classmethod + def _build_model_gbuf_range(cls, param_and_grad_buffer: ParamAndGradBuffer, bucket_index: int): + """ + Build mapping between params and their grad buffers. + + This method does the initial setup for the method above. This setup + includes determining the shard ranges into the param_and_grad_buffer + for each data-parallel (DP) rank. Each DP rank keeps range info for + all other DP ranks, for the purpose of creating args for + reduce-scatter and all-gather. + """ + + data_parallel_rank = torch.distributed.get_rank(param_and_grad_buffer.data_parallel_group) + data_parallel_world_size = param_and_grad_buffer.data_parallel_group.size() + + bucket = param_and_grad_buffer.buckets[bucket_index] + gbuf_size = bucket.grad_data.numel() + assert ( + gbuf_size % data_parallel_world_size == 0 + ), f"Each bucket's buffer size should be divisible by {data_parallel_world_size}" + max_gbuf_range_size = gbuf_size // data_parallel_world_size + + # All world ranges (i.e., across all data parallel ranks). + gbuf_world_all_ranges = [] + for r in range(data_parallel_world_size): + # Compute start of chunk in this bucket. + gbuf_world_start = r * max_gbuf_range_size + gbuf_world_end = min(gbuf_size, gbuf_world_start + max_gbuf_range_size) + # Add bucket's offset in grad buffer. + gbuf_world_range = Range( + gbuf_world_start + bucket.offset, gbuf_world_end + bucket.offset + ) + gbuf_world_all_ranges.append(gbuf_world_range) + + # Local DP's ranges. + gbuf_world_range = gbuf_world_all_ranges[data_parallel_rank] + + # Get each param's ranges. + param_range_map = cls._build_model_gbuf_param_range_map( + param_and_grad_buffer.param_index_map, gbuf_world_range, bucket.offset + ) + + # Group into dict. + data = { + "param_map": param_range_map, + } + + return data + + @classmethod + def _build_gbuf_range_map(cls, param_and_grad_buffer: ParamAndGradBuffer): + """ + Build mapping between params and their grad buffers. These mappings are + partitioned according to data type. + + Iterate through all buckets of grad buffer to construct param ranges + that this rank "owns" (the dp_rank'th shard of each bucket, where each + shard is 1/dp_world_size of the bucket). + + Args: + param_and_grad_buffer (ParamAndGradBuffer): buffer to build mapping for. + """ + return { + (param_and_grad_buffer.param_dtype, param_and_grad_buffer.grad_dtype): [ + cls._build_model_gbuf_range(param_and_grad_buffer, bucket_index) + for bucket_index in range(len(param_and_grad_buffer.buckets)) + ] + } + + @classmethod + def _build_model_param_gbuf_map( + cls, gbuf_ranges: List[Dict] + ) -> Dict[torch.nn.Parameter, Tuple]: + """ + Create a reverse of the gbuf_ranges, for referencing in opposite direction. + """ + param_gbuf_map = {} + for gbuf_index, gbuf_range_map in enumerate(gbuf_ranges): + for dtype, gbuf_range_map_for_all_buckets in gbuf_range_map.items(): + for bucket_index, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets): + for param, _ in gbuf_range_map["param_map"].items(): + assert ( + param not in param_gbuf_map + ), "Param should not be in param_gbuf_map; each param only belongs to a single bucket" + param_gbuf_map[param] = (gbuf_index, dtype, bucket_index) + return param_gbuf_map + + @classmethod + def _build_optimizer_group_ranges(cls, param_groups: List[Dict], gbuf_ranges: List[Dict]): + """ + Create optimizer groups. + + Given the set of parameter shard ranges that are owned by the current + data-parallel (DP) rank, gather the set of parameters that will be + used (in the method below) to create the current DP's optimizer + groups. + """ + + # Param group map. + # World param group map. + # - Store a mapping of for all parameters + # across all DP ranks. This is necessary because it is our first + # cross reference between the DDP mappings and the optimizer group + # parameters. This mapping only for use in the next step of building + # the local mapping over this DP rank's parameters. + world_param_group_map = {} + for group_index, group in enumerate(param_groups): + for param in group["params"]: + assert param.requires_grad + world_param_group_map[param] = group_index + + # Optimizer group ranges & param-group mapping. + # - Build a mapping from groups to their contained parameters, and also + # from parameters to their containing group index and order within + # the group. The group index and order are particularly important for + # saving and loading checkpoints. + local_param_group_map = {} + group_ranges = [{"params": []} for _ in param_groups] + for gbuf_range_map in gbuf_ranges: + for dtype, gbuf_range_map_for_all_buckets in gbuf_range_map.items(): + for gbuf_range_map in gbuf_range_map_for_all_buckets: + for param in gbuf_range_map["param_map"]: + group_index = world_param_group_map[param] + group_range = group_ranges[group_index] + group_range["params"].append(param) + local_param_group_map[param] = (group_index, len(group_range["params"]) - 1) + + # Squeeze zero-size group ranges. + for group_index, group_range in enumerate(group_ranges): + group_range["orig_group"] = param_groups[group_index] + group_range["orig_group_idx"] = param_groups[group_index] + + return local_param_group_map, group_ranges + + @classmethod + def _build_model_and_main_param_groups( + cls, + gbuf_ranges: List[Dict], + param_gbuf_map: Dict[torch.nn.Parameter, Tuple], + opt_group_ranges: List, + ): + """ + Create main parameter groups needed for the optimizer step. + + These groups encompass both: 1) groups used by this class, for + reducing/gather, and 2) groups used by the inner optimizer for the + parameter update. Given that the conceptual grad buffer partitioning + (created in earlier method) doesn't respect parameter boundaries, + the optimizer operates on shards of the model parameters, rather than + the full parameters. + """ + + # Parameter groups: + # model_float16_groups: original float16 parameters + # model_fp32_groups: original fp32 parameters + # shard_float16_groups: shards of original float16 parameters + # shard_fp32_groups: shards of original fp32 parameters + # shard_fp32_from_float16_groups: fp32 copy of float16 parameters + model_float16_groups = [] + model_fp32_groups = [] + shard_float16_groups = [] + shard_fp32_groups = [] + shard_fp32_from_float16_groups = [] + + # Allocate (or slice) each group's param shard. + for group_range in opt_group_ranges: + + # Params of this group. + model_float16_params_this_group = [] + model_fp32_params_this_group = [] + shard_float16_params_this_group = [] + shard_fp32_params_this_group = [] + shard_fp32_from_float16_params_this_group = [] + model_float16_groups.append(model_float16_params_this_group) + model_fp32_groups.append(model_fp32_params_this_group) + shard_float16_groups.append(shard_float16_params_this_group) + shard_fp32_groups.append(shard_fp32_params_this_group) + shard_fp32_from_float16_groups.append(shard_fp32_from_float16_params_this_group) + + for model_param in group_range["params"]: + + assert model_param.requires_grad + + gbuf_index, dtype, bucket_index = param_gbuf_map[model_param] + gbuf_range = gbuf_ranges[gbuf_index][dtype][bucket_index] + param_range = gbuf_range["param_map"][model_param]["param"] + + # fp16, bf16 params. + if model_param.type() in ['torch.cuda.HalfTensor', 'torch.cuda.BFloat16Tensor']: + + # Clone model -> main. + shard_model_param = model_param.detach().view(-1)[ + param_range.start : param_range.end + ] + shard_main_param = shard_model_param.clone().float() + tensor_parallel.copy_tensor_model_parallel_attributes( + shard_model_param, model_param + ) + tensor_parallel.copy_tensor_model_parallel_attributes( + shard_main_param, model_param + ) + if hasattr(model_param, 'shared'): + shard_model_param.shared = model_param.shared + shard_main_param.shared = model_param.shared + + # Add to group. + model_float16_params_this_group.append(model_param) + shard_float16_params_this_group.append(shard_model_param) + shard_fp32_from_float16_params_this_group.append(shard_main_param) + + # fp32 params. + elif model_param.type() == 'torch.cuda.FloatTensor': + shard_model_param = model_param.view(-1)[param_range.start : param_range.end] + model_fp32_params_this_group.append(model_param) + shard_fp32_params_this_group.append(shard_model_param) + tensor_parallel.copy_tensor_model_parallel_attributes( + shard_model_param, model_param + ) + if hasattr(model_param, 'shared'): + shard_model_param.shared = model_param.shared + + else: + raise TypeError( + 'Wrapped parameters must be one of ' + 'torch.cuda.FloatTensor, ' + 'torch.cuda.HalfTensor, or ' + 'torch.cuda.BFloat16Tensor. ' + 'Received {}'.format(model_param.type()) + ) + + # Update optimizer's params. + group_range["orig_group"]["params"] = [ + *shard_fp32_params_this_group, + *shard_fp32_from_float16_params_this_group, + ] + + return ( + model_float16_groups, + model_fp32_groups, + shard_float16_groups, + shard_fp32_groups, + shard_fp32_from_float16_groups, + ) + + def __init__( + self, + optimizer: torch.optim.Optimizer, + config: OptimizerConfig, + grad_scaler: MegatronGradScaler, + init_state_fn: Optional[Callable], + per_model_buffers: Dict[int, List[ParamAndGradBuffer]], + data_parallel_group: torch.distributed.ProcessGroup, + data_parallel_group_gloo: torch.distributed.ProcessGroup, + data_parallel_group_idx: int, + ): + """ + Distributed optimizer, for all data types (fp16, bf16, and fp32). + + The steps in this method create the core mapping between param and grad buffers, + parameters, and parameter shard ranges, that is needed for converting between model + param indexes and main parameter shard indexes. This method also updates the optimizer + parameter groups with the newly created shards. + + Args: + optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD. + config (OptimizerConfig): configuration object for optimizer. + grad_scaler (MegatronGradScaler): used for scaling gradients. Note that + this can be None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constant gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + init_state_fn (Callable, optional): function to initialize state in the optimizer. + per_model_buffers (Dict[int, List[ParamAndGradBuffer]]): the implementation of the + distributed optimizer is centered on using a contiguous buffer for + communicating grads & params between the model state and the optimizer state. + You can find a more detailed description in + https://github.com/NVIDIA/Megatron-LM/blob/main/docs/source/distrib_optimizer.md. + data_parallel_group (torch.distributed.ProcessGroup): data-parallel group to use to + all-gather params after optimizer.step(). + data_parallel_group_gloo (torch.distributed.ProcessGroup): gloo data-parallel group + (used in checkpoint loading and saving). + data_parallel_group_idx (int): index in data-parallel group (used by + distributed checkpointing logic). + """ + + assert ( + HAVE_APEX_OR_TE + ), f'Please install Apex or Transformer Engine to use DistributedOptimizer.' + + super().__init__( + optimizer, + config, + grad_scaler, + init_state_fn, + ) + + assert isinstance( + optimizer, Adam + ), "Only Adam currently supported, due to checkpointing requirements." + + # Model grad buffer ranges. + assert per_model_buffers is not None, "per_model_buffers must be provided" + self.buffers = list(itertools.chain(*per_model_buffers.values())) + self.per_model_buffers = per_model_buffers + self.data_parallel_group = data_parallel_group + self.data_parallel_group_gloo = data_parallel_group_gloo + self.data_parallel_group_idx = data_parallel_group_idx + self.gbuf_idx_to_model_idx_map = {} + gbuf_idx = 0 + for model_idx, buffers in self.per_model_buffers.items(): + for _ in buffers: + self.gbuf_idx_to_model_idx_map[gbuf_idx] = model_idx + gbuf_idx += 1 + self.gbuf_ranges = [] + self.per_bucket_numel = [] + self.per_bucket_numel_unpadded = [] + for buffer in self.buffers: + + self.per_bucket_numel.append( + { + (buffer.param_dtype, buffer.grad_dtype): [ + bucket.grad_data.numel() for bucket in buffer.buckets + ] + } + ) + self.per_bucket_numel_unpadded.append( + { + (buffer.param_dtype, buffer.grad_dtype): [ + bucket.numel_unpadded for bucket in buffer.buckets + ] + } + ) + self.gbuf_ranges.append(self._build_gbuf_range_map(buffer)) + self.model_param_gbuf_map = self._build_model_param_gbuf_map(self.gbuf_ranges) + + # Optimizer ranges. + ( + self.model_param_group_index_map, + self.opt_group_ranges, + ) = self._build_optimizer_group_ranges(self.optimizer.param_groups, self.gbuf_ranges) + + # Allocate main param shards. + ( + self.model_float16_groups, + self.model_fp32_groups, + self.shard_float16_groups, + self.shard_fp32_groups, + self.shard_fp32_from_float16_groups, + ) = self._build_model_and_main_param_groups( + self.gbuf_ranges, self.model_param_gbuf_map, self.opt_group_ranges + ) + + # Now construct data structures to manage all-gather handles. + self.all_gather_handles = [] + self.all_gather_handle_index_to_bucket_index_map = [] + self.model_index_to_all_gather_handle_index_map = {} + self.all_gather_handle_indices = [] + self.param_to_all_gather_handle_index_map = {} + + self.pbuf_view_items = self._get_model_param_buffer_dp_views() + for gbuf_index, dtype, bucket_index, _, _ in self.pbuf_view_items: + self.all_gather_handle_index_to_bucket_index_map.append( + (gbuf_index, dtype, bucket_index) + ) + all_gather_handle_index = len(self.all_gather_handle_index_to_bucket_index_map) - 1 + self.all_gather_handles.append(None) + + # Store all all_gather_handle_indices. + model_idx = self.gbuf_idx_to_model_idx_map[gbuf_index] + if model_idx not in self.model_index_to_all_gather_handle_index_map: + self.model_index_to_all_gather_handle_index_map[model_idx] = [] + self.model_index_to_all_gather_handle_index_map[model_idx].append( + all_gather_handle_index + ) + + for param in self.buffers[gbuf_index].buckets[bucket_index].params_list: + self.param_to_all_gather_handle_index_map[param] = all_gather_handle_index + self.num_all_gather_handles = len(self.all_gather_handle_index_to_bucket_index_map) + + self.overlap_param_gather = self.config.overlap_param_gather + self.remove_pre_hook_handle = None + if self.overlap_param_gather: + self.enable_pre_hook() + + self.update_successful = False + + # Update optimizer groups. + # - Also, leverage state_dict() and load_state_dict() to + # recast preexisting per-param state tensors. + self.optimizer.param_groups = [g["orig_group"] for g in self.opt_group_ranges] + self.optimizer.load_state_dict(self.optimizer.state_dict()) + + def enable_pre_hook(self): + """ + Enable forward pre-hook needed for param all-gather overlap with forward compute. + """ + assert self.remove_pre_hook_handle is None + self.remove_pre_hook_handle = torch.nn.modules.module.register_module_forward_pre_hook( + self._make_forward_pre_hook() + ) + + def disable_pre_hook(self): + """ + Disable forward pre-hook needed for param all-gather overlap with forward compute. + """ + assert self.remove_pre_hook_handle is not None + self.remove_pre_hook_handle.remove() + self.remove_pre_hook_handle = None + + # Make sure all-gathers are completed as needed. + self._reset_metadata_and_sync_gather_all_model_params(force_sync=True) + + def _get_model_param_range_map(self, param: torch.nn.Parameter): + """ + Given a model param, get the index sub-range of the param that this + data-parallel rank owns. + """ + gbuf_index, dtype, bucket_index = self.model_param_gbuf_map[param] + gbuf_range_map = self.gbuf_ranges[gbuf_index][dtype][bucket_index] + param_range_map = gbuf_range_map["param_map"][param] + return param_range_map + + def get_model_parallel_group(self) -> torch.distributed.ProcessGroup: + """ + With the distributed optimizer, the model parallel group is the + entire world. + """ + return None + + def state_dict(self): + """ + The state dict contains all non-DP-rank-dependent (i.e., non-parameter- + related) optimizer variables. The returned state dict can be stored in + the standard model/RNG checkpoint file. The parameter and dependent + optimizer state (e.g., exp_avg, exp_avg_sq) are stored in a separate + checkpoint file by calling 'save_parameter_state()'. + """ + + state_dict = {} + + # Optimizer state (do not store parameter state here). + state_dict['optimizer'] = { + k: v for k, v in self.optimizer.state_dict().items() if k != "state" + } + for param_group in state_dict["optimizer"]["param_groups"]: + del param_group["params"] + + # Grad scaler state. + if self.grad_scaler: + state_dict['grad_scaler'] = self.grad_scaler.state_dict() + + return state_dict + + def load_state_dict(self, state_dict): + """Load the state dict. + + As detailed in state_dict(), the state dict contains all non- + parameter-related variables. This method is notably longer than + state_dict(), because the Torch optimizers state has yet to be + allocated at this point, and so we must do a cross referencing between + the optimizers state (and the ordering it expects for parameter state) + and this DP rank's shards. The optimizer at this point does not contain + any tensor dimension information, so we must get these dimensions from + the DP shards mapped during DistributedOptimizer.__init__(). + + The tensor parameter state is loaded via load_parameter_state(), and + so this method also must populate the loaded state dict with dummy + tensor data (i.e., via torch.empty() below). This will be overwritten + during load_parameter_state(). + + ** Note: Torch optimizer's state structure. ** + The Torch optimizer stores its state in two levels. The top level is a + list of groups, where each group contains a list of integer indexes + (corresponding to parameters) that index into a master parameter list + that is shared by all groups. As such, three values are necessary for + maintaining this ordering: + + - group_index : The group to which a parameter belongs. + - group_order : The index of a parameter within its group. + - state_order : The index of a parameter within the shared parameter + list. + """ + + # Get the Torch optimizer's state dict. + # - This 'inner' optimizer at this point is unallocated, and only + # contains an integer odering of parameters within each group, and + # the ordering of parameters within its flattened parameter state + # list. + inner_state_dict = self.optimizer.state_dict() + state_dict_param_groups = [ + { + **group, + "params": list(inner_state_dict["param_groups"][idx]["params"]), + } + for idx, group in enumerate(state_dict["optimizer"]["param_groups"]) + ] + + # Allocate 'dummy' data for optimizer state (i.e., torch.empty() below) + # - Real data is overwritten during load_parameter_state(). + state_dict_state = [] + for gbuf_range_maps in self.gbuf_ranges: + for gbuf_range_map_for_all_buckets in gbuf_range_maps.values(): + for gbuf_range_map in gbuf_range_map_for_all_buckets: + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + + # Get parameter ordering information (see method docstring + # for details). + group_index, group_order = self.model_param_group_index_map[model_param] + state_order = inner_state_dict["param_groups"][group_index]["params"][ + group_order + ] + + # Allocate dummy tensors. + numel = len(param_range_map["gbuf_world"]) + init_shard = lambda: torch.empty( + (numel,), dtype=torch.float32, device=torch.cuda.current_device() + ) + + state_dict_state.append( + ( + state_order, + { + "exp_avg": init_shard(), + "exp_avg_sq": init_shard(), + }, + ) + ) + + # Sort by state order (see method docstring for details). + state_dict_state.sort(key=lambda s: s[0]) + state_dict_state = {s[0]: s[1] for s in state_dict_state} + + # Optimizer. + self.optimizer.load_state_dict( + { + "state": state_dict_state, + "param_groups": state_dict_param_groups, + } + ) + + # Grad scaler. + if 'grad_scaler' not in state_dict: + if self.config.fp16: + logger.info( + '***WARNING*** found an old checkpoint, will not ' 'load grad scaler ...' + ) + else: + if self.grad_scaler: + self.grad_scaler.load_state_dict(state_dict['grad_scaler']) + else: + logger.info( + '***WARNING*** fould the grad scaler in the ' + 'checkpoint but it is None in the class. ' + 'Skipping loading grad scaler ...' + ) + + if 'param_state' in state_dict: + assert 'param_state_sharding_type' in state_dict, state_dict.keys() + param_state = state_dict['param_state'] + sharding_type = state_dict['param_state_sharding_type'] + logger.info(f'Loading distributed optimizer sharded state of type {sharding_type}') + if sharding_type == 'dp_zero_gather_scatter': + self.load_parameter_state_from_dp_zero(param_state) + elif sharding_type == 'fully_sharded_bucket_space': + self.load_parameter_state_from_fs_bucket_space(param_state) + elif sharding_type == 'fully_sharded_model_space': + self.load_parameter_state_from_fs_model_space(param_state) + else: + raise NotImplementedError(f'Unknown sharding_type: {sharding_type}') + + def get_parameter_state_fs_bucket_space(self): + """Get internal representation of parameter state without any copies and modifications. + + This is referred to as "fully sharded bucket space" because the optimizer state is + fully sharded (e.g. no gather involved) and bucket-centric (the state + follows the internal structure of the Distributed Optimizer buckets) + as opposed to model-centric (typical structure of PyT optimizers) + """ + state = { + "per_bucket_numel": self.per_bucket_numel, + "per_bucket_numel_unpadded": self.per_bucket_numel_unpadded, + } + for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + + # Iterate grad buffers (by data type). + dtype_state = {} + assert len(gbuf_range_maps) == 1, "single dtype supported, for now." + for dtype, gbuf_range_map_for_all_buckets in gbuf_range_maps.items(): + buckets_state = [] + for bucket_idx, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets): + bucket_state = [] + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. + group_index, group_order = self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups[group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = { + "param": main_param, + **optim_state, + "gbuf_local_start": param_range_map["gbuf_local"].start, + "gbuf_local_end": param_range_map["gbuf_local"].end, + } + bucket_state.append(tensors) + buckets_state.append(bucket_state) + dtype_state[dtype] = buckets_state + state[gbuf_idx] = dtype_state + return state + + def get_parameter_state_dp_zero(self): + """Get parameter state (i.e., parameter & optimizer tensors). + + This method performs two steps: + - For each DP rank, copy param & optimizer shards to contiguous CPU + buffers (e.g., one buffer each for main_param, exp_avg, and + exp_avg_sq). + - Gather contiguous buffers on DP rank 0 and concatenate to world + buffers. + """ + + # Data parallelism variables. + data_parallel_world_size = self.data_parallel_group_gloo.size() + data_parallel_rank = torch.distributed.get_rank(self.data_parallel_group_gloo) + data_parallel_group_gloo = self.data_parallel_group_gloo + data_parallel_global_ranks = torch.distributed.get_process_group_ranks( + self.data_parallel_group_gloo + ) + + # Collect param states. + state = { + "buckets_coalesced": True, + } + for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + + # Iterate grad buffers (by data type). + dtype_state = {} + assert len(gbuf_range_maps) == 1, "single dtype supported, for now." + for dtype, gbuf_range_map_for_all_buckets in gbuf_range_maps.items(): + buffer_numel_unpadded = self.buffers[gbuf_idx].numel_unpadded + # Create coalesced tensors for all state related to parameters in this buffer. + world_tensors = {} + if data_parallel_rank == 0: + world_tensors = { + key: torch.zeros( + (buffer_numel_unpadded,), dtype=torch.float32, device="cpu" + ) + for key in ("param", "exp_avg", "exp_avg_sq") + } + world_tensors["numel_unpadded"] = buffer_numel_unpadded + offset_in_world_tensors = 0 + for bucket_idx, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets): + + # Compute local DP contiguous shard's size. + gbuf_world_numel = self.buffers[gbuf_idx].buckets[bucket_idx].grad_data.numel() + assert gbuf_world_numel % data_parallel_world_size == 0 + gbuf_local_numel = gbuf_world_numel // data_parallel_world_size + + gbuf_world_numel_unpadded = ( + self.buffers[gbuf_idx].buckets[bucket_idx].numel_unpadded + ) + assert gbuf_world_numel_unpadded <= gbuf_world_numel + + local_shards = { + key: torch.zeros((gbuf_local_numel,), dtype=torch.float32, device="cpu") + for key in ("param", "exp_avg", "exp_avg_sq") + } + + # Build contiguous DP rank shards (for param + optim states). + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. + group_index, group_order = self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups[group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = { + "param": main_param, + **optim_state, + } + + # Copy states into contiguous shard. + gbuf_local_start = param_range_map["gbuf_local"].start + gbuf_local_end = param_range_map["gbuf_local"].end + for key in local_shards: + local_shards[key][gbuf_local_start:gbuf_local_end].data.copy_( + tensors[key].detach().cpu() + ) + + # Gather contiguous shards on DP rank 0. + for key, send_tensor in local_shards.items(): + + # Gather tensor list. + if data_parallel_rank == 0: + recv_tensors = [ + torch.zeros((gbuf_local_numel,), dtype=torch.float32, device="cpu") + for _ in range(data_parallel_world_size) + ] + else: + recv_tensors = None + + # Gather. + torch.distributed.gather( + send_tensor, + recv_tensors, + data_parallel_global_ranks[0], + data_parallel_group_gloo, + ) + + # Concatenate. + if data_parallel_rank == 0: + recv_tensors_concatenated = torch.cat(recv_tensors) + # Copy this bucket's collected all-gather tensors into the right place in the + # tensor for the buffer. The tensor for the buffer gets rid of the padding + # between buckets. + start = offset_in_world_tensors + end = offset_in_world_tensors + gbuf_world_numel_unpadded + world_tensors[key][start:end].copy_( + recv_tensors_concatenated[:gbuf_world_numel_unpadded] + ) + + offset_in_world_tensors += gbuf_world_numel_unpadded + + # Collect world state. + dtype_state[dtype] = world_tensors + state[gbuf_idx] = dtype_state + + return state + + def save_parameter_state(self, filename: str): + """Save the distributed parameter state on DP rank 0. + + Args: + filename (str): path to save parameter state to. + """ + + state_dict = self.get_parameter_state_dp_zero() + if torch.distributed.get_rank(self.data_parallel_group) == 0: + torch.save(state_dict, filename) + + def sharded_state_dict( + self, + model_sharded_state_dict: ShardedStateDict, + is_loading: bool = False, + sharding_type: str = 'fully_sharded_model_space', + ): + """ + Chooses between 3 param state sharding implementations as requested by `sharding_type`. + + Regular state dict parameters are saved on DP rank 0 and loaded on all ranks. + """ + if not is_loading and sharding_type == 'fully_sharded_bucket_space': + logger.warning( + '`fully_sharded_bucket_space` sharding for DistributedOptimizer' + ' checkpoint is deprecated and will be removed in the future.' + ' Please switch to `full_sharded_model_space`.' + ) + + state_dict = self.state_dict() + if sharding_type != 'fully_sharded_model_space': + # State dict differs between different model parallel groups + state_dict = { + k: ShardedObject( + f'optimizer.distributed.dp_group_idx_{self.data_parallel_group_idx}.{k}', + v, + (1,), + (0,), + replica_id=torch.distributed.get_rank(self.data_parallel_group), + ) + for k, v in state_dict.items() + } + + if is_loading: + self.init_state_fn(self.optimizer) + + if sharding_type == 'fully_sharded_bucket_space': + param_state = self.sharded_param_state_fs_bucket_space( + model_sharded_state_dict, is_loading + ) + elif sharding_type == 'dp_zero_gather_scatter': + param_state = self.sharded_param_state_dp_zero(model_sharded_state_dict, is_loading) + elif sharding_type == 'fully_sharded_model_space': + param_state = self.sharded_param_state_fs_model_space( + model_sharded_state_dict, is_loading + ) + else: + raise NotImplementedError(f'Unknown sharding_type: {sharding_type}') + + state_dict['param_state'] = param_state + state_dict['param_state_sharding_type'] = sharding_type + return state_dict + + def sharded_param_state_dp_zero( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ): + """Naive implementation which reuses gather/scatter from the legacy ckpt format. + + During saving, gathers the parameters state on DP rank 0 and saves a ShardedObject + with fixed TPxPP structure. During loading, loads the saved data on DP rank 0 + (None on other ranks). Relies on the parameters scatter done in load_state_dict. + """ + if is_loading: + param_state_data = None + else: + # Gather on rank 0 + param_state_data = self.get_parameter_state_dp_zero() + + if torch.distributed.get_rank(self.data_parallel_group) == 0: + # Fixed TPxPP. Save on DP rank 0 only + param_state = ShardedObject( + f'optimizer.distributed.dp_group_idx_{self.data_parallel_group_idx}.param_state', + param_state_data, + (1,), + (0,), + ) + else: + # DP ranks > 0 don't save. During loading, the param_state needs to be None. + param_state = LocalNonpersistentObject(None) + + return param_state + + def sharded_param_state_fs_bucket_space( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ): + """Sharded state dict where each noncontiguous buffer is a separate ShardedTensor. + + Results in fully parallel save and load without any inter-process + communication or intermediate buffers/copies. + """ + data_parallel_rank = torch.distributed.get_rank(self.data_parallel_group) + data_parallel_world_size = torch.distributed.get_world_size(self.data_parallel_group) + + state = self.get_parameter_state_fs_bucket_space() + # per_bucket_numel metadata is saved separately for each TPxPP domain. + for per_bucket_key in ('per_bucket_numel', 'per_bucket_numel_unpadded'): + state[per_bucket_key] = ShardedObject( + f'optimizer.distributed.dp_group_idx_{self.data_parallel_group_idx}.{per_bucket_key}', + state[per_bucket_key], + (1,), + (0,), + replica_id=data_parallel_rank, + ) + + for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + for dtype, gbuf_range_map_for_all_buckets in state[gbuf_idx].items(): + for bucket_idx, bucket_state in enumerate(gbuf_range_map_for_all_buckets): + # Compute local DP contiguous shard's size. + gbuf_world_numel = self.buffers[gbuf_idx].buckets[bucket_idx].grad_data.numel() + assert gbuf_world_numel % data_parallel_world_size == 0 + gbuf_local_numel = gbuf_world_numel // data_parallel_world_size + + sharded_bucket_key = f'optimizer.distributed.dp_group_idx_{self.data_parallel_group_idx}.gbuf_idx_{gbuf_idx}.dtype_{dtype}.bucket_idx_{bucket_idx}' + + # The global ckpt tensors must be fully covered. + # We add extra empty padding if necessary + assert bucket_state, 'empty bucket encountered' + + # Insert padding between parameter tensors to ensure full coverage as needed. + all_pad_tensors = {} + for i in range(len(bucket_state) - 1): + next_param_start = bucket_state[i + 1]['gbuf_local_start'] + cur_param_end = bucket_state[i]['gbuf_local_end'] + if next_param_start != cur_param_end: + pad_tensors = { + k: torch.empty( + next_param_start - cur_param_end, + dtype=v.dtype, + device=v.device, + ) + for k, v in bucket_state[i].items() + if isinstance(v, torch.Tensor) + } + all_pad_tensors[i + 1] = { + **pad_tensors, + 'gbuf_local_start': cur_param_end, + 'gbuf_local_end': next_param_start, + 'padding': True, + } + + # Insert from end so that insertion positions are still correct. + indices_to_insert = sorted(list(all_pad_tensors.keys())) + for index_to_insert in reversed(indices_to_insert): + bucket_state.insert(index_to_insert, all_pad_tensors[index_to_insert]) + + if bucket_state[-1]['gbuf_local_end'] != gbuf_local_numel: + pad_tensors = { + k: torch.empty( + gbuf_local_numel - bucket_state[-1]['gbuf_local_end'], + dtype=v.dtype, + device=v.device, + ) + for k, v in bucket_state[-1].items() + if isinstance(v, torch.Tensor) + } + bucket_state.append( + { + **pad_tensors, + 'gbuf_local_start': bucket_state[-1]['gbuf_local_end'], + 'gbuf_local_end': gbuf_local_numel, + 'padding': True, + } + ) + + # Each tensor is mapped to a slice (`flattened_range`) + # of a DP-local shard of size `gbuf_local_numel`. + for bucket_params_idx in range(len(bucket_state)): + tensors = bucket_state[bucket_params_idx] + gbuf_local_start = tensors.pop('gbuf_local_start') + gbuf_local_end = tensors.pop('gbuf_local_end') + if 'padding' not in tensors: + tensors['padding'] = False + + for key in tensors: + if key == 'padding': + tensors[key] = LocalNonpersistentObject(tensors[key]) + continue + assert tensors[key].shape == (gbuf_local_end - gbuf_local_start,), ( + tensors[key].shape, + gbuf_local_start, + gbuf_local_end, + ) + + tensors[key] = ShardedTensor( + f'{sharded_bucket_key}.{key}', + tensors[key], + tensors[key].dtype, + (gbuf_local_numel,), + (data_parallel_world_size * gbuf_local_numel,), + (data_parallel_rank * gbuf_local_numel,), + axis_fragmentations=(data_parallel_world_size,), + flattened_range=slice(gbuf_local_start, gbuf_local_end), + allow_shape_mismatch=True, + ) + return state + + def sharded_param_state_fs_model_space( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ): + """Sharded state dict where each buffer is mapped to corresponding model param. + + In this approach the optimizer state tensors are directly related to model parameters + by linking them with metadata from `model_sharded_state_dict`. + This will allow changing TP and PP while using DistOpt (as with other optimizers). + """ + + param_to_sharded_metadata = {} + model_sharded_state_dict, _ = extract_sharded_tensors_and_factories( + model_sharded_state_dict + ) + for sh_base in nested_values(model_sharded_state_dict): + param_to_sharded_metadata[sh_base.data] = sh_base + + prefix = 'optimizer.state' + state = {} + param_idx = 0 # this is not stored in the checkpoint, used only to identify params in `sharded_param_state_fs_model_space` + for gbuf_range_maps in self.gbuf_ranges: + for gbuf_range_map_for_all_buckets in gbuf_range_maps.values(): + for gbuf_range_map in gbuf_range_map_for_all_buckets: + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + group_index, group_order = self.model_param_group_index_map[model_param] + param_range = param_range_map['param'] + + main_param = self.optimizer.param_groups[group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = { + "fp32_param": main_param, + **optim_state, + } + # Match optimizer parameter with model ShardedTensor (or ShardedTensorFactory) + try: + sharded_metadata = param_to_sharded_metadata[model_param] + except KeyError as e: + raise ValueError( + f'Model param {model_param} not in model_sharded_state_dict' + ) from e + + # Set DP corresponding replica_id coordinate to 0 + assert ( + len(sharded_metadata.replica_id) == 3 + ), f'Expected replica_id format (PP, TP, DP), got: {sharded_metadata}' + replica_id = (*sharded_metadata.replica_id[:2], 0) + + # Instantiate ShardedTensor (or ShardedTensorFactory) for optimizer params + for state_key, state_ten in tensors.items(): + replace_kwargs = dict( + key=f'{prefix}.{state_key}.{sharded_metadata.key}', + data=state_ten, + dtype=state_ten.dtype, + flattened_range=slice(param_range.start, param_range.end), + replica_id=replica_id, + ) + if isinstance(sharded_metadata, ShardedTensorFactory): + replace_kwargs.pop('dtype') + tensors[state_key] = replace(sharded_metadata, **replace_kwargs) + tensors[state_key].validate_metadata_integrity() + state[param_idx] = tensors + param_idx += 1 + return state + + def load_parameter_state_from_fs_bucket_space(self, state_dict): + """Loads the parameter state from an internal representation. + + Inverse of the `get_parameter_state_fs_bucket_space` method. + """ + logger.warning( + '`fully_sharded_bucket_space` sharding for DistributedOptimizer' + 'checkpoint is deprecated. Please switch to `full_sharded_model_space`' + ) + + if state_dict is not None and "per_bucket_numel_unpadded" in state_dict: + per_bucket_numel_unpadded_in_checkpoint = state_dict["per_bucket_numel_unpadded"] + assert self.per_bucket_numel_unpadded == per_bucket_numel_unpadded_in_checkpoint, ( + f"Number of unpadded elements in each bucket need to be the same in current run " + f"({self.per_bucket_numel_unpadded}) and checkpoint " + f"({per_bucket_numel_unpadded_in_checkpoint})" + ) + + for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + assert len(gbuf_range_maps) == 1, "single dtype supported, for now." + for dtype, gbuf_range_map_for_all_buckets in gbuf_range_maps.items(): + for bucket_idx, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets): + bucket_state = state_dict[gbuf_idx][dtype][bucket_idx] + bucket_state = [ + bucket_state_elem + for bucket_state_elem in bucket_state + if not bucket_state_elem['padding'] + ] + + assert len(bucket_state) == len(gbuf_range_map["param_map"]), ( + len(bucket_state), + len(gbuf_range_map["param_map"]), + ) + for src_tensors, (model_param, param_range_map) in zip( + bucket_state, gbuf_range_map["param_map"].items() + ): + # Main param & optimizer states. + group_index, group_order = self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups[group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + dst_tensors = { + "param": main_param, + **optim_state, + } + for key in dst_tensors: + dst_tensors[key].copy_(src_tensors[key]) + + @torch.no_grad() + def load_parameter_state_from_fs_model_space(self, state_dict): + """Loads the parameter state from a "model space" representation. + + Inverse of the `sharded_param_state_fs_model_space` method. + """ + param_idx = 0 # matching order with `sharded_param_state_fs_model_space` + for gbuf_range_maps in self.gbuf_ranges: + for gbuf_range_map_for_all_buckets in gbuf_range_maps.values(): + for gbuf_range_map in gbuf_range_map_for_all_buckets: + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + group_index, group_order = self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups[group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + src_tensors = state_dict[param_idx] + dst_tensors = { + "fp32_param": main_param, + **optim_state, + } + for key in dst_tensors: + dst_tensors[key].copy_(src_tensors[key]) + + param_idx += 1 + + def load_parameter_state_from_dp_zero(self, state_dict): + """Load parameter state (i.e., parameter & optimizer tensors) from DP 0 rank, + using the new checkpoint format with coalesced state across buckets. + + This method performs the reverse of get_parameter_state_dp_zero(): + - Scatter contiguous buffers from DP rank 0 to each DP rank (each DP + rank receives its relevant subset of the world buffers). + - For each DP rank, copy param & optimizer shards from contiguous CPU + buffers. (e.g., one buffer each for main_param, exp_avg, and + exp_avg_sq). + """ + + # Data parallelism variables. + data_parallel_world_size = self.data_parallel_group_gloo.size() + data_parallel_rank = torch.distributed.get_rank(self.data_parallel_group_gloo) + data_parallel_group_gloo = self.data_parallel_group_gloo + data_parallel_global_ranks = torch.distributed.get_process_group_ranks( + self.data_parallel_group_gloo + ) + + # Scatter tensors to all DP ranks. + for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + for dtype, gbuf_range_map_for_all_buckets in gbuf_range_maps.items(): + if data_parallel_rank == 0: + buffer_numel_unpadded = self.buffers[gbuf_idx].numel_unpadded + checkpoint_numel_unpadded = state_dict[gbuf_idx][dtype]["numel_unpadded"] + assert buffer_numel_unpadded == checkpoint_numel_unpadded, ( + f"Number of unpadded elements must be same in current run " + f"({buffer_numel_unpadded}) and checkpoint ({checkpoint_numel_unpadded})" + ) + for key in ("param", "exp_avg", "exp_avg_sq"): + offset_in_world_tensors = 0 + for bucket_idx, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets): + # Compute local DP contiguous shard's size. + gbuf_world_numel = ( + self.buffers[gbuf_idx].buckets[bucket_idx].grad_data.numel() + ) + assert gbuf_world_numel % data_parallel_world_size == 0 + gbuf_local_numel = gbuf_world_numel // data_parallel_world_size + gbuf_world_numel_unpadded = ( + self.buffers[gbuf_idx].buckets[bucket_idx].numel_unpadded + ) + assert gbuf_world_numel_unpadded <= gbuf_world_numel + + # Contiguous local shards (received from DP rank 0). + recv_tensor = torch.zeros( + (gbuf_local_numel,), dtype=torch.float32, device="cpu" + ) + + # Scatter tensor list. + if data_parallel_rank == 0: + world_tensors = state_dict[gbuf_idx][dtype][key] + + start = offset_in_world_tensors + end = offset_in_world_tensors + gbuf_world_numel_unpadded + assert 0 <= start < end <= world_tensors.numel() + world_tensor = world_tensors[start:end] + offset_in_world_tensors += gbuf_world_numel_unpadded + + # Pad world_tensor to gbuf_world_numel. Don't pad at the front, pad at the back. + world_tensor = torch.nn.functional.pad( + world_tensor, (0, gbuf_world_numel - gbuf_world_numel_unpadded) + ) + assert world_tensor.numel() == gbuf_world_numel + gbuf_start_idxs = list(range(0, gbuf_world_numel, gbuf_local_numel)) + send_tensors = [ + world_tensor[i : (i + gbuf_local_numel)] for i in gbuf_start_idxs + ] + else: + send_tensors = None + + # Scatter. + torch.distributed.scatter( + recv_tensor, + send_tensors, + data_parallel_global_ranks[0], + data_parallel_group_gloo, + ) + + # Copy local contiguous shards to param/optim shards. + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. + group_index, group_order = self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups[group_index]["params"][ + group_order + ] + if key == "param": + tensor_to_copy_into = main_param + else: + optim_state = self.optimizer.state[main_param] + tensor_to_copy_into = optim_state[key] + + # Copy states into contiguous shard. + gbuf_local_start = param_range_map["gbuf_local"].start + gbuf_local_end = param_range_map["gbuf_local"].end + tensor_to_copy_into.data.copy_( + recv_tensor[gbuf_local_start:gbuf_local_end] + ) + + def load_parameter_state(self, filename: str): + """Load the distributed parameter state from disk. + + Args: + filename (str): path to load parameter state from. + """ + state_dict = None + if torch.distributed.get_rank(self.data_parallel_group) == 0: + state_dict = torch.load(filename) + + self.load_parameter_state_from_dp_zero(state_dict) + + def zero_grad(self, set_to_none: bool = True): + """ + Zeroes grads for the model related parameters, i.e., model_float16_groups + and model_fp32_groups. We additionally zero the remaining groups as a + memory optimization to reduce fragmentation; in the case of + set_to_none==True, the space used by this field can be safely deallocated. + + Args: + set_to_none (bool): if true, set grads to None. + """ + for groups in ( + self.model_float16_groups, + self.model_fp32_groups, + self.shard_float16_groups, # grad empty/unused here? + self.shard_fp32_groups, # throws grad-access warning + self.shard_fp32_from_float16_groups, + ): + for group in groups: + _zero_grad_group_helper(group, set_to_none) + + # If overlapping param all-gather with forward compute, launch all-gather + # for first accessed bucket here before forward compute is initiated. + # The all-gather for the next bucket will be launched in the forward + # pre-hook when this all-gather finishes (to ensure that the communication + # kernels don't head-of-line block the compute kernels since we run with + # CUDA_DEVICE_MAX_CONNECTIONS=1 to support sequence parallelism). + if self.overlap_param_gather: + self._dispatch_gather_model_params(all_gather_handle_index=0) + + def _get_model_param_buffer_dp_views(self): + """ + Get shard views of each of the param buffers. + + In this nested list, the top level is grouped by the virtual model + index and the buffer's data type. The sub-level is a list of + shards of that buffer, where each shard in the list represents + a contiguous view of the buffer, that is owned by a data-parallel + rank. The shard boundary does not respect parameter boundaries, and + so the elements of some parameters are split across data parallel + ranks. + + Additionally, return references to the entire buffers, for use + in _all_gather_base. + """ + + # Buffer views. + # Add in reverse order in each model chunk since buckets start from the end of the model but we want + # all-gathers to run first for the start of the model (same order as forward pass). + # We keep the view_items in model chunk order since we want to still first run all_gather and + # all_gather_handle.wait() for the first model chunk. + # In all cases, we want all_gather and all_gather_handle.wait() to be called in the same order, + # and all_gather_handle.wait() needs to be called just before the corresponding forward pass. + view_items = [] + for gbuf_index, buffer in enumerate(self.buffers): + view_items_per_model_chunk = [] + dtype = self.buffers[gbuf_index].param_dtype + for bucket_index, bucket in enumerate(buffer.buckets): + data_parallel_world_size = torch.distributed.get_world_size( + self.data_parallel_group + ) + buf_views = shard_buffer(bucket.param_data, data_parallel_world_size) + view_items_per_model_chunk.insert( + 0, (gbuf_index, dtype, bucket_index, bucket.param_data, buf_views) + ) + view_items.extend(view_items_per_model_chunk) + + return view_items + + def _dispatch_gather_model_params(self, all_gather_handle_index: int, force_sync: bool = False): + """ + All-gather updated model params. + + When using the distributed optimizer, the params are already laid out in a contiguous + buffer (see mcore/distributed/param_and_grad_buffer.py for details), and so the + all-gather will put the results in the right region of memory. + """ + async_op = self.overlap_param_gather and not force_sync + if self.update_successful: + data_parallel_group = self.data_parallel_group + data_parallel_rank = torch.distributed.get_rank(data_parallel_group) + + # All-gather updated main params. + # All param_buf views are guaranteed to have the same number of elements + # across all data-parallel ranks, due to padding done in + # param_and_grad_buffer.py). Thus, all sub-views will have consistent + # start / end indexes across data-parallel ranks. + (gbuf_index, dtype, bucket_index, pbuf, pbuf_views) = self.pbuf_view_items[ + all_gather_handle_index + ] + assert all_gather_handle_index < len(self.all_gather_handles) + all_gather_handle = torch.distributed._all_gather_base( + pbuf, + pbuf_views[data_parallel_rank], + group=data_parallel_group, + async_op=async_op, + ) + self.all_gather_handles[all_gather_handle_index] = all_gather_handle + assert self.all_gather_handle_index_to_bucket_index_map[all_gather_handle_index] == ( + gbuf_index, + dtype, + bucket_index, + ) + + def _make_forward_pre_hook(self): + """ + Create a forward pre-hook to wait on all-gather handles when necessary (i.e., + when a module uses a parameter in a bucket with a still incomplete all-gather) + and then copy the results from the param_buffer into model_params. + """ + + def hook(module, *unused): + assert ( + self.overlap_param_gather + ), "Should use pre-hook only when overlap_param_gather is True" + + # Make sure all parameters in this module have been all-gathered as necessary. + for param in module.parameters(recurse=False): + # Skip parameters that don't require grad. + if not param.requires_grad: + continue + + # Some params might be handled in another DistributedOptimizer instance; for + # example, we use separate DistributedOptimizer instances for expert and + # non-expert params. + if param in self.param_to_all_gather_handle_index_map: + all_gather_handle_index = self.param_to_all_gather_handle_index_map[param] + self._finish_param_sync_helper(all_gather_handle_index) + + return hook + + def finish_param_sync(self, model_index: int, *unused): + """ + Finishes all necessary param syncs for the model_index'th model chunk. + + Args: + model_index (int): index of model chunk to synchronize params. + """ + if model_index not in self.model_index_to_all_gather_handle_index_map: + return + + all_gather_handle_indices = self.model_index_to_all_gather_handle_index_map[model_index] + for all_gather_handle_index in all_gather_handle_indices: + self._finish_param_sync_helper(all_gather_handle_index) + + def _finish_param_sync_helper(self, all_gather_handle_index: int): + """ + Waits on all_gather_handle if necessary, then dispatches the next all-gather + as necessary. + """ + + # First check if there is an outstanding all-gather handle for this param. + # If so, wait on the handle to ensure the communication is finished. + assert all_gather_handle_index < len(self.all_gather_handles) + all_gather_handle = self.all_gather_handles[all_gather_handle_index] + if all_gather_handle is not None: + all_gather_handle.wait() + self.all_gather_handles[all_gather_handle_index] = None + + # Launch the all-gather for the next bucket now. + # We can't pre-launch all-gathers for all buckets at once since we don't + # want to head-of-line block the compute kernels with communication kernels + # (since we run with CUDA_DEVICE_MAX_CONNECTIONS=1 to support sequence + # parallelism). + next_all_gather_handle_index = all_gather_handle_index + 1 + if next_all_gather_handle_index < self.num_all_gather_handles: + self._dispatch_gather_model_params(next_all_gather_handle_index) + + def _collect_main_grad_data_for_unscaling(self): + """ + Note: this should be equivalent to the float-16 optimizer's method, + but written differently, so the two should be combined. + """ + return [ + param.grad.data for group in self.optimizer.param_groups for param in group["params"] + ] + + def _get_model_and_main_params_data_float16(self): + """ + Get aligned list of model and main params. + """ + model_data = [] + main_data = [] + for model_group, main_group in zip( + self.shard_float16_groups, self.shard_fp32_from_float16_groups + ): + for model_param, main_param in zip(model_group, main_group): + model_data.append(model_param.data) + main_data.append(main_param.data) + return model_data, main_data + + def _copy_model_grads_to_main_grads(self): + """ + Copy model grads to main grads. + + Since this step follows a reduce-scatter through the DDP's grad + buffer, this method is responsible for copying the updated grads + from the grad buffer to the main shard's grad field. + """ + + # Utility method for copying group grads. + def copy_group_grads(model_groups, shard_main_groups): + for model_group, shard_main_group in zip(model_groups, shard_main_groups): + for model_param, shard_main_param in zip(model_group, shard_main_group): + + param_range_map = self._get_model_param_range_map(model_param) + param_range = param_range_map["param"] + assert param_range.size == shard_main_param.nelement() + + model_grad = model_param.main_grad + shard_model_grad = model_grad.view(-1)[param_range.start : param_range.end] + shard_main_param.grad = shard_model_grad.float() + + # Copy model groups to shard groups. + copy_group_grads(self.model_float16_groups, self.shard_fp32_from_float16_groups) + copy_group_grads(self.model_fp32_groups, self.shard_fp32_groups) + + def _copy_main_params_to_model_params(self): + """ + Copy main params to model params. + + Since this step is followed by an all-gather through the DDP's grad + buffer, this method is responsible for copying the updated params + from the main shards into the correct position in the grad buffer. + """ + + # Utility method for copying group params. + def copy_group_params(shard_main_groups, model_groups): + for shard_main_group, model_group in zip(shard_main_groups, model_groups): + for shard_main_param, model_param in zip(shard_main_group, model_group): + + param_range_map = self._get_model_param_range_map(model_param) + world_range = param_range_map["gbuf_world_in_bucket"] + + assert world_range.size == shard_main_param.nelement() + + gbuf_index, _, bucket_id = self.model_param_gbuf_map[model_param] + model_param_buffer = self.buffers[gbuf_index].buckets[bucket_id].param_data + + shard_model_param = model_param_buffer.view(-1)[ + world_range.start : world_range.end + ] + + shard_model_param.data.copy_(shard_main_param) + + # Copy shard groups to model groups. + copy_group_params(self.shard_fp32_from_float16_groups, self.model_float16_groups) + copy_group_params(self.shard_fp32_groups, self.model_fp32_groups) + + def _copy_model_params_to_main_params(self): + """ + Copy model params to main params. + + During finetuning, this method is used to reload the main params from + the model params. This copy does not make use of the grad buffer as + an intermediary. + """ + + # Utility method for copying group params. + def copy_group_params(model_groups, shard_main_groups): + for model_group, shard_main_group in zip(model_groups, shard_main_groups): + for model_param, shard_main_param in zip(model_group, shard_main_group): + + param_range_map = self._get_model_param_range_map(model_param) + param_range = param_range_map["param"] + assert param_range.size == shard_main_param.nelement() + + shard_model_param = model_param.view(-1)[param_range.start : param_range.end] + shard_main_param.data.copy_(shard_model_param) + + # Copy model groups to shard groups. + copy_group_params(self.model_float16_groups, self.shard_fp32_from_float16_groups) + copy_group_params(self.model_fp32_groups, self.shard_fp32_groups) + + def _reset_metadata_and_sync_gather_all_model_params(self, force_sync: bool): + """ + Reset metadata needed to track results of all-gathers. + """ + self.all_gather_handles = [None for _ in range(len(self.all_gather_handles))] + + # Launch synchronous all-gather if --overlap-param-gather is turned on or if force_sync + # is explicitly set to True (e.g., if we are going to turn off all-gather overlapping for + # validation / test iterations). + if not self.overlap_param_gather or force_sync: + for all_gather_handle_index in range(self.num_all_gather_handles): + self._dispatch_gather_model_params(all_gather_handle_index, force_sync=force_sync) + + @torch.no_grad() + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful. + Under the hood, either launch synchronous param all-gathers or get ready to launch + asynchorous all-gathers that get overlapped with the next forward pass. + """ + self.update_successful = super().step_with_ready_grads() + + timers = self.config.timers + if timers is not None: + timers('params-all-gather', log_level=1).start(barrier=self.config.barrier_with_L1_time) + # If not overlapping all-gather for parameters, launch synchronous all-gather + # communication calls here. If overlapping all-gather for parameters, the following + # call to _gather_all_model_params is a no-op: the first all-gather is launched + # asynchronously in the next optimizer.zero_grad() call and subsequent all-gathers + # are launched in the forward pre-hook. + self._reset_metadata_and_sync_gather_all_model_params(force_sync=False) + if timers is not None: + timers('params-all-gather').stop() + + return self.update_successful diff --git a/megatron/core/optimizer/grad_scaler.py b/megatron/core/optimizer/grad_scaler.py new file mode 100644 index 0000000..abdd1e7 --- /dev/null +++ b/megatron/core/optimizer/grad_scaler.py @@ -0,0 +1,142 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Megatron grad scaler.""" + +from abc import ABC, abstractmethod +from typing import Dict + +import torch + + +class MegatronGradScaler(ABC): + def __init__(self, initial_scale: float): + """Initialize scale value with the input initial scale.""" + assert initial_scale > 0.0 + self._scale = torch.tensor([initial_scale], dtype=torch.float, device='cuda') + + @property + def scale(self): + return self._scale + + @property + def inv_scale(self): + return self._scale.double().reciprocal().float() + + @abstractmethod + def update(self, found_inf: bool): + pass + + @abstractmethod + def state_dict(self): + pass + + @abstractmethod + def load_state_dict(self, state_dict: Dict): + pass + + +class ConstantGradScaler(MegatronGradScaler): + """ + Constant grad scaler (loss scale is never adjusted regardless of NaNs seen in gradients). + """ + + def update(self, found_inf: bool): + pass + + def state_dict(self): + return dict() + + def load_state_dict(self, state_dict): + pass + + +class DynamicGradScaler(MegatronGradScaler): + """ + Grad scaler with dynamic scale that gets adjusted during training. + + Reduces loss scale by `backoff_factor` if `hysteresis` number of NaNs are seen in a row. Increases + loss scale by `growth_factor` if NaNs are not seen for `growth_interval` iterations. + """ + + def __init__( + self, + initial_scale: float, + min_scale: float, + growth_factor: float, + backoff_factor: float, + growth_interval: int, + hysteresis: int, + ): + """ + Grad scaler with dynamic scale that gets adjusted during training. + + Args: + initial_scale (float): Initial loss scale value. + min_scale (float): Minimum loss scale value. + growth_factor (float): Factor to grow loss scale by if NaNs are not seen in `growth_interval` + training iterations. Must be greater than 1. + backoff_factor (float): Factor to decrease loss scale by if NaNs are seen in `hysteresis` + consecutive training iterations. Must be between 0 and 1. + growth_interval (int): Number of training iterations of no NaNs before loss scale is increased. + hysteresis (int): Number of training iterations of consecutive NaNs before loss scale is decreased. + """ + super(DynamicGradScaler, self).__init__(initial_scale) + + # Lower bound on the scale. + assert min_scale > 0.0 + assert min_scale <= initial_scale + self.min_scale = torch.tensor([min_scale], dtype=torch.float, device='cuda') + # Growth and backoff factors for the scale. + assert growth_factor > 1.0 + self.growth_factor = torch.tensor([growth_factor], dtype=torch.float, device='cuda') + assert backoff_factor < 1.0 + assert backoff_factor > 0.0 + self.backoff_factor = torch.tensor([backoff_factor], dtype=torch.float, device='cuda') + # Interval over which if we don't see any inf/nan, + # we will scale the grad scale by the growth factor. + assert growth_interval > 0 + self.growth_interval = growth_interval + # Number of inf/nans we should see before scaling down + # the grad scale by the backoff factor. + assert hysteresis > 0 + self.hysteresis = hysteresis + + # Trackers. + self._growth_tracker = 0 + self._hysteresis_tracker = self.hysteresis + + def update(self, found_inf: bool): + """ + Updates internal state in grad scaler based on whether NaNs are seen in grads or not. + """ + + # If we have an inf/nan, growth tracker is set to 0 + # and hysterisis tracker is reduced by 1. + if found_inf: + self._growth_tracker = 0 + self._hysteresis_tracker -= 1 + # Now if we are out of hysteresis count, scale down the loss. + if self._hysteresis_tracker <= 0: + self._scale = torch.max(self._scale * self.backoff_factor, self.min_scale) + else: + # If there is no nan/inf, increment the growth tracker. + self._growth_tracker += 1 + # If we have had enough consequitive intervals with no nan/inf: + if self._growth_tracker == self.growth_interval: + # Reset the tracker and hysteresis trackers, + self._growth_tracker = 0 + self._hysteresis_tracker = self.hysteresis + # and scale up the loss scale. + self._scale = self._scale * self.growth_factor + + def state_dict(self): + state_dict = {} + state_dict['scale'] = self._scale + state_dict['growth_tracker'] = self._growth_tracker + state_dict['hysteresis_tracker'] = self._hysteresis_tracker + return state_dict + + def load_state_dict(self, state_dict: Dict): + self._scale = state_dict['scale'].cuda(torch.cuda.current_device()) + self._growth_tracker = state_dict['growth_tracker'] + self._hysteresis_tracker = state_dict['hysteresis_tracker'] diff --git a/megatron/core/optimizer/optimizer.py b/megatron/core/optimizer/optimizer.py new file mode 100644 index 0000000..43c9a65 --- /dev/null +++ b/megatron/core/optimizer/optimizer.py @@ -0,0 +1,1064 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Megatron optimizer.""" + +import copy +import math +from abc import ABC, abstractmethod +from itertools import chain +from logging import getLogger +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch + +try: + from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_scale +except ImportError: + try: + from apex.multi_tensor_apply import multi_tensor_applier + except ImportError: + from megatron.core.utils import local_multi_tensor_applier + + multi_tensor_applier = local_multi_tensor_applier + try: + import amp_C + + l2_norm_impl = amp_C.multi_tensor_l2norm + multi_tensor_scale_impl = amp_C.multi_tensor_scale + except ImportError: + from megatron.core.utils import local_multi_tensor_l2_norm, local_multi_tensor_scale + + l2_norm_impl = local_multi_tensor_l2_norm + multi_tensor_scale_impl = local_multi_tensor_scale + +from .. import parallel_state, tensor_parallel +from ..dist_checkpointing.mapping import ShardedStateDict +from ..dist_checkpointing.optimizer import ( + get_param_id_to_sharded_param_map, + make_sharded_optimizer_tensor, + optim_state_to_sharding_state, +) +from ..dist_checkpointing.utils import add_prefix_for_sharding +from ..transformer.module import param_is_not_shared +from .clip_grads import clip_grad_by_total_norm_fp32, count_zeros_fp32, get_grad_norm_fp32 +from .grad_scaler import MegatronGradScaler +from .optimizer_config import OptimizerConfig + +logger = getLogger(__name__) + + +def _zero_grad_group_helper(group: List[torch.nn.Parameter], set_to_none: bool): + """ + Zero out the gradient for a group of parameters. + Note: copied from torch.optim.optimizer. + """ + for param in group: + if param.grad is not None: + if set_to_none: + param.grad = None + else: + if param.grad.grad_fn is not None: + param.grad.detach_() + else: + param.grad.requires_grad_(False) + param.grad.zero_() + + +def _multi_tensor_copy_this_to_that( + this: List[torch.Tensor], that: List[torch.Tensor], overflow_buf: Optional[torch.Tensor] = None +): + """ + Use multi-tensor-applier to copy values from one list to another. + We don't have a bfloat16 implementation so for now if the overflow_buf + is not provided, we default back to simple loop copy to be compatible + with bfloat16. + """ + if overflow_buf: + overflow_buf.fill_(0) + # Scaling with factor `1.0` is equivalent to copy. + multi_tensor_applier(multi_tensor_scale_impl, overflow_buf, [this, that], 1.0) + else: + for this_, that_ in zip(this, that): + that_.copy_(this_) + + +class MegatronOptimizer(ABC): + """ + Base class for all Megatron optimizers. + + Args: + optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD. + config (OptimizerConfig): configuration object for optimizer. + init_state_fn (Callable, optional): function to initialize state in the optimizer. + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + config: OptimizerConfig, + init_state_fn: Callable = lambda x: None, + ): + """Input optimizer is the base optimizer (e.g., Adam).""" + self.optimizer = optimizer + assert self.optimizer, 'no optimizer is provided.' + self.config = config + self.init_state_fn = init_state_fn + + def get_parameters(self) -> List[torch.nn.Parameter]: + """ + Get list of parameters wrapped in optimizer. + """ + params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + params.append(param) + return params + + def get_main_grads_for_grad_norm(self) -> List[torch.Tensor]: + """ + Get main_grads that should be taken into account to compute the grad norm. + Filter parameters based on: + - grad should not be None. + - parameter should not be shared (i.e., grads shouldn't be double counted while + computing norms). + - should not be a replica due to tensor model parallelism. + """ + params = self.get_parameters() + grads_for_norm = [] + for param in params: + grad = param.grad + grad_not_none = grad is not None + is_not_shared = param_is_not_shared(param) + is_not_tp_duplicate = tensor_parallel.param_is_not_tensor_parallel_duplicate(param) + if grad_not_none and is_not_shared and is_not_tp_duplicate: + grads_for_norm.append(grad) + + return grads_for_norm + + def get_model_parallel_group(self) -> torch.distributed.ProcessGroup: + """Default returned here, but the distributed optimizer overrides this.""" + if hasattr(self, 'model_parallel_group'): + return self.model_parallel_group + return parallel_state.get_model_parallel_group() + + @abstractmethod + def prepare_grads(self) -> bool: + """Pre-processing gradients before the optimizer step, returns whether inf/nan is found.""" + return False + + @abstractmethod + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful.""" + return True + + @torch.no_grad() + def get_grad_norm(self): + grads_for_norm = self.get_main_grads_for_grad_norm() + total_norm = get_grad_norm_fp32( + grads_for_norm, + model_parallel_group=self.get_model_parallel_group(), + ) + return total_norm + + def clip_grad_norm(self, clip_grad: float) -> float: + """Compute grad norm.""" + params = self.get_parameters() + grads_for_norm = self.get_main_grads_for_grad_norm() + grad_norm = get_grad_norm_fp32( + grads_for_norm, model_parallel_group=self.get_model_parallel_group() + ) + clip_grad_by_total_norm_fp32(params, clip_grad, grad_norm) + return grad_norm + + def count_zeros(self) -> float: + """Count number of zeros in model's gradients.""" + params = self.get_parameters() + return count_zeros_fp32(params, model_parallel_group=self.get_model_parallel_group()) + + @abstractmethod + def zero_grad(self, set_to_none: bool = True): + pass + + @abstractmethod + def get_loss_scale(self) -> torch.Tensor: + """ + Get current loss scale factor. + NOTE: The output should be a CUDA tensor of size 1. + """ + pass + + def scale_loss(self, loss: torch.Tensor) -> torch.Tensor: + """Simple scaling.""" + return self.get_loss_scale() * loss + + def finish_param_sync(self, model_index: int): + """ + Finish parameter synchronization for all optimizers. + This is a no-op for all non-distributed optimizers. + """ + pass + + @abstractmethod + def reload_model_params(self): + """Refreshes any internal state from the current model parameters. + Call whenever the parameters are changed outside of the optimizer. + For example, when we load a model from a checkpoint without loading + the optimizer, the model parameters are updated but for fp16 optimizer + with main parameters, the main parameters need to also be updated.""" + pass + + @abstractmethod + def state_dict(self): + pass + + @abstractmethod + def load_state_dict(self, state_dict): + pass + + # Promote state so it can be retrieved or set via + # "optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + # Promote param_groups so it can be retrieved or set via + # "optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) + + @abstractmethod + def step(self): + """Step the optimizer.""" + pass + + @abstractmethod + def sharded_state_dict( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ) -> ShardedStateDict: + """Builds sharded state dict for the optimizer, based on model's sharded state dict. + + Args: + model_sharded_state_dict (ShardedStateDict): sharded state dict of the model + is_loading (bool, optional): flag indicating whether the state dict will be used to save or load the optimizer state. + Defaults to False. + + Returns: optimizer sharded state dict + """ + + @staticmethod + def _extract_common_per_param_step(state_dict) -> Union[int, torch.Tensor]: + common_step = None + for param_idx, param_state in state_dict['state'].items(): + param_step = param_state.get('step', None) + if param_step is not None: + if common_step is None: + common_step = param_step + elif common_step != param_step: + raise ValueError( + "The optimizer step differs per parameter. Mcore only supports " + "optimizers whose step is shared across all parameters." + ) + return common_step + + @staticmethod + def _restore_common_per_param_step(state_dict: Dict, step: Union[int, torch.Tensor]): + for param_idx, param_state in state_dict['state'].items(): + param_state['step'] = copy.deepcopy(step) + + +class MixedPrecisionOptimizer(MegatronOptimizer): + """Base class for both the float-16 and the distributed optimizer. + + Args: + optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD. + config (OptimizerConfig): configuration object for optimizer. + grad_scaler (MegatronGradScaler): used for scaling gradients. Note that + this can be None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constant gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + init_state_fn (Callable, optional): function to initialize state in the optimizer. + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + config: OptimizerConfig, + grad_scaler: Optional[MegatronGradScaler], + init_state_fn: Callable, + ): + + super().__init__( + optimizer, + config, + init_state_fn, + ) + self.grad_scaler = grad_scaler + + # None grad scaler is only supported for bf16. + if self.grad_scaler is None: + assert not self.config.fp16, 'fp16 expects a grad scaler.' + + # Tensor used to determine if a nan/if has happend. + # Any non-zero value indicates inf/nan. + # Note that we keep this for the cases that grad scaler is none. + # We still record nan/inf if we have a bfloat16 with a grad scaler. + if self.grad_scaler: + self.found_inf = torch.tensor([0.0], dtype=torch.float, device='cuda') + + # Dummy tensor needed for apex multi-apply tensor. + # For bfloat, we don't have multi-tensor apply and for now + # we set it to none so the multi-tensor apply gets ignored. + if self.config.bf16: + self._dummy_overflow_buf = None + else: + self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') + + # In case grad scaler is not passed, define the unity scale. + if self.grad_scaler is None: + self._scale_one = torch.tensor([1.0], dtype=torch.float, device='cuda') + + def get_loss_scale(self): + if self.grad_scaler is None: + return self._scale_one + return self.grad_scaler.scale + + def reload_model_params(self): + self._copy_model_params_to_main_params() + + def _unscale_main_grads_and_check_for_nan(self): + + # Collect main grads. + main_grads = self._collect_main_grad_data_for_unscaling() + + # Reset found inf. + self.found_inf.fill_(0.0) + + # Unscale and set found inf/nan + torch._amp_foreach_non_finite_check_and_unscale_( + main_grads, self.found_inf, self.grad_scaler.inv_scale + ) + + # Update across all model parallel instances. + torch.distributed.all_reduce( + self.found_inf, op=torch.distributed.ReduceOp.MAX, group=self.get_model_parallel_group() + ) + + # Check for nan. + found_inf_flag = self.found_inf.item() > 0 + + return found_inf_flag + + @torch.no_grad() + def prepare_grads(self) -> bool: + """Pre-processing gradients before the optimizer step, returns whether inf/nan is found.""" + timers = self.config.timers + + # Copy gradients from model params to main params. + if timers is not None: + timers('optimizer-copy-to-main-grad', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + self._copy_model_grads_to_main_grads() + if timers is not None: + timers('optimizer-copy-to-main-grad').stop() + + # Do unscale, check for inf, and update grad scaler only for + # the case that grad scaler is provided. + if self.grad_scaler: + + # Unscale and check for inf/nan. + if timers is not None: + timers('optimizer-unscale-and-check-inf', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + found_inf_flag = self._unscale_main_grads_and_check_for_nan() + if timers is not None: + timers('optimizer-unscale-and-check-inf').stop() + + # We are done with scaling gradients + # so we can update the loss scale. + self.grad_scaler.update(found_inf_flag) + + return found_inf_flag + + return False + + @torch.no_grad() + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful.""" + timers = self.config.timers + # Step the optimizer. + if timers is not None: + timers('optimizer-inner-step', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + self.optimizer.step() + if timers is not None: + timers('optimizer-inner-step').stop() + + # Update params from main params. + if timers is not None: + timers('optimizer-copy-main-to-model-params', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + self._copy_main_params_to_model_params() + if timers is not None: + timers('optimizer-copy-main-to-model-params').stop() + + return True + + @torch.no_grad() + def step(self): + timers = self.config.timers + + found_inf_flag = self.prepare_grads() + if found_inf_flag: + return False, None, None + + # Clip the main gradients. + if timers is not None: + timers('optimizer-clip-main-grad', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + grad_norm = None + if self.config.clip_grad > 0.0: + grad_norm = self.clip_grad_norm(self.config.clip_grad) + if timers is not None: + timers('optimizer-clip-main-grad').stop() + + # Count the zeros in the grads. + if timers is not None: + timers('optimizer-count-zeros', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + num_zeros_in_grad = self.count_zeros() if self.config.log_num_zeros_in_grad else None + if timers is not None: + timers('optimizer-count-zeros').stop() + + success = self.step_with_ready_grads() + + # Successful update. + return success, grad_norm, num_zeros_in_grad + + +class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer): + """Float16 optimizer for fp16 and bf16 data types. + + Args: + optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD. + config (OptimizerConfig): configuration object for optimizer. + grad_scaler (MegatronGradScaler): used for scaling gradients. Note that + this can be None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constant gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + init_state_fn (Callable, optional): function to initialize state in the optimizer. + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + config: OptimizerConfig, + grad_scaler: MegatronGradScaler, + init_state_fn: Callable, + ): + + super().__init__( + optimizer, + config, + grad_scaler, + init_state_fn, + ) + + # Handle main parameters. + + # Three groups of parameters: + # float16_groups: original float16 parameters + # fp32_from_float16_groups: fp32 copy of float16 parameters + # fp32_from_fp32_groups: original fp32 parameters + self.float16_groups = [] + self.fp32_from_float16_groups = [] + self.fp32_from_fp32_groups = [] + + # For all the groups in the original optimizer: + for param_group in self.optimizer.param_groups: + float16_params_this_group = [] + fp32_params_this_group = [] + fp32_from_float16_params_this_group = [] + # For all the parameters in this group: + for i, param in enumerate(param_group['params']): + if param.requires_grad: + + # float16 params: + if param.type() in ['torch.cuda.HalfTensor', 'torch.cuda.BFloat16Tensor']: + float16_params_this_group.append(param) + # Create a copy + main_param = param.detach().clone().float() + # Copy tensor model parallel attributes. + tensor_parallel.copy_tensor_model_parallel_attributes(main_param, param) + if hasattr(param, 'shared'): + main_param.shared = param.shared + # Replace the optimizer params with the new fp32 copy. + param_group['params'][i] = main_param + + fp32_from_float16_params_this_group.append(main_param) + # Reset existing state dict key to the new main param. + if param in self.optimizer.state: + self.optimizer.state[main_param] = self.optimizer.state.pop(param) + # fp32 params. + elif param.type() == 'torch.cuda.FloatTensor': + fp32_params_this_group.append(param) + param_group['params'][i] = param + + else: + raise TypeError( + 'Wrapped parameters must be one of ' + 'torch.cuda.FloatTensor, ' + 'torch.cuda.HalfTensor, or ' + 'torch.cuda.BFloat16Tensor. ' + 'Received {}'.format(param.type()) + ) + + self.float16_groups.append(float16_params_this_group) + self.fp32_from_float16_groups.append(fp32_from_float16_params_this_group) + self.fp32_from_fp32_groups.append(fp32_params_this_group) + + def zero_grad(self, set_to_none=True): + """We only need to zero the model related parameters, i.e., + float16_groups & fp32_from_fp32_groups. We additionally zero + fp32_from_float16_groups as a memory optimization to reduce + fragmentation; in the case of set_to_none==True, the space + used by this field can be safely deallocated at this point.""" + for group in self.float16_groups: + _zero_grad_group_helper(group, set_to_none) + for group in self.fp32_from_float16_groups: + _zero_grad_group_helper(group, set_to_none) + for group in self.fp32_from_fp32_groups: + _zero_grad_group_helper(group, set_to_none) + + def _collect_main_grad_data_for_unscaling(self): + + main_grads = [] + + # fp32 params from float16 ones. + for main_group in self.fp32_from_float16_groups: + for main_param in main_group: + if main_param.grad is not None: + main_grads.append(main_param.grad.data) + + # Append fp32 parameters. + for main_group in self.fp32_from_fp32_groups: + for main_param in main_group: + if main_param.grad is not None: + main_grads.append(main_param.grad.data) + + return main_grads + + def _get_model_and_main_params_data_float16(self): + model_data = [] + main_data = [] + for model_group, main_group in zip(self.float16_groups, self.fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + model_data.append(model_param.data) + main_data.append(main_param.data) + return model_data, main_data + + def _copy_model_grads_to_main_grads(self): + # This only needs to be done for the float16 group. + for model_group, main_group in zip(self.float16_groups, self.fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + if hasattr(model_param, 'main_grad'): + main_param.grad = model_param.main_grad.float() + else: + if model_param.grad is not None: + main_param.grad = model_param.grad.float() + + # Safe to deallocate model's grad/main_grad after copying. + # (If using contiguous buffers, main_grad's memory should + # persist and therefore should not be deallocated.) + model_param.grad = None + + # For fp32 grads, we need to reset the grads to main grad. + for model_group in self.fp32_from_fp32_groups: + for model_param in model_group: + model_param.grad = model_param.main_grad + + def _copy_main_params_to_model_params(self): + # Only needed for the float16 params. + model_data, main_data = self._get_model_and_main_params_data_float16() + _multi_tensor_copy_this_to_that( + this=main_data, that=model_data, overflow_buf=self._dummy_overflow_buf + ) + + def _copy_model_params_to_main_params(self): + # Only needed for the float16 params. + model_data, main_data = self._get_model_and_main_params_data_float16() + _multi_tensor_copy_this_to_that( + this=model_data, that=main_data, overflow_buf=self._dummy_overflow_buf + ) + + def state_dict(self): + state_dict = {} + state_dict['optimizer'] = self.optimizer.state_dict() + if self.grad_scaler: + state_dict['grad_scaler'] = self.grad_scaler.state_dict() + state_dict['fp32_from_fp16_params'] = self.fp32_from_float16_groups + return state_dict + + def sharded_state_dict( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ): + + if is_loading: + self.init_state_fn(self.optimizer) + + state_dict = self.state_dict() + + id_to_sharded_param_map = get_param_id_to_sharded_param_map( + model_sharded_state_dict, chain.from_iterable(g for g in self.float16_groups) + ) + + # Convert fp32_from_fp16_params + assert len(state_dict['fp32_from_fp16_params']) == len( + state_dict['optimizer']['param_groups'] + ) + state_dict['fp32_from_fp16_params'] = [ + [ + make_sharded_optimizer_tensor( + id_to_sharded_param_map[param_id], + fp32_param, + prefix=f'optimizer.state.fp32_param', + ) + for param_id, fp32_param in zip(state_group['params'], fp32_group) + ] + for fp32_group, state_group in zip( + state_dict['fp32_from_fp16_params'], state_dict['optimizer']['param_groups'] + ) + ] + + step = self._extract_common_per_param_step(state_dict['optimizer']) + + # Convert regular optimizer state + # all optimizer parameters passed to optim_state_to_sharding_state are + # expected to have the same shape as the model parameters, + # so we save the step separately and ignore it here + optim_state_to_sharding_state( + state_dict['optimizer'], id_to_sharded_param_map, exclude_keys="step" + ) + # save step as a shared step among all parameters. Separate per-parameter + # steps are not supported + state_dict['optimizer']['state']['common_step'] = step + return state_dict + + def load_state_dict(self, state_dict): + pipeline_parallel_size = parallel_state.get_pipeline_model_parallel_world_size() + # Optimizer. + optimizer_key = 'optimizer' + if optimizer_key not in state_dict: + optimizer_key = 'optimizer_state_dict' + logger.info('***WARNING*** loading optimizer from ' 'an old checkpoint ...') + if 'common_step' in state_dict[optimizer_key]['state']: + common_step = state_dict[optimizer_key]['state'].pop('common_step') + self._restore_common_per_param_step(state_dict[optimizer_key], common_step) + self.optimizer.load_state_dict(state_dict[optimizer_key]) + + # Grad scaler. + if 'grad_scaler' not in state_dict: + if self.config.fp16: + logger.info( + '***WARNING*** found an old checkpoint, will not ' 'load grad scaler ...' + ) + else: + if self.grad_scaler: + self.grad_scaler.load_state_dict(state_dict['grad_scaler']) + else: + logger.info( + '***WARNING*** fould the grad scaler in the ' + 'checkpoint but it is None in the class. ' + 'Skipping loading grad scaler ...' + ) + + # Copy data for the main params. + fp32_from_float16_params_key = 'fp32_from_fp16_params' + if fp32_from_float16_params_key not in state_dict: + fp32_from_float16_params_key = 'fp32_from_fp16' + for current_group, saved_group in zip( + self.fp32_from_float16_groups, state_dict[fp32_from_float16_params_key] + ): + for current_param, saved_param in zip(current_group, saved_group): + current_param.data.copy_(saved_param.data) + + +class FP32Optimizer(MegatronOptimizer): + """Float32 optimizer. + + Args: + optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD. + config (OptimizerConfig): configuration object for optimizer. + init_state_fn (Callable, optional): function to initialize state in the optimizer. + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + config: OptimizerConfig, + init_state_fn: Callable, + ): + + super(FP32Optimizer, self).__init__( + optimizer, + config, + init_state_fn, + ) + + self._scale = torch.tensor([1.0], dtype=torch.float, device='cuda') + + def zero_grad(self, set_to_none=True): + """Copied from torch.optim.optimizer""" + for group in self.optimizer.param_groups: + _zero_grad_group_helper(group['params'], set_to_none) + + def get_loss_scale(self): + """FP32 optimizer does not do any scaling.""" + return self._scale + + @torch.no_grad() + def prepare_grads(self) -> bool: + """Pre-processing gradients before the optimizer step, returns whether inf/nan is found.""" + timers = self.config.timers + + # Copy main_grads to grads. + if timers is not None: + timers('optimizer-copy-to-main-grad', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + param.grad = param.main_grad + if timers is not None: + timers('optimizer-copy-to-main-grad').stop() + + return False + + @torch.no_grad() + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful.""" + timers = self.config.timers + + # Update parameters. + if timers is not None: + timers('optimizer-inner-step', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + self.optimizer.step() + if timers is not None: + timers('optimizer-inner-step').stop() + + return True + + @torch.no_grad() + def step(self): + """Clip gradients (if needed) and step the base optimizer. + Always return successful since there is no overflow.""" + timers = self.config.timers + + found_inf_flag = self.prepare_grads() + if found_inf_flag: + return False, None, None + + # Clip gradients. + if timers is not None: + timers('optimizer-clip-main-grad', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + grad_norm = None + if self.config.clip_grad > 0.0: + grad_norm = self.clip_grad_norm(self.config.clip_grad) + if timers is not None: + timers('optimizer-clip-main-grad').stop() + + # Count the zeros in the grads. + if timers is not None: + timers('optimizer-count-zeros', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + num_zeros_in_grad = self.count_zeros() if self.config.log_num_zeros_in_grad else None + if timers is not None: + timers('optimizer-count-zeros').stop() + + success = self.step_with_ready_grads() + + # No overflow for FP32 optimizer. + return success, grad_norm, num_zeros_in_grad + + def reload_model_params(self): + pass + + def state_dict(self): + return self.optimizer.state_dict() + + def load_state_dict(self, state_dict): + pipeline_parallel_size = parallel_state.get_pipeline_model_parallel_world_size() + if 'common_step' in state_dict['state']: + common_step = state_dict['state'].pop('common_step') + self._restore_common_per_param_step(state_dict, common_step) + self.optimizer.load_state_dict(state_dict) + + def sharded_state_dict( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ): + if is_loading: + self.init_state_fn(self.optimizer) + + state_dict = self.state_dict() + id_to_sharded_param_map = get_param_id_to_sharded_param_map( + model_sharded_state_dict, self.get_parameters() + ) + step = self._extract_common_per_param_step(state_dict) + + # all optimizer parameters passed to optim_state_to_sharding_state are + # expected to have the same shape as the model parameters, + # so we save the step separately and ignore it here + optim_state_to_sharding_state(state_dict, id_to_sharded_param_map, exclude_keys="step") + # save step as a shared step among all parameters. Separate per-parameter + # steps are not supported + state_dict['state']['common_step'] = step + return state_dict + + +class ProxyDict: + """ + A dictionary-like object that proxies to a list of dictionaries. + + e.g., ProxyDict([{'a': 1}, {'b': 2}]) behaves like: + { + (0, 'a'): 1, + (1, 'b'): 2, + } + We use tuples as keys to avoid ambiguity with the keys of the inner dicts. + """ + + def __init__(self, inner_dicts: List[dict]): + self._inner_dicts = inner_dicts + + def __getitem__(self, key: Tuple[int, str]): + idx, inner_key = key + return self._inner_dicts[idx].get(inner_key) + + def __setitem__(self, key: Tuple[int, str], value: Any): + idx, inner_key = key + self._inner_dicts[idx][inner_key] = value + + def __len__(self) -> int: + return sum([len(inner_dict) for inner_dict in self._inner_dicts]) + + def __iter__(self): + for idx, inner_dict in enumerate(self._inner_dicts): + for inner_key in inner_dict: + yield (idx, inner_key) + + def items(self): + for idx, inner_dict in enumerate(self._inner_dicts): + for inner_key, value in inner_dict.items(): + yield (idx, inner_key), value + + +class ChainedOptimizer(MegatronOptimizer): + """ChainedOptimizer is designed for a collection of optimizers. + + These optimizers are responsible for different parts of multiple models for + a training task and will be executed one-by-one when the model is updated. + + Args: + chained_optimizers: a list of optimizers. + """ + + def __init__(self, chained_optimizers: List[MegatronOptimizer]): + self.chained_optimizers = chained_optimizers + + @property + def param_groups(self) -> List[dict]: + param_groups = [] + for optimizer in self.chained_optimizers: + param_groups += optimizer.param_groups + return param_groups + + @property + def state(self) -> ProxyDict: + """ + Return optimizer state with tuple keys, where the first element is the + index of the optimizer in the list of chained optimizers. + """ + return ProxyDict([opt.state for opt in self.chained_optimizers]) + + def zero_grad(self, set_to_none=True): + for optimizer in self.chained_optimizers: + optimizer.zero_grad(set_to_none) + + def get_loss_scale(self): + return self.chained_optimizers[0].get_loss_scale() + + def reload_model_params(self): + for optimizer in self.chained_optimizers: + optimizer.reload_model_params() + + def state_dict(self): + return [optimizer.state_dict() for optimizer in self.chained_optimizers] + + def sharded_state_dict( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False, **kwargs + ): + sharded_state_dict = {} + for optimizer_idx, optimizer in enumerate(self.chained_optimizers): + optim_state_dict = optimizer.sharded_state_dict( + model_sharded_state_dict, is_loading, **kwargs + ) + add_prefix_for_sharding(optim_state_dict, f'chained_{optimizer_idx}.') + sharded_state_dict[optimizer_idx] = optim_state_dict + return sharded_state_dict + + def load_state_dict(self, state_dict): + if len(self.chained_optimizers) != len(state_dict): + raise RuntimeError( + f'Expected {len(self.chained_optimizers)} entries' + f' in state dict, but got {len(state_dict)}.' + ) + if isinstance(state_dict, dict): + state_dict = (v for k, v in sorted(state_dict.items())) + for optimizer, state in zip(self.chained_optimizers, state_dict): + optimizer.load_state_dict(state) + + @torch.no_grad() + def prepare_grads(self) -> bool: + """Pre-processing gradients before the optimizer step, returns whether inf/nan is found.""" + found_inf_flag = False + for optimizer in self.chained_optimizers: + found_inf_flag |= optimizer.prepare_grads() + + return found_inf_flag + + @torch.no_grad() + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful.""" + success = True + for optimizer in self.chained_optimizers: + success &= optimizer.step_with_ready_grads() + + return success + + def disable_pre_hook(self): + for optimizer in self.chained_optimizers: + if ( + not optimizer.config.use_distributed_optimizer + or not optimizer.config.overlap_param_gather + ): + raise ValueError( + "disable_pre_hook should only be called with 'use_distributed_optimizer' " + "and 'overlap_param_gather' both enabled." + ) + optimizer.disable_pre_hook() + + def enable_pre_hook(self): + for optimizer in self.chained_optimizers: + if ( + not optimizer.config.use_distributed_optimizer + or not optimizer.config.overlap_param_gather + ): + raise ValueError( + "enable_pre_hook should only be called with 'use_distributed_optimizer' " + "and 'overlap_param_gather' both enabled." + ) + optimizer.enable_pre_hook() + + @torch.no_grad() + def step(self): + """ChainedOptimizer will step all optimizers one by one.""" + found_inf_flag = self.prepare_grads() + if found_inf_flag: + return False, None, None + + # Get grad norm. + grad_norms = [] + for optimizer in self.chained_optimizers: + _grad_norm = optimizer.get_grad_norm() + grad_norms += [_grad_norm if _grad_norm else 0.0] + grad_norm = math.sqrt(sum([x**2 for x in grad_norms])) + + # Clip gradients. + for optimizer in self.chained_optimizers: + if optimizer.config.clip_grad > 0.0: + clip_grad_by_total_norm_fp32( + optimizer.get_parameters(), + max_norm=optimizer.config.clip_grad, + total_norm=grad_norm, + ) + + # Count the zeros in the grads. + num_zeros_in_grad = 0 + for optimizer in self.chained_optimizers: + num_zeros_in_grad += ( + optimizer.count_zeros() if optimizer.config.log_num_zeros_in_grad else 0 + ) + + update_successful = self.step_with_ready_grads() + + return update_successful, grad_norm, num_zeros_in_grad + + def save_parameter_state(self, filename: str): + """Save the distributed parameter states of all optimizers to a file. + + Args: + filename (str): path to save parameter state to. + """ + save_states = False + states = [] + for optimizer in self.chained_optimizers: + if hasattr(optimizer, 'get_parameter_state_dp_zero'): + state_dict = optimizer.get_parameter_state_dp_zero() + + # Save checkpoint economically, only when DP rank = 0, state dict + # needs to be saved. + if torch.distributed.get_rank(optimizer.data_parallel_group) == 0: + states.append(state_dict) + save_states = True + else: + states.append(None) + else: + states.append(None) + + if save_states: + torch.save(states, filename) + + def load_parameter_state(self, filename: str): + """Load the distributed parameter states of all optimizers from a file. + + Args: + filename (str): path to load parameter state from. + """ + states = None + for idx, optimizer in enumerate(self.chained_optimizers): + if not hasattr(optimizer, 'load_parameter_state_from_dp_zero'): + continue + + # Lazy loading checkpoint, state dict is needed only when DP rank = 0. + if torch.distributed.get_rank(optimizer.data_parallel_group) == 0 and states is None: + states = torch.load(filename) + + state_dict = states[idx] if states else None + optimizer.load_parameter_state_from_dp_zero(state_dict) + + def finish_param_sync(self, model_index: int): + """Finish parameter synchronization for all optimizers.""" + for optimizer in self.chained_optimizers: + optimizer.finish_param_sync(model_index) diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py new file mode 100644 index 0000000..66daea9 --- /dev/null +++ b/megatron/core/optimizer/optimizer_config.py @@ -0,0 +1,116 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass +from typing import Callable, Optional + +import torch + + +@dataclass +class OptimizerConfig: + """Configuration for optimizer.""" + + ############## + # General + ############## + optimizer: str = 'adam' + """Optimizer to use (one of Adam or SGD).""" + + lr: Optional[float] = None + """Initial learning rate. Depending on decay style and initial warmup, the learning rate at each + iteration would be different. + """ + + min_lr: Optional[float] = None + """Minumum value for learning rate. The scheduler clip values below this threshold.""" + + decoupled_lr: Optional[float] = None + """Separate learning rate for the input and output layer.""" + + decoupled_min_lr: Optional[float] = None + """Minimum value for learning rate for the input and output layer. The scheduler clip values + below this threshold. + """ + + weight_decay: float = 0.01 + """Weight decay coefficient for L2 regularization.""" + + ############## + # Precision + ############## + fp16: bool = False + """If true, train with fp16 mixed precision training. Defaults to False.""" + + bf16: bool = False + """If true, train with bf16 mixed precision training. Defaults to False.""" + + params_dtype: torch.dtype = torch.float32 + """dtype used when intializing the weights. Defaults to torch.float32.""" + + ############### + # Loss scaling + ############### + loss_scale: Optional[float] = None + """Static loss scaling, positive power of 2 values can improve fp16 convergence. If None, + dynamic loss scaling is used. + """ + + initial_loss_scale: float = 2 ** 32 + """Initial loss-scale for dynamic loss scaling.""" + + min_loss_scale: float = 1.0 + """Minimum loss scale for dynamic loss scaling.""" + + loss_scale_window: float = 1000 + """Window over which to raise/lower dynamic scale.""" + + hysteresis: int = 2 + """Hysteresis for dynamic loss scaling.""" + + ############## + # Optimizer + ############## + # Adam + adam_beta1: float = 0.9 + """First coefficient for computing running averages of gradient and its square in Adam + optimizer. + """ + + adam_beta2: float = 0.999 + """Second coefficient for computing running averages of gradient and its square in Adam + optimizer. + """ + + adam_eps: float = 1e-08 + """Term added to the denominator to improve numerical stability in Adam optimizer.""" + + # SGD. + sgd_momentum: float = 0.9 + """Momentum factor for SGD optimizer.""" + + ####################### + # Distributed optimizer + ####################### + use_distributed_optimizer: bool = False + """Distribute optimizer state over data-parallel replicas.""" + + overlap_grad_reduce: bool = False + """If true, overlap grad reduce-scatter with backward compute in distributed optimizer.""" + + overlap_param_gather: bool = False + """If true, overlap param all-gather with forward compute in distributed optimizer.""" + + ################ + # Miscellaneous + ################ + clip_grad: float = 1.0 + """Gradient clipping based on global L2 norm.""" + + log_num_zeros_in_grad: bool = False + """If true, calculate and log the number of zeros in gradient.""" + + barrier_with_L1_time: bool = False + """If true, use barrier with level 1 time measurements.""" + + timers: Callable = None + """Function to get timers.""" diff --git a/megatron/core/package_info.py b/megatron/core/package_info.py new file mode 100644 index 0000000..980faab --- /dev/null +++ b/megatron/core/package_info.py @@ -0,0 +1,29 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + + +MAJOR = 0 +MINOR = 8 +PATCH = 0 +PRE_RELEASE = '' + +# Use the following formatting: (major, minor, patch, pre-release) +VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) + +__shortversion__ = '.'.join(map(str, VERSION[:3])) +__version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) + +__package_name__ = 'megatron_core' +__contact_names__ = 'NVIDIA' +__contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email +__homepage__ = ( + 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage +) +__repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' +__download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' +__description__ = ( + 'Megatron Core - a library for efficient and scalable training of transformer based models' +) +__license__ = 'BSD-3' +__keywords__ = ( + 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' +) diff --git a/megatron/core/packed_seq_params.py b/megatron/core/packed_seq_params.py new file mode 100644 index 0000000..478c172 --- /dev/null +++ b/megatron/core/packed_seq_params.py @@ -0,0 +1,13 @@ +from dataclasses import dataclass + +from torch import Tensor + + +@dataclass +class PackedSeqParams: + # parameters to TEDotProductAttention and fused rope kernels for the `thd` (packed) sequence format, + qkv_format: str = None + cu_seqlens_q: Tensor = None + cu_seqlens_kv: Tensor = None + max_seqlen_q: Tensor = None + max_seqlen_kv: Tensor = None diff --git a/megatron/core/parallel_state.py b/megatron/core/parallel_state.py new file mode 100644 index 0000000..ac43ee5 --- /dev/null +++ b/megatron/core/parallel_state.py @@ -0,0 +1,1357 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Model and data parallel groups.""" + +import os +import warnings +from datetime import timedelta +from typing import List, Optional + +import torch + +from .utils import GlobalMemoryBuffer + +# Intra-layer model parallel group that the current rank belongs to. +_TENSOR_MODEL_PARALLEL_GROUP = None +# Inter-layer model parallel group that the current rank belongs to. +_PIPELINE_MODEL_PARALLEL_GROUP = None +# Model parallel group (both intra- and pipeline) that the current rank belongs to. +_MODEL_PARALLEL_GROUP = None +# Model parallel group (both intra-, pipeline, and expert) that the current rank belongs to. +_MODEL_AND_EXPERT_PARALLEL_GROUP = None +# Embedding group. +_EMBEDDING_GROUP = None +# Position embedding group. +_POSITION_EMBEDDING_GROUP = None +# Data parallel group that the current rank belongs to. +_DATA_PARALLEL_GROUP = None +_DATA_PARALLEL_GROUP_GLOO = None +# tensor model parallel group and data parallel group combined +# used for fp8 and moe training +_TENSOR_AND_DATA_PARALLEL_GROUP = None +# Expert parallel group that the current rank belongs to. +_EXPERT_MODEL_PARALLEL_GROUP = None +_TENSOR_AND_EXPERT_PARALLEL_GROUP = None +_DATA_MODULO_EXPERT_PARALLEL_GROUP = None +_DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO = None +_DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP = None +_DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO = None + + +_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None +_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None +_PIPELINE_MODEL_PARALLEL_SPLIT_RANK = None + +# These values enable us to change the mpu sizes on the fly. +_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None +_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None +_MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE = None +_MPU_TENSOR_MODEL_PARALLEL_RANK = None +_MPU_PIPELINE_MODEL_PARALLEL_RANK = None +_MPU_EXPERT_MODEL_PARALLEL_RANK = None + +# A list of ranks that have a copy of the embedding. +_EMBEDDING_GLOBAL_RANKS = None + +# A list of ranks that have a copy of the position embedding. +_POSITION_EMBEDDING_GLOBAL_RANKS = None + +# A list of global ranks for each pipeline group to ease calculation of the source +# rank when broadcasting from the first or last pipeline stage. +_PIPELINE_GLOBAL_RANKS = None + +# A list of global ranks for each data parallel group to ease calculation of the source +# rank when broadcasting weights from src to all other data parallel ranks +_DATA_PARALLEL_GLOBAL_RANKS = None + +# A list of global ranks for each tensor model parallel group to ease calculation of +# the first local rank in the tensor model parallel group +_TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = None + +# Context parallel group that the current rank belongs to +_CONTEXT_PARALLEL_GROUP = None +# A list of global ranks for each context parallel group to ease calculation of the +# destination rank when exchanging KV/dKV between context parallel_ranks +_CONTEXT_PARALLEL_GLOBAL_RANKS = None + +# Data parallel group information with context parallel combined. +_DATA_PARALLEL_GROUP_WITH_CP = None +_DATA_PARALLEL_GROUP_WITH_CP_GLOO = None +_DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = None + +# combined parallel group of TP and CP +_TENSOR_AND_CONTEXT_PARALLEL_GROUP = None + +# combined parallel group of TP, DP, and CP used for fp8 +_TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = None + +# Memory buffers to avoid dynamic memory allocation +_GLOBAL_MEMORY_BUFFER = None + +# MOE logging +_MOE_LAYER_WISE_LOGGING_TRACKER = {} + + +def get_nccl_options(pg_name, nccl_comm_cfgs): + """Set the NCCL process group options. + + Args: + pg_name (str): process group name + nccl_comm_cfgs (dict): nccl communicator configurations + + When an option (e.g., max_ctas) is not found in the config, use the NCCL default setting. + """ + if pg_name in nccl_comm_cfgs: + nccl_options = torch.distributed.ProcessGroupNCCL.Options() + nccl_options.config.cga_cluster_size = nccl_comm_cfgs[pg_name].get('cga_cluster_size', 4) + nccl_options.config.max_ctas = nccl_comm_cfgs[pg_name].get('max_ctas', 32) + nccl_options.config.min_ctas = nccl_comm_cfgs[pg_name].get('min_ctas', 1) + return nccl_options + else: + return None + + +def generate_masked_orthogonal_rank_groups( + world_size: int, + parallel_size: List[int], + mask: List[bool], +) -> List[List[int]]: + """Generate orthogonal parallel groups based on the parallel size and mask. + + Arguments: + world_size (int): world size + + parallel_size (List[int]): + The parallel size of each orthogonal parallel type. For example, if + tensor_parallel_size = 2, pipeline_model_parallel_group = 3, data_parallel_size = 4, + and the parallel mapping order is tp-pp-dp, then the parallel_size = [2, 3, 4]. + + mask (List[bool]): + The mask controls which parallel methods the generated groups represent. If mask[i] is + True, it means the generated group contains the i-th parallelism method. For example, + if parallel_size = [tp_size, pp_size, dp_size], and mask = [True, False , True], then + the generated group is the `tp-dp` group, if the mask = [False, True, False], then the + generated group is the `pp` group. + + Algorithm: + For orthogonal parallelism, such as tp/dp/pp/cp, the global_rank and + local_rank satisfy the following equation: + global_rank = tp_rank + dp_rank * tp_size + pp_rank * tp_size * dp_size (1) + tp_rank \in [0, tp_size) + dp_rank \in [0, dp_size) + pp_rank \in [0, pp_size) + + If we want to get the `dp_group` (tp_size * pp_size groups of dp_size ranks each. + For example, if the gpu size is 8 and order is 'tp-pp-dp', size is '2-2-2', and the + dp_group here is [[0, 4], [1, 5], [2, 6], [3, 7]].) + The tp_rank and pp_rank will be combined to form the `dp_group_index`. + dp_group_index = tp_rank + pp_rank * tp_size (2) + + So, Given that tp_rank and pp_rank satisfy equation (2), and dp_rank in + range(0, dp_size), the ranks in dp_group[dp_group_index] satisfies the + equation (1). + + This function solve this math problem. + + For example, if the parallel_size = [tp_size, dp_size, pp_size] = [2, 3, 4], + and the mask = [False, True, False]. Then, + dp_group_index(0) = tp_rank(0) + pp_rank(0) * 2 + dp_group_index(1) = tp_rank(1) + pp_rank(0) * 2 + ... + dp_group_index(7) = tp_rank(1) + pp_rank(3) * 2 + + dp_group[0] = 0 + range(0, 3) * 2 + 0 = [0, 2, 4] + dp_group[1] = 1 + range(0, 3) * 2 + 0 = [1, 3, 5] + ... + dp_group[7] = 1 + range(0, 3) * 2 + 3 * 2 * 3 = [19, 21, 23] + """ + + def prefix_product(a: List[int], init=1) -> List[int]: + r = [init] + for v in a: + init = init * v + r.append(init) + return r + + def inner_product(a: List[int], b: List[int]) -> int: + return sum([x * y for x, y in zip(a, b)]) + + def decompose(index, shape, stride=None): + ''' + This function solve the math problem below: + There is an equation: + index = sum(idx[i] * stride[i]) + And given the value of index, stride. + Return the idx. + This function will used to get the pp/dp/pp_rank + from group_index and rank_in_group. + ''' + if stride is None: + stride = prefix_product(shape) + idx = [(index // d) % s for s, d in zip(shape, stride)] + # stride is a prefix_product result. And the value of stride[-1] + # is not used. + assert ( + sum([x * y for x, y in zip(idx, stride[:-1])]) == index + ), "idx {} with shape {} mismatch the return idx {}".format(index, shape, idx) + return idx + + masked_shape = [s for s, m in zip(parallel_size, mask) if m] + unmasked_shape = [s for s, m in zip(parallel_size, mask) if not m] + + global_stride = prefix_product(parallel_size) + masked_stride = [d for d, m in zip(global_stride, mask) if m] + unmasked_stride = [d for d, m in zip(global_stride, mask) if not m] + + group_size = prefix_product(masked_shape)[-1] + num_of_group = world_size // group_size + + ranks = [] + for group_index in range(num_of_group): + # get indices from unmaksed for group_index. + decomposed_group_idx = decompose(group_index, unmasked_shape) + rank = [] + for rank_in_group in range(group_size): + # get indices from masked for rank_in_group. + decomposed_rank_idx = decompose(rank_in_group, masked_shape) + rank.append( + inner_product(decomposed_rank_idx, masked_stride) + + inner_product(decomposed_group_idx, unmasked_stride) + ) + ranks.append(rank) + return ranks + + +class RankGenerator(object): + def __init__(self, tp: int, ep: int, dp: int, pp: int, cp: int, order: str) -> None: + self.tp = tp + self.ep = ep + self.dp = dp + self.pp = pp + self.cp = cp + self.world_size = tp * dp * pp * cp + + self.name_to_size = { + "tp": self.tp, + "pp": self.pp, + "dp": self.dp, + "ep": self.ep, + "cp": self.cp, + } + self.order = order + order = order.lower() + + if 'ep' in order: + if 'ep-dp' not in order and 'dp-ep' not in order: + raise RuntimeError(f"The ep and dp must be adjacent in order ({self.order}).") + + for name in self.name_to_size.keys(): + if name not in order and self.name_to_size[name] != 1: + raise RuntimeError( + f"The size of ({name}) is ({self.name_to_size[name]}), but you haven't specified the order ({self.order})." + ) + elif name not in order: + order = order + '-' + name + + self.order_w_ep = order + self.order_wo_ep = '-'.join([token for token in order.split('-') if token != 'ep']) + self.ordered_size_wo_ep = [] + self.ordered_size_w_ep = [] + + for token in order.split('-'): + if token == 'dp': + self.ordered_size_w_ep.append(self.dp // self.ep) + self.ordered_size_wo_ep.append(self.dp) + elif token == 'ep': + self.ordered_size_w_ep.append(self.ep) + else: + self.ordered_size_w_ep.append(self.name_to_size[token]) + self.ordered_size_wo_ep.append(self.name_to_size[token]) + + def get_mask(self, order: str, token: str): + ordered_token = order.split('-') + token = token.split('-') + mask = [False] * len(ordered_token) + for t in token: + mask[ordered_token.index(t)] = True + return mask + + def get_ranks(self, token, independent_ep=False): + '''Get rank group by input token. + + Arguments: + token (str): + Specify the ranks type that want to get. If we want + to obtain multiple parallel types, we can use a hyphen + '-' to separate them. For example, if we want to obtain + the TP_DP group, the token should be 'tp-dp'. + + independent_ep (bool: True): + This flag controls whether we treat EP and DP independently. + EP shares ranks with DP, if we want to get ranks related to + EP, we should set the flag. For example, get_ranks('dp', True) + will get DP modulo EP group, and get_ranks('dp', False) will + get full DP group. + ''' + if independent_ep: + parallel_size = self.ordered_size_w_ep + order = self.order_w_ep + else: + parallel_size = self.ordered_size_wo_ep + order = self.order_wo_ep + mask = self.get_mask(order, token) + ranks = generate_masked_orthogonal_rank_groups(self.world_size, parallel_size, mask) + return ranks + + +def initialize_model_parallel( + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + virtual_pipeline_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_split_rank: Optional[int] = None, + use_sharp: bool = False, + context_parallel_size: int = 1, + expert_model_parallel_size: int = 1, + nccl_communicator_config_path: Optional[str] = None, + distributed_timeout_minutes: int = 30, + order: str = "tp-cp-ep-dp-pp", +) -> None: + """Initialize model data parallel groups. + + Args: + tensor_model_parallel_size (int, default = 1): + The number of GPUs to split individual tensors across. + + pipeline_model_parallel_size (int, default = 1): + The number of tensor parallel GPU groups to split the + Transformer layers across. For example, if + tensor_model_parallel_size is 4 and + pipeline_model_parallel_size is 2, the model will be split + into 2 groups of 4 GPUs. + + virtual_pipeline_model_parallel_size (int, optional): + The number of stages that each pipeline group will have, + interleaving as necessary. If None, no interleaving is + performed. For example, if tensor_model_parallel_size is 1, + pipeline_model_parallel_size is 4, + virtual_pipeline_model_parallel_size is 2, and there are + 16 transformer layers in the model, the model will be + split into 8 stages with two layers each and each GPU + would get 2 stages as such (layer number starting with 1): + + GPU 0: [1, 2] [9, 10] + GPU 1: [3, 4] [11, 12] + GPU 2: [5, 6] [13, 14] + GPU 3: [7, 8] [15, 16] + + pipeline_model_parallel_split_rank (int, optional): + For models with both an encoder and decoder, the rank in + pipeline to switch between encoder and decoder (i.e. the + first rank of the decoder). This allows the user to set + the pipeline parallel size of the encoder and decoder + independently. For example, if + pipeline_model_parallel_size is 8 and + pipeline_model_parallel_split_rank is 3, then ranks 0-2 + will be the encoder and ranks 3-7 will be the decoder. + + use_sharp (bool, default = False): + Set the use of SHARP for the collective communications of + data-parallel process groups. When `True`, run barrier + within each data-parallel process group, which specifies + the SHARP application target groups. + + context_parallel_size (int, default = 1): + The number of tensor parallel GPU groups to split the + network input sequence length across. Compute of attention + module requires tokens of full sequence length, so GPUs + in a context parallel group need to communicate with each + other to exchange information of other sequence chunks. + Each GPU and its counterparts in other tensor parallel + groups compose a context parallel group. + + For example, assume we have 8 GPUs, if tensor model parallel + size is 4 and context parallel size is 2, the network input + will be split into two sequence chunks, which are processed + by 2 different groups of 4 GPUs. One chunk is processed by + GPU0-3, the other chunk is processed by GPU4-7. Four groups + are build to do context parallel communications: [GPU0, GPU4], + [GPU1, GPU5], [GPU2, GPU6], and [GPU3, GPU7]. + + Context parallelism partitions sequence length, so it has no + impact on weights, which means weights are duplicated among + GPUs in a context parallel group. Hence, weight gradients + all-reduce is required in backward. For simplicity, we piggyback + GPUs of context parallelism on data parallel group for + weight gradient all-reduce. + + expert_model_parallel_size (int, default = 1): + The number of Mixture of Experts parallel GPUs in each expert + parallel group. + + nccl_communicator_config_path (str, default = None): + Path to the yaml file of NCCL communicator configurations. + `min_ctas`, `max_ctas`, and `cga_cluster_size` can be set + for each communicator. + + distributed_timeout_minutes (int, default = 30): Timeout, in + minutes,for operations executed against distributed + process groups. See PyTorch documentation at + https://pytorch.org/docs/stable/distributed.html for + caveats. + + order (str, default=tp-dp-pp): + The rank initialization order of parallelism. Now we support + tp-dp-pp and tp-pp-dp orders. + + Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we + use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize + the model pipeline. The present function will + create 8 tensor model-parallel groups, 4 pipeline model-parallel groups + and 8 data-parallel groups as: + 8 data_parallel groups: + [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15] + 8 tensor model-parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15] + 4 pipeline model-parallel groups: + [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + + """ + # Get world size and rank. Ensure some consistencies. + assert torch.distributed.is_initialized() + world_size: int = torch.distributed.get_world_size() + + if ( + world_size + % (tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size) + != 0 + ): + raise RuntimeError( + f"world_size ({world_size}) is not divisible by tensor_model_parallel_size " + f"({tensor_model_parallel_size}) x pipeline_model_parallel_size ({pipeline_model_parallel_size}) " + f"x context_parallel_size ({context_parallel_size})" + ) + + data_parallel_size: int = world_size // ( + tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size + ) + + if data_parallel_size % expert_model_parallel_size != 0: + raise RuntimeError( + f"data_parallel_size ({data_parallel_size}) is not divisible by expert_model_parallel_size " + ) + + if virtual_pipeline_model_parallel_size is not None: + if not pipeline_model_parallel_size > 1: + raise RuntimeError( + "pipeline-model-parallel size should be greater than 1 with interleaved schedule" + ) + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0 + _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size + + if pipeline_model_parallel_split_rank is not None: + global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK + _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = pipeline_model_parallel_split_rank + + rank = torch.distributed.get_rank() + + nccl_comm_cfgs = {} + if nccl_communicator_config_path is not None: + try: + import yaml + except ImportError: + raise RuntimeError( + "Cannot import `yaml`. Setting custom nccl communicator configs " + "requires the yaml package." + ) + + with open(nccl_communicator_config_path, "r") as stream: + nccl_comm_cfgs = yaml.safe_load(stream) + + rank_generator = RankGenerator( + tp=tensor_model_parallel_size, + ep=expert_model_parallel_size, + dp=data_parallel_size, + pp=pipeline_model_parallel_size, + cp=context_parallel_size, + order=order, + ) + timeout = timedelta(minutes=distributed_timeout_minutes) + + # Build the data-parallel groups. + global _DATA_PARALLEL_GROUP + global _DATA_PARALLEL_GROUP_GLOO + global _DATA_PARALLEL_GLOBAL_RANKS + global _DATA_PARALLEL_GROUP_WITH_CP + global _DATA_PARALLEL_GROUP_WITH_CP_GLOO + global _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP + assert _DATA_PARALLEL_GROUP is None, 'data parallel group is already initialized' + + for ranks in rank_generator.get_ranks('dp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('dp', nccl_comm_cfgs) + ) + group_gloo = torch.distributed.new_group(ranks, timeout=timeout, backend="gloo") + if rank in ranks: + _DATA_PARALLEL_GROUP = group + _DATA_PARALLEL_GROUP_GLOO = group_gloo + _DATA_PARALLEL_GLOBAL_RANKS = ranks + for ranks_with_cp in rank_generator.get_ranks('dp-cp'): + group_with_cp = torch.distributed.new_group( + ranks_with_cp, timeout=timeout, pg_options=get_nccl_options('dp_cp', nccl_comm_cfgs) + ) + group_with_cp_gloo = torch.distributed.new_group( + ranks_with_cp, timeout=timeout, backend="gloo" + ) + if rank in ranks_with_cp: + _DATA_PARALLEL_GROUP_WITH_CP = group_with_cp + _DATA_PARALLEL_GROUP_WITH_CP_GLOO = group_with_cp_gloo + _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = ranks_with_cp + + # Apply SHARP to DP process groups + if use_sharp: + if rank == 0: + print( + "The number of process groups to use SHARP with depends on the type " + "of the network switch. Nvidia QM1 switch supports SAHRP up to 8 " + "process groups and QM2 supports up to 256 process groups. We apply " + "SHARP to the communications of the data-parallel domain. If the " + "number of data-parallel process groups is larger than the max " + "process groups that the network switch supports, the communication " + "will fall back to non-SHARP operators. To enable SHARP, " + "`#SBATCH_NETWORK=sharp` should be set in the sbatch script." + ) + torch.distributed.barrier( + group=get_data_parallel_group(with_context_parallel=True), + device_ids=[torch.cuda.current_device()], + ) + # Set `NCCL_COLLNET_ENABLE=0` to restrict SHARP application to DP process groups + os.environ["NCCL_COLLNET_ENABLE"] = "0" + + # Build the context-parallel groups. + global _CONTEXT_PARALLEL_GROUP + global _CONTEXT_PARALLEL_GLOBAL_RANKS + assert _CONTEXT_PARALLEL_GROUP is None, 'context parallel group is already initialized' + for ranks in rank_generator.get_ranks('cp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('cp', nccl_comm_cfgs) + ) + if rank in ranks: + _CONTEXT_PARALLEL_GROUP = group + _CONTEXT_PARALLEL_GLOBAL_RANKS = ranks + + # Build the model-parallel groups. + global _MODEL_PARALLEL_GROUP + assert _MODEL_PARALLEL_GROUP is None, 'model parallel group is already initialized' + for ranks in rank_generator.get_ranks('tp-pp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('mp', nccl_comm_cfgs) + ) + if rank in ranks: + _MODEL_PARALLEL_GROUP = group + + # Build the model-parallel groups with expert parallel + global _MODEL_AND_EXPERT_PARALLEL_GROUP + assert ( + _MODEL_AND_EXPERT_PARALLEL_GROUP is None + ), 'model and expert parallel group is already initialized' + for ranks in rank_generator.get_ranks('tp-ep-pp', independent_ep=True): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('mp_exp', nccl_comm_cfgs) + ) + if rank in ranks: + _MODEL_AND_EXPERT_PARALLEL_GROUP = group + + # Build the tensor model-parallel groups. + global _TENSOR_MODEL_PARALLEL_GROUP + global _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS + assert ( + _TENSOR_MODEL_PARALLEL_GROUP is None + ), 'tensor model parallel group is already initialized' + for ranks in rank_generator.get_ranks('tp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('tp', nccl_comm_cfgs) + ) + if rank in ranks: + _TENSOR_MODEL_PARALLEL_GROUP = group + _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = ranks + + # Build the pipeline model-parallel groups and embedding groups + # (first and last rank in each pipeline model-parallel group). + global _PIPELINE_MODEL_PARALLEL_GROUP + global _PIPELINE_GLOBAL_RANKS + assert ( + _PIPELINE_MODEL_PARALLEL_GROUP is None + ), 'pipeline model parallel group is already initialized' + global _EMBEDDING_GROUP + global _EMBEDDING_GLOBAL_RANKS + assert _EMBEDDING_GROUP is None, 'embedding group is already initialized' + global _POSITION_EMBEDDING_GROUP + global _POSITION_EMBEDDING_GLOBAL_RANKS + assert _POSITION_EMBEDDING_GROUP is None, 'position embedding group is already initialized' + for ranks in rank_generator.get_ranks('pp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('pp', nccl_comm_cfgs) + ) + if rank in ranks: + _PIPELINE_MODEL_PARALLEL_GROUP = group + _PIPELINE_GLOBAL_RANKS = ranks + # Setup embedding group (to exchange gradients between + # first and last stages). + if len(ranks) > 1: + embedding_ranks = [ranks[0], ranks[-1]] + position_embedding_ranks = [ranks[0]] + if pipeline_model_parallel_split_rank is not None: + if ranks[pipeline_model_parallel_split_rank] not in embedding_ranks: + embedding_ranks = [ + ranks[0], + ranks[pipeline_model_parallel_split_rank], + ranks[-1], + ] + if ranks[pipeline_model_parallel_split_rank] not in position_embedding_ranks: + position_embedding_ranks = [ranks[0], ranks[pipeline_model_parallel_split_rank]] + else: + embedding_ranks = ranks + position_embedding_ranks = ranks + + group = torch.distributed.new_group( + embedding_ranks, timeout=timeout, pg_options=get_nccl_options('embd', nccl_comm_cfgs) + ) + if rank in embedding_ranks: + _EMBEDDING_GROUP = group + if rank in ranks: + _EMBEDDING_GLOBAL_RANKS = embedding_ranks + + group = torch.distributed.new_group( + position_embedding_ranks, + timeout=timeout, + pg_options=get_nccl_options('embd', nccl_comm_cfgs), + ) + if rank in position_embedding_ranks: + _POSITION_EMBEDDING_GROUP = group + if rank in ranks: + _POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks + + # Build the tensor + data parallel groups. + global _TENSOR_AND_DATA_PARALLEL_GROUP + global _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP + assert ( + _TENSOR_AND_DATA_PARALLEL_GROUP is None + ), 'Tensor + data parallel group is already initialized' + for ranks in rank_generator.get_ranks('tp-dp-cp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('tp_dp_cp', nccl_comm_cfgs) + ) + if rank in ranks: + _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = group + for ranks in rank_generator.get_ranks('tp-dp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('tp_dp', nccl_comm_cfgs) + ) + if rank in ranks: + _TENSOR_AND_DATA_PARALLEL_GROUP = group + + global _TENSOR_AND_CONTEXT_PARALLEL_GROUP + assert ( + _TENSOR_AND_CONTEXT_PARALLEL_GROUP is None + ), 'Tensor + context parallel group is already initialized' + for ranks in rank_generator.get_ranks('tp-cp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('tp_cp', nccl_comm_cfgs) + ) + if rank in ranks: + _TENSOR_AND_CONTEXT_PARALLEL_GROUP = group + + # Build the tensor + expert parallel groups + global _EXPERT_MODEL_PARALLEL_GROUP + assert _EXPERT_MODEL_PARALLEL_GROUP is None, 'Expert parallel group is already initialized' + global _TENSOR_AND_EXPERT_PARALLEL_GROUP + assert ( + _TENSOR_AND_EXPERT_PARALLEL_GROUP is None + ), 'Tensor + expert parallel group is already initialized' + global _DATA_MODULO_EXPERT_PARALLEL_GROUP + assert ( + _DATA_MODULO_EXPERT_PARALLEL_GROUP is None + ), 'Data modulo expert group is already initialized' + global _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP + assert ( + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP is None + ), 'Data modulo expert group with context parallel is already initialized' + global _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO + global _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO + + for ranks in rank_generator.get_ranks('tp-ep', independent_ep=True): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('tp_exp', nccl_comm_cfgs) + ) + if rank in ranks: + _TENSOR_AND_EXPERT_PARALLEL_GROUP = group + + for ranks in rank_generator.get_ranks('ep', independent_ep=True): + group = torch.distributed.new_group( + ranks, pg_options=get_nccl_options('exp', nccl_comm_cfgs) + ) + if rank in ranks: + _EXPERT_MODEL_PARALLEL_GROUP = group + + for ranks in rank_generator.get_ranks('dp', independent_ep=True): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('dp_modulo_exp', nccl_comm_cfgs) + ) + group_gloo = torch.distributed.new_group(ranks, backend="gloo") + if rank in ranks: + _DATA_MODULO_EXPERT_PARALLEL_GROUP = group + _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO = group_gloo + + for ranks in rank_generator.get_ranks('dp-cp', independent_ep=True): + # Lazy initialization of the group + if get_context_parallel_world_size() > 1: + group = torch.distributed.new_group( + ranks, + timeout=timeout, + pg_options=get_nccl_options('dp_modulo_exp_cp', nccl_comm_cfgs), + ) + group_gloo = torch.distributed.new_group(ranks, backend="gloo") + else: + group = _DATA_MODULO_EXPERT_PARALLEL_GROUP + group_gloo = _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO + if rank in ranks: + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP = group + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO = group_gloo + + # Initialize global memory buffer + # This isn't really "parallel state" but there isn't another good place to + # put this. If we end up with a more generic initialization of megatron-core + # we could stick it there + _set_global_memory_buffer() + + +def is_initialized(): + """Useful for code segments that may be accessed with or without mpu initialization""" + return _DATA_PARALLEL_GROUP is not None + + +def is_unitialized() -> bool: + """Check if parallel state has been initialized + + Deprecated. Use is_initialized instead. + + """ + warnings.warn( + "is_unitialized is deprecated, use is_initialized instead", + DeprecationWarning, + ) + return not is_initialized() + + +def model_parallel_is_initialized(): + """Check if model and data parallel groups are initialized.""" + if ( + _TENSOR_MODEL_PARALLEL_GROUP is None + or _PIPELINE_MODEL_PARALLEL_GROUP is None + or _DATA_PARALLEL_GROUP is None + ): + return False + return True + + +def get_model_parallel_group(with_expert_parallel=False): + """Get the model parallel group the caller rank belongs to.""" + if with_expert_parallel: + assert ( + _MODEL_AND_EXPERT_PARALLEL_GROUP is not None + ), 'model parallel group is not initialized' + return _MODEL_AND_EXPERT_PARALLEL_GROUP + assert _MODEL_PARALLEL_GROUP is not None, 'model parallel group is not initialized' + return _MODEL_PARALLEL_GROUP + + +def get_tensor_model_parallel_group(check_initialized=True): + """Get the tensor model parallel group the caller rank belongs to.""" + if check_initialized: + assert ( + _TENSOR_MODEL_PARALLEL_GROUP is not None + ), 'tensor model parallel group is not initialized' + return _TENSOR_MODEL_PARALLEL_GROUP + + +def get_pipeline_model_parallel_group(): + """Get the pipeline model parallel group the caller rank belongs to.""" + assert ( + _PIPELINE_MODEL_PARALLEL_GROUP is not None + ), 'pipeline_model parallel group is not initialized' + return _PIPELINE_MODEL_PARALLEL_GROUP + + +def get_data_parallel_group(with_context_parallel=False): + """Get the data parallel group the caller rank belongs to.""" + if with_context_parallel: + assert ( + _DATA_PARALLEL_GROUP_WITH_CP is not None + ), 'data parallel group with context parallel combined is not initialized' + return _DATA_PARALLEL_GROUP_WITH_CP + else: + assert _DATA_PARALLEL_GROUP is not None, 'data parallel group is not initialized' + return _DATA_PARALLEL_GROUP + + +def get_data_parallel_group_gloo(with_context_parallel=False): + """Get the data parallel group-gloo the caller rank belongs to.""" + if with_context_parallel: + assert ( + _DATA_PARALLEL_GROUP_WITH_CP_GLOO is not None + ), 'data parallel group-gloo with context parallel combined is not initialized' + return _DATA_PARALLEL_GROUP_WITH_CP_GLOO + else: + assert _DATA_PARALLEL_GROUP_GLOO is not None, 'data parallel group-gloo is not initialized' + return _DATA_PARALLEL_GROUP_GLOO + + +def get_context_parallel_group(check_initialized=True): + """Get the context parallel group the caller rank belongs to.""" + if check_initialized: + assert _CONTEXT_PARALLEL_GROUP is not None, 'context parallel group is not initialized' + return _CONTEXT_PARALLEL_GROUP + + +def get_context_parallel_global_ranks(check_initialized=True): + """Get all global ranks of the context parallel group that the caller rank belongs to.""" + if check_initialized: + assert ( + _CONTEXT_PARALLEL_GLOBAL_RANKS is not None + ), 'context parallel group is not initialized' + return _CONTEXT_PARALLEL_GLOBAL_RANKS + + +def get_embedding_group(): + """Get the embedding group the caller rank belongs to.""" + assert _EMBEDDING_GROUP is not None, 'embedding group is not initialized' + return _EMBEDDING_GROUP + + +def get_position_embedding_group(): + """Get the position embedding group the caller rank belongs to.""" + assert _POSITION_EMBEDDING_GROUP is not None, 'position embedding group is not initialized' + return _POSITION_EMBEDDING_GROUP + + +def get_amax_reduction_group(with_context_parallel=False): + """Get the FP8 amax reduction group the caller rank belongs to.""" + if with_context_parallel: + assert ( + _TENSOR_AND_CONTEXT_PARALLEL_GROUP is not None + ), 'FP8 amax reduction group is not initialized' + return _TENSOR_AND_CONTEXT_PARALLEL_GROUP + else: + assert ( + _TENSOR_MODEL_PARALLEL_GROUP is not None + ), 'FP8 amax reduction group is not initialized' + return _TENSOR_MODEL_PARALLEL_GROUP + + +def get_tensor_and_data_parallel_group(with_context_parallel=False): + """Get the tensor and data parallel group the caller rank belongs to.""" + if with_context_parallel: + assert ( + _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP is not None + ), 'tensor and data parallel group is not initialized' + return _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP + else: + assert ( + _TENSOR_AND_DATA_PARALLEL_GROUP is not None + ), 'tensor and data parallel group is not initialized' + return _TENSOR_AND_DATA_PARALLEL_GROUP + + +def get_tensor_and_context_parallel_group(): + """Get the tensor and context parallel group the caller rank belongs to.""" + assert ( + _TENSOR_AND_CONTEXT_PARALLEL_GROUP is not None + ), 'tensor and context parallel group is not initialized' + return _TENSOR_AND_CONTEXT_PARALLEL_GROUP + + +def get_expert_model_parallel_group(): + assert ( + _EXPERT_MODEL_PARALLEL_GROUP is not None + ), 'expert model parallel group is not initialized' + return _EXPERT_MODEL_PARALLEL_GROUP + + +def get_tensor_and_expert_parallel_group(): + assert ( + _TENSOR_AND_EXPERT_PARALLEL_GROUP is not None + ), 'tensor and expert parallel group is not initialized' + return _TENSOR_AND_EXPERT_PARALLEL_GROUP + + +def get_data_modulo_expert_parallel_group(with_context_parallel=False): + if with_context_parallel: + assert ( + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP is not None + ), 'data modulo expert parallel group with context parallel is not initialized' + return _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP + else: + assert ( + _DATA_MODULO_EXPERT_PARALLEL_GROUP is not None + ), 'data modulo expert parallel group is not initialized' + return _DATA_MODULO_EXPERT_PARALLEL_GROUP + + +def get_data_modulo_expert_parallel_group_gloo(with_context_parallel=False): + if with_context_parallel: + assert ( + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO is not None + ), 'data modulo expert parallel group-gloo with context parallel is not initialized' + return _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO + else: + assert ( + _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO is not None + ), 'data modulo expert parallel group-gloo is not initialized' + return _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO + + +def set_expert_model_parallel_world_size(world_size): + global _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE + _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE = world_size + + +def set_tensor_model_parallel_world_size(world_size): + """Set the tensor model parallel size""" + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size + + +def set_pipeline_model_parallel_world_size(world_size): + """Set the pipeline model parallel size""" + global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size + + +def set_virtual_pipeline_model_parallel_world_size(world_size): + """Set the pipeline model parallel size""" + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size + + +def get_tensor_model_parallel_world_size(): + """Return world size for the tensor model parallel group.""" + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None: + return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + return torch.distributed.get_world_size(group=get_tensor_model_parallel_group()) + + +def get_pipeline_model_parallel_world_size(): + """Return world size for the pipeline model parallel group.""" + global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None: + return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group()) + + +def set_expert_model_parallel_rank(rank): + """Set expert model parallel rank.""" + global _MPU_EXPERT_MODEL_PARALLEL_RANK + _MPU_EXPERT_MODEL_PARALLEL_RANK = rank + + +def set_tensor_model_parallel_rank(rank): + """Set tensor model parallel rank.""" + global _MPU_TENSOR_MODEL_PARALLEL_RANK + _MPU_TENSOR_MODEL_PARALLEL_RANK = rank + + +def set_pipeline_model_parallel_rank(rank): + """Set pipeline model parallel rank.""" + global _MPU_PIPELINE_MODEL_PARALLEL_RANK + _MPU_PIPELINE_MODEL_PARALLEL_RANK = rank + + +def set_pipeline_model_parallel_split_rank(rank): + """Set pipeline model parallel split rank.""" + global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK + _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = rank + + +def get_tensor_model_parallel_rank(): + """Return my rank for the tensor model parallel group.""" + global _MPU_TENSOR_MODEL_PARALLEL_RANK + if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None: + return _MPU_TENSOR_MODEL_PARALLEL_RANK + return torch.distributed.get_rank(group=get_tensor_model_parallel_group()) + + +def get_pipeline_model_parallel_rank(): + """Return my rank for the pipeline model parallel group.""" + global _MPU_PIPELINE_MODEL_PARALLEL_RANK + if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None: + return _MPU_PIPELINE_MODEL_PARALLEL_RANK + return torch.distributed.get_rank(group=get_pipeline_model_parallel_group()) + + +def get_pipeline_model_parallel_split_rank(): + """Return pipeline model parallel split rank.""" + global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK + return _PIPELINE_MODEL_PARALLEL_SPLIT_RANK + + +def is_pipeline_first_stage(ignore_virtual=False): + """Return True if in the first pipeline model-parallel stage, False otherwise.""" + if not ignore_virtual: + if ( + get_virtual_pipeline_model_parallel_world_size() is not None + and get_virtual_pipeline_model_parallel_rank() != 0 + ): + return False + return get_pipeline_model_parallel_rank() == 0 + + +def is_pipeline_last_stage(ignore_virtual=False): + """Return True if in the last pipeline model-parallel stage, False otherwise.""" + if not ignore_virtual: + virtual_pipeline_model_parallel_world_size = ( + get_virtual_pipeline_model_parallel_world_size() + ) + if ( + virtual_pipeline_model_parallel_world_size is not None + and get_virtual_pipeline_model_parallel_rank() + != (virtual_pipeline_model_parallel_world_size - 1) + ): + return False + return get_pipeline_model_parallel_rank() == (get_pipeline_model_parallel_world_size() - 1) + + +def is_rank_in_embedding_group(ignore_virtual=False): + """Return true if current rank is in embedding group, False otherwise.""" + rank = torch.distributed.get_rank() + global _EMBEDDING_GLOBAL_RANKS + if ignore_virtual: + return rank in _EMBEDDING_GLOBAL_RANKS + if rank in _EMBEDDING_GLOBAL_RANKS: + if rank == _EMBEDDING_GLOBAL_RANKS[0]: + return is_pipeline_first_stage(ignore_virtual=False) + elif rank == _EMBEDDING_GLOBAL_RANKS[-1]: + return is_pipeline_last_stage(ignore_virtual=False) + else: + return True + return False + + +def is_rank_in_position_embedding_group(): + """Return true if current rank is in position embedding group, False otherwise.""" + rank = torch.distributed.get_rank() + global _POSITION_EMBEDDING_GLOBAL_RANKS + return rank in _POSITION_EMBEDDING_GLOBAL_RANKS + + +def is_pipeline_stage_before_split(rank=None): + """Return True if pipeline stage executes encoder block for a model + with both encoder and decoder.""" + if get_pipeline_model_parallel_world_size() == 1: + return True + if rank is None: + rank = get_pipeline_model_parallel_rank() + global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK + if _PIPELINE_MODEL_PARALLEL_SPLIT_RANK is None: + return True + if rank < _PIPELINE_MODEL_PARALLEL_SPLIT_RANK: + return True + return False + + +def is_pipeline_stage_after_split(rank=None): + """Return True if pipeline stage executes decoder block for a model + with both encoder and decoder.""" + if get_pipeline_model_parallel_world_size() == 1: + return True + if rank is None: + rank = get_pipeline_model_parallel_rank() + global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK + if _PIPELINE_MODEL_PARALLEL_SPLIT_RANK is None: + return True + if rank >= _PIPELINE_MODEL_PARALLEL_SPLIT_RANK: + return True + return False + + +def is_pipeline_stage_at_split(): + """Return true if pipeline stage executes decoder block and next + stage executes encoder block for a model with both encoder and + decoder.""" + rank = get_pipeline_model_parallel_rank() + return is_pipeline_stage_before_split(rank) and is_pipeline_stage_after_split(rank + 1) + + +def get_virtual_pipeline_model_parallel_rank(): + """Return the virtual pipeline-parallel rank.""" + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK + return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK + + +def set_virtual_pipeline_model_parallel_rank(rank): + """Set the virtual pipeline-parallel rank.""" + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK + _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = rank + + +def get_virtual_pipeline_model_parallel_world_size(): + """Return the virtual pipeline-parallel world size.""" + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + return _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + + +def get_tensor_model_parallel_src_rank(): + """Calculate the global rank corresponding to the first local rank + in the tensor model parallel group.""" + assert ( + _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS is not None + ), "Tensor model parallel group is not initialized" + return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS[0] + + +def get_data_parallel_src_rank(with_context_parallel=False): + """Calculate the global rank corresponding to the first local rank + in the data parallel group.""" + if with_context_parallel: + assert ( + _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP is not None + ), "Data parallel group with context parallel combined is not initialized" + return _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP[0] + else: + assert _DATA_PARALLEL_GLOBAL_RANKS is not None, "Data parallel group is not initialized" + return _DATA_PARALLEL_GLOBAL_RANKS[0] + + +def get_pipeline_model_parallel_first_rank(): + """Return the global rank of the first process in the pipeline for the + current tensor parallel group""" + assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" + return _PIPELINE_GLOBAL_RANKS[0] + + +def get_pipeline_model_parallel_last_rank(): + """Return the global rank of the last process in the pipeline for the + current tensor parallel group""" + assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" + last_rank_local = get_pipeline_model_parallel_world_size() - 1 + return _PIPELINE_GLOBAL_RANKS[last_rank_local] + + +def get_pipeline_model_parallel_next_rank(): + """Return the global rank that follows the caller in the pipeline""" + assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" + rank_in_pipeline = get_pipeline_model_parallel_rank() + world_size = get_pipeline_model_parallel_world_size() + return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size] + + +def get_pipeline_model_parallel_prev_rank(): + """Return the global rank that preceeds the caller in the pipeline""" + assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" + rank_in_pipeline = get_pipeline_model_parallel_rank() + world_size = get_pipeline_model_parallel_world_size() + return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size] + + +def get_data_parallel_world_size(with_context_parallel=False): + """Return world size for the data parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_world_size( + group=get_data_parallel_group(with_context_parallel=with_context_parallel) + ) + else: + return 0 + + +def get_data_parallel_rank(with_context_parallel=False): + """Return my rank for the data parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_rank( + group=get_data_parallel_group(with_context_parallel=with_context_parallel) + ) + else: + return 0 + + +def get_context_parallel_world_size(): + """Return world size for the context parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_world_size(group=get_context_parallel_group()) + else: + return 0 + + +def get_context_parallel_rank(): + """Return my rank for the context parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_rank(group=get_context_parallel_group()) + else: + return 0 + + +def get_tensor_and_context_parallel_world_size(): + """Return world size for the tensor and context parallel group""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_world_size(group=get_tensor_and_context_parallel_group()) + else: + return 0 + + +def get_tensor_and_context_parallel_rank(): + """Return my rank for the tensor and context parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_rank(group=get_tensor_and_context_parallel_group()) + else: + return 0 + + +def get_expert_model_parallel_world_size(): + """Return world size for the expert model parallel group""" + if _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE is not None: + return _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE + if torch.distributed.is_available() and torch.distributed.is_initialized(): + tensor_and_expert_parallel_world_size = torch.distributed.get_world_size( + group=get_tensor_and_expert_parallel_group() + ) + return tensor_and_expert_parallel_world_size // get_tensor_model_parallel_world_size() + else: + return 0 + + +def get_tensor_and_expert_parallel_world_size(): + """Return world size for the expert model parallel group times model parallel group. + Currently, each expert will also be distributed across TP group by default. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + tensor_and_expert_parallel_world_size = torch.distributed.get_world_size( + group=get_tensor_and_expert_parallel_group() + ) + return tensor_and_expert_parallel_world_size + else: + return 0 + + +def get_expert_model_parallel_rank(): + """Return my rank for the expert parallel group""" + if _MPU_EXPERT_MODEL_PARALLEL_RANK is not None: + return _MPU_EXPERT_MODEL_PARALLEL_RANK + if torch.distributed.is_available() and torch.distributed.is_initialized(): + tensor_and_expert_parallel_rank = torch.distributed.get_rank( + group=get_tensor_and_expert_parallel_group() + ) + return tensor_and_expert_parallel_rank // get_tensor_model_parallel_world_size() + else: + return 0 + + +def get_data_modulo_expert_parallel_rank(with_context_parallel=False): + """Return my rank for the context parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_rank( + group=get_data_modulo_expert_parallel_group(with_context_parallel=with_context_parallel) + ) + else: + return 0 + + +def get_tensor_and_expert_parallel_rank(): + """Return my rank for the tensor and expert parallel group""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_rank(group=get_tensor_and_expert_parallel_group()) + else: + return 0 + + +def _set_global_memory_buffer(): + """Initialize global buffer""" + global _GLOBAL_MEMORY_BUFFER + assert _GLOBAL_MEMORY_BUFFER is None, 'global memory buffer is already initialized' + _GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer() + + +def get_global_memory_buffer(): + """Return the global GlobalMemoryBuffer object""" + assert _GLOBAL_MEMORY_BUFFER is not None, 'global memory buffer is not initialized' + return _GLOBAL_MEMORY_BUFFER + + +def destroy_global_memory_buffer(): + """Sets the global memory buffer to None""" + global _GLOBAL_MEMORY_BUFFER + _GLOBAL_MEMORY_BUFFER = None + + +def get_moe_layer_wise_logging_tracker(): + """Return the moe layer wise tracker.""" + global _MOE_LAYER_WISE_LOGGING_TRACKER + return _MOE_LAYER_WISE_LOGGING_TRACKER + + +def destroy_model_parallel(): + """Set the groups to none.""" + global _MODEL_PARALLEL_GROUP + _MODEL_PARALLEL_GROUP = None + global _MODEL_AND_EXPERT_PARALLEL_GROUP + _MODEL_AND_EXPERT_PARALLEL_GROUP = None + global _TENSOR_MODEL_PARALLEL_GROUP + _TENSOR_MODEL_PARALLEL_GROUP = None + global _PIPELINE_MODEL_PARALLEL_GROUP + _PIPELINE_MODEL_PARALLEL_GROUP = None + global _DATA_PARALLEL_GROUP + _DATA_PARALLEL_GROUP = None + global _DATA_PARALLEL_GROUP_WITH_CP + _DATA_PARALLEL_GROUP_WITH_CP = None + global _CONTEXT_PARALLEL_GROUP + _CONTEXT_PARALLEL_GROUP = None + global _CONTEXT_PARALLEL_GLOBAL_RANKS + _CONTEXT_PARALLEL_GLOBAL_RANKS = None + global _EMBEDDING_GROUP + _EMBEDDING_GROUP = None + global _POSITION_EMBEDDING_GROUP + _POSITION_EMBEDDING_GROUP = None + global _TENSOR_AND_DATA_PARALLEL_GROUP + _TENSOR_AND_DATA_PARALLEL_GROUP = None + global _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP + _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = None + global _TENSOR_AND_CONTEXT_PARALLEL_GROUP + _TENSOR_AND_CONTEXT_PARALLEL_GROUP = None + global _EXPERT_MODEL_PARALLEL_GROUP + _EXPERT_MODEL_PARALLEL_GROUP = None + global _TENSOR_AND_EXPERT_PARALLEL_GROUP + _TENSOR_AND_EXPERT_PARALLEL_GROUP = None + global _DATA_MODULO_EXPERT_PARALLEL_GROUP + _DATA_MODULO_EXPERT_PARALLEL_GROUP = None + global _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP = None + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK + _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None + global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None + global _MPU_TENSOR_MODEL_PARALLEL_RANK + _MPU_TENSOR_MODEL_PARALLEL_RANK = None + global _MPU_PIPELINE_MODEL_PARALLEL_RANK + _MPU_PIPELINE_MODEL_PARALLEL_RANK = None + global _GLOBAL_MEMORY_BUFFER + _GLOBAL_MEMORY_BUFFER = None + global _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE + _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE = None + global _MPU_EXPERT_MODEL_PARALLEL_RANK + _MPU_EXPERT_MODEL_PARALLEL_RANK = None + global _DATA_PARALLEL_GROUP_GLOO + _DATA_PARALLEL_GROUP_GLOO = None + global _DATA_PARALLEL_GROUP_WITH_CP_GLOO + _DATA_PARALLEL_GROUP_WITH_CP_GLOO = None + global _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO + _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO = None + global _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO = None diff --git a/megatron/core/pipeline_parallel/__init__.py b/megatron/core/pipeline_parallel/__init__.py new file mode 100644 index 0000000..00cd1ff --- /dev/null +++ b/megatron/core/pipeline_parallel/__init__.py @@ -0,0 +1 @@ +from .schedules import get_forward_backward_func diff --git a/megatron/core/pipeline_parallel/p2p_communication.py b/megatron/core/pipeline_parallel/p2p_communication.py new file mode 100644 index 0000000..a95ed63 --- /dev/null +++ b/megatron/core/pipeline_parallel/p2p_communication.py @@ -0,0 +1,596 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import operator +from functools import reduce +from typing import Callable, List, Optional, Tuple, Union + +import torch + +from megatron import core +from megatron.core import ModelParallelConfig +from megatron.core.parallel_state import ( + get_pipeline_model_parallel_group, + get_pipeline_model_parallel_next_rank, + get_pipeline_model_parallel_prev_rank, + get_pipeline_model_parallel_rank, + get_pipeline_model_parallel_world_size, +) + +# Types +Shape = Union[List[int], torch.Size] + + +def _communicate_shapes(tensor_send_next, tensor_send_prev, recv_prev, recv_next, config): + """Communicate tensor shapes between stages. Used to communicate + tensor shapes before the actual tensor communication happens. + This is required when the sequence lengths across micro batches + are not uniform. + + Args: + tensor_send_next: tensor to send to next rank (no tensor sent if + set to None). + tensor_send_prev: tensor to send to prev rank (no tensor sent if + set to None). + recv_prev: boolean for whether tensor should be received from + previous rank. + recv_next: boolean for whether tensor should be received from + next rank. + Returns: + (recv_prev_shape, recv_next_shape) + """ + + recv_prev_shape_tensor = None + recv_next_shape_tensor = None + send_prev_shape_tensor = None + send_next_shape_tensor = None + if recv_prev: + recv_prev_shape_tensor = torch.empty( + (3), device=torch.cuda.current_device(), dtype=torch.int64 + ) + if recv_next: + recv_next_shape_tensor = torch.empty( + (3), device=torch.cuda.current_device(), dtype=torch.int64 + ) + if tensor_send_prev is not None: + send_prev_shape_tensor = torch.tensor( + tensor_send_prev.size(), device=torch.cuda.current_device(), dtype=torch.int64 + ) + if tensor_send_next is not None: + send_next_shape_tensor = torch.tensor( + tensor_send_next.size(), device=torch.cuda.current_device(), dtype=torch.int64 + ) + + if config.use_ring_exchange_p2p: + torch.distributed.ring_exchange( + tensor_send_prev=send_prev_shape_tensor, + tensor_recv_prev=recv_prev_shape_tensor, + tensor_send_next=send_next_shape_tensor, + tensor_recv_next=recv_next_shape_tensor, + group=get_pipeline_model_parallel_group(), + ) + else: + ops = [] + if send_prev_shape_tensor is not None: + send_prev_op = torch.distributed.P2POp( + torch.distributed.isend, + send_prev_shape_tensor, + get_pipeline_model_parallel_prev_rank(), + ) + ops.append(send_prev_op) + if recv_prev_shape_tensor is not None: + recv_prev_op = torch.distributed.P2POp( + torch.distributed.irecv, + recv_prev_shape_tensor, + get_pipeline_model_parallel_prev_rank(), + ) + ops.append(recv_prev_op) + if send_next_shape_tensor is not None: + send_next_op = torch.distributed.P2POp( + torch.distributed.isend, + send_next_shape_tensor, + get_pipeline_model_parallel_next_rank(), + ) + ops.append(send_next_op) + if recv_next_shape_tensor is not None: + recv_next_op = torch.distributed.P2POp( + torch.distributed.irecv, + recv_next_shape_tensor, + get_pipeline_model_parallel_next_rank(), + ) + ops.append(recv_next_op) + if len(ops) > 0: + reqs = torch.distributed.batch_isend_irecv(ops) + for req in reqs: + req.wait() + + # To protect against race condition when using batch_isend_irecv(). + # should take this out once the bug with batch_isend_irecv is resolved. + torch.cuda.synchronize() + + recv_prev_shape = [0, 0, 0] + if recv_prev_shape_tensor is not None: + recv_prev_shape = recv_prev_shape_tensor.tolist() + + recv_next_shape = [0, 0, 0] + if recv_next_shape_tensor is not None: + recv_next_shape = recv_next_shape_tensor.tolist() + + return recv_prev_shape, recv_next_shape + + +def _batched_p2p_ops( + *, + tensor_send_prev: Optional[torch.Tensor], + tensor_recv_prev: Optional[torch.Tensor], + tensor_send_next: Optional[torch.Tensor], + tensor_recv_next: Optional[torch.Tensor], + group: torch.distributed.ProcessGroup +): + ops = [] + if tensor_send_prev is not None: + send_prev_op = torch.distributed.P2POp( + torch.distributed.isend, + tensor_send_prev, + get_pipeline_model_parallel_prev_rank(), + group, + ) + ops.append(send_prev_op) + if tensor_recv_prev is not None: + recv_prev_op = torch.distributed.P2POp( + torch.distributed.irecv, + tensor_recv_prev, + get_pipeline_model_parallel_prev_rank(), + group, + ) + ops.append(recv_prev_op) + if tensor_send_next is not None: + send_next_op = torch.distributed.P2POp( + torch.distributed.isend, + tensor_send_next, + get_pipeline_model_parallel_next_rank(), + group, + ) + ops.append(send_next_op) + if tensor_recv_next is not None: + recv_next_op = torch.distributed.P2POp( + torch.distributed.irecv, + tensor_recv_next, + get_pipeline_model_parallel_next_rank(), + group, + ) + ops.append(recv_next_op) + if len(ops) > 0: + reqs = torch.distributed.batch_isend_irecv(ops) + else: + reqs = [] + return reqs + + +def _p2p_ops( + *, + tensor_send_prev: Optional[torch.Tensor], + tensor_recv_prev: Optional[torch.Tensor], + tensor_send_next: Optional[torch.Tensor], + tensor_recv_next: Optional[torch.Tensor], + group: torch.distributed.ProcessGroup +): + reqs = [] + rank = get_pipeline_model_parallel_rank() + even_send_odd_recv_group = group + if get_pipeline_model_parallel_world_size() == 2: + # Use the global process group for one of the two p2p communications + # to allow the overlap of the independent communications. + # Using the global process group is compatible because the pipeline-parallel + # communications set the source and destination by global rank. + even_recv_odd_send_group = torch.distributed.group.WORLD + else: + even_recv_odd_send_group = group + if get_pipeline_model_parallel_rank() % 2 == 0: + if tensor_send_next is not None: + send_next_req = torch.distributed.isend( + tensor=tensor_send_next, + dst=get_pipeline_model_parallel_next_rank(), + group=even_send_odd_recv_group, + ) + reqs.append(send_next_req) + + if tensor_recv_prev is not None: + recv_prev_req = torch.distributed.irecv( + tensor=tensor_recv_prev, + src=get_pipeline_model_parallel_prev_rank(), + group=even_recv_odd_send_group, + ) + reqs.append(recv_prev_req) + + if tensor_send_prev is not None: + send_prev_req = torch.distributed.isend( + tensor=tensor_send_prev, + dst=get_pipeline_model_parallel_prev_rank(), + group=even_send_odd_recv_group, + ) + reqs.append(send_prev_req) + + if tensor_recv_next is not None: + recv_next_req = torch.distributed.irecv( + tensor=tensor_recv_next, + src=get_pipeline_model_parallel_next_rank(), + group=even_recv_odd_send_group, + ) + reqs.append(recv_next_req) + + else: + if tensor_recv_prev is not None: + recv_prev_req = torch.distributed.irecv( + tensor=tensor_recv_prev, + src=get_pipeline_model_parallel_prev_rank(), + group=even_send_odd_recv_group, + ) + reqs.append(recv_prev_req) + + if tensor_send_next is not None: + send_next_req = torch.distributed.isend( + tensor=tensor_send_next, + dst=get_pipeline_model_parallel_next_rank(), + group=even_recv_odd_send_group, + ) + reqs.append(send_next_req) + + if tensor_recv_next is not None: + recv_next_req = torch.distributed.irecv( + tensor=tensor_recv_next, + src=get_pipeline_model_parallel_next_rank(), + group=even_send_odd_recv_group, + ) + reqs.append(recv_next_req) + + if tensor_send_prev is not None: + send_prev_req = torch.distributed.isend( + tensor=tensor_send_prev, + dst=get_pipeline_model_parallel_prev_rank(), + group=even_recv_odd_send_group, + ) + reqs.append(send_prev_req) + return reqs + + +def _communicate( + *, + tensor_send_next: Optional[torch.Tensor], + tensor_send_prev: Optional[torch.Tensor], + recv_prev: bool, + recv_next: bool, + tensor_shape: Shape, + config: ModelParallelConfig, + wait_on_reqs: bool = True +) -> Tuple[torch.Tensor, torch.Tensor]: + """Communicate tensors between stages. Used as helper method in other + communication methods that are used in megatron/schedules.py. + + Args: + tensor_send_next (torch.Tensor, optional): + Tensor to send to next rank (no tensor sent if None) + + tensor_send_prev (torch.Tensor, optional): + Tensor to send to prev rank (no tensor sent if None) + + recv_prev (boolean, required): + whether tensor should be received from previous rank. + + recv_next (boolean, required): + whether tensor should be received from next rank. + + tensor_shape (List[int] or torch.Size, required): + shape of tensor to receive (this method assumes that all + tensors sent and received in a single function call are + the same shape). + + wait_on_reqs (boolean, optional, default=False): + For non-batched p2p communication, wait on each request + before returning. + + Returns: + tuple containing + + - tensor_recv_prev: torch.Tensor if recv_prev is True, None otherwise. + - tensor_recv_next: torch.Tensor if recv_next is True, None otherwise. + + """ + + # Create placeholder tensors for receive in forward and backward directions + # if needed. + tensor_recv_prev = None + tensor_recv_next = None + + if not config.variable_seq_lengths: + recv_prev_shape = tensor_shape + recv_next_shape = tensor_shape + else: + recv_prev_shape, recv_next_shape = _communicate_shapes( + tensor_send_next, tensor_send_prev, recv_prev, recv_next, config + ) + + if recv_prev: + if config.pipeline_dtype is None: + raise RuntimeError("pipeline_dtype must be provided if recv_prev is True") + if tensor_shape is None: + raise RuntimeError( + "tensor_shape must be specified if recv_prev is True. " + "Common tensor_shape is (seq_length, micro_batch_size, hidden_size)" + ) + tensor_recv_prev = torch.empty( + recv_prev_shape, + requires_grad=True, + device=torch.cuda.current_device(), + dtype=config.pipeline_dtype, + ) + if recv_next: + if config.pipeline_dtype is None: + raise RuntimeError("dtype must be provided if recv_next is True") + if tensor_shape is None: + raise RuntimeError( + "tensor_shape must be specified if recv_next is True. " + "Common tensor_shape is (seq_length, micro_batch_size, hidden_size)" + ) + tensor_recv_next = torch.empty( + recv_next_shape, + requires_grad=True, + device=torch.cuda.current_device(), + dtype=config.pipeline_dtype, + ) + + # Send tensors in both the forward and backward directions as appropriate. + if config.use_ring_exchange_p2p: + + def _ring_exchange_wrapper(**kwargs): + torch.distributed.ring_exchange(**kwargs) + return [] + + p2p_func = _ring_exchange_wrapper + elif config.batch_p2p_comm: + assert wait_on_reqs + p2p_func = _batched_p2p_ops + else: + p2p_func = _p2p_ops + + reqs = p2p_func( + tensor_send_prev=tensor_send_prev, + tensor_recv_prev=tensor_recv_prev, + tensor_send_next=tensor_send_next, + tensor_recv_next=tensor_recv_next, + group=get_pipeline_model_parallel_group(), + ) + + if wait_on_reqs and len(reqs) > 0: + for req in reqs: + req.wait() + reqs = None + + if config.batch_p2p_comm and config.batch_p2p_sync: + # To protect against race condition when using batch_isend_irecv(). + # User should assert that we have a modern enough PyTorch to not need this + torch.cuda.synchronize() + + return tensor_recv_prev, tensor_recv_next, reqs + + +def recv_forward(tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: + """ Receive tensor from previous rank in pipeline (forward receive). + + See _communicate for argument details. + """ + + if core.parallel_state.is_pipeline_first_stage(): + input_tensor = None + else: + if config.timers is not None: + config.timers('forward-recv', log_level=2).start() + input_tensor, _, _ = _communicate( + tensor_send_next=None, + tensor_send_prev=None, + recv_prev=True, + recv_next=False, + tensor_shape=tensor_shape, + config=config, + ) + if config.timers is not None: + config.timers('forward-recv').stop() + return input_tensor + + +def recv_backward(tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: + """Receive tensor from next rank in pipeline (backward receive). + + See _communicate for argument details. + """ + if core.parallel_state.is_pipeline_last_stage(): + output_tensor_grad = None + else: + if config.timers is not None: + config.timers('backward-recv', log_level=2).start() + _, output_tensor_grad, _ = _communicate( + tensor_send_next=None, + tensor_send_prev=None, + recv_prev=False, + recv_next=True, + tensor_shape=tensor_shape, + config=config, + ) + if config.timers is not None: + config.timers('backward-recv').stop() + return output_tensor_grad + + +def send_forward(output_tensor: torch.Tensor, config: ModelParallelConfig) -> None: + """Send tensor to next rank in pipeline (forward send). + + See _communicate for argument details. + """ + + if not core.parallel_state.is_pipeline_last_stage(): + if config.timers is not None: + config.timers('forward-send', log_level=2).start() + _communicate( + tensor_send_next=output_tensor, + tensor_send_prev=None, + recv_prev=False, + recv_next=False, + tensor_shape=None, + config=config, + ) + if config.timers is not None: + config.timers('forward-send').stop() + + +def send_backward(input_tensor_grad: torch.Tensor, config: ModelParallelConfig) -> None: + """Send tensor to previous rank in pipeline (backward send). + + See _communicate for argument details. + """ + if not core.parallel_state.is_pipeline_first_stage(): + if config.timers is not None: + config.timers('backward-send', log_level=2).start() + _communicate( + tensor_send_next=None, + tensor_send_prev=input_tensor_grad, + recv_prev=False, + recv_next=False, + tensor_shape=None, + config=config, + ) + if config.timers is not None: + config.timers('backward-send').stop() + + +def send_forward_recv_backward( + output_tensor: torch.Tensor, tensor_shape: Shape, config: ModelParallelConfig +) -> torch.Tensor: + """Batched send and recv with next rank in pipeline. + + See _communicate for argument details. + """ + if core.parallel_state.is_pipeline_last_stage(): + output_tensor_grad = None + else: + if config.timers is not None: + config.timers('forward-send-backward-recv', log_level=2).start() + _, output_tensor_grad, _ = _communicate( + tensor_send_next=output_tensor, + tensor_send_prev=None, + recv_prev=False, + recv_next=True, + tensor_shape=tensor_shape, + config=config, + ) + if config.timers is not None: + config.timers('forward-send-backward-recv').stop() + return output_tensor_grad + + +def send_backward_recv_forward( + input_tensor_grad: torch.Tensor, tensor_shape: Shape, config: ModelParallelConfig +) -> torch.Tensor: + """Batched send and recv with previous rank in pipeline. + + See _communicate for argument details. + """ + if core.parallel_state.is_pipeline_first_stage(): + input_tensor = None + else: + if config.timers is not None: + config.timers('backward-send-forward-recv', log_level=2).start() + input_tensor, _, _ = _communicate( + tensor_send_next=None, + tensor_send_prev=input_tensor_grad, + recv_prev=True, + recv_next=False, + tensor_shape=tensor_shape, + config=config, + ) + if config.timers is not None: + config.timers('backward-send-forward-recv').stop() + return input_tensor + + +def send_forward_recv_forward( + output_tensor: torch.Tensor, + recv_prev: bool, + tensor_shape: Shape, + config: ModelParallelConfig, + overlap_p2p_comm: bool = False, +) -> torch.Tensor: + """Batched recv from previous rank and send to next rank in pipeline. + + See _communicate for argument details. + """ + if config.timers is not None: + config.timers('forward-send-forward-recv', log_level=2).start() + input_tensor, _, wait_handles = _communicate( + tensor_send_next=output_tensor, + tensor_send_prev=None, + recv_prev=recv_prev, + recv_next=False, + tensor_shape=tensor_shape, + wait_on_reqs=(not overlap_p2p_comm), + config=config, + ) + if config.timers is not None: + config.timers('forward-send-forward-recv').stop() + if overlap_p2p_comm: + return input_tensor, wait_handles + return input_tensor + + +def send_backward_recv_backward( + input_tensor_grad: torch.Tensor, + recv_next: bool, + tensor_shape: Shape, + config: ModelParallelConfig, + overlap_p2p_comm: bool = False, +) -> torch.Tensor: + """Batched recv from next rank and send to previous rank in pipeline. + + See _communicate for argument details. + """ + if config.timers is not None: + config.timers('backward-send-backward-recv', log_level=2).start() + _, output_tensor_grad, wait_handles = _communicate( + tensor_send_next=None, + tensor_send_prev=input_tensor_grad, + recv_prev=False, + recv_next=recv_next, + tensor_shape=tensor_shape, + wait_on_reqs=(not overlap_p2p_comm), + config=config, + ) + if config.timers is not None: + config.timers('backward-send-backward-recv').stop() + if overlap_p2p_comm: + return output_tensor_grad, wait_handles + return output_tensor_grad + + +def send_forward_backward_recv_forward_backward( + output_tensor: torch.Tensor, + input_tensor_grad: torch.Tensor, + recv_prev: bool, + recv_next: bool, + tensor_shape: Shape, + config: ModelParallelConfig, +) -> torch.Tensor: + """Batched send and recv with previous and next ranks in pipeline. + + See _communicate for argument details. + """ + if config.timers is not None: + config.timers('forward-backward-send-forward-backward-recv', log_level=2).start() + input_tensor, output_tensor_grad, _ = _communicate( + tensor_send_next=output_tensor, + tensor_send_prev=input_tensor_grad, + recv_prev=recv_prev, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + ) + if config.timers is not None: + config.timers('forward-backward-send-forward-backward-recv').stop() + return input_tensor, output_tensor_grad diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py new file mode 100644 index 0000000..632bdd6 --- /dev/null +++ b/megatron/core/pipeline_parallel/schedules.py @@ -0,0 +1,1524 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import contextlib +from typing import Callable, Iterator, List, Optional, Union + +import torch +from torch.autograd.variable import Variable + +from megatron.core import parallel_state +from megatron.core.enums import ModelType +from megatron.core.pipeline_parallel import p2p_communication +from megatron.core.transformer.moe.router import MoEAuxLossAutoScaler +from megatron.core.utils import ( + drain_embedding_wgrad_compute, + get_attr_wrapped_model, + get_model_config, + get_model_type, +) + +# Types +Shape = Union[List[int], torch.Size] + + +def get_forward_backward_func(): + """Retrieves the appropriate forward_backward function given the + configuration of parallel_state. + + Returns a function that will perform all of the forward and + backward passes of the model given the pipeline model parallel + world size and virtual pipeline model parallel world size in the + global parallel_state. + + Note that if using sequence parallelism, the sequence length component of + the tensor shape is updated to original_sequence_length / + tensor_model_parallel_world_size. + + The function returned takes the following arguments: + + forward_step_func (required): A function that takes a data + iterator and a model as its arguments and return the model's + forward output and the loss function. The loss function should + take one torch.Tensor and return a torch.Tensor of loss and a + dictionary of string -> torch.Tensor. + + A third argument, checkpoint_activations_microbatch, indicates + that the activations for this microbatch should be + checkpointed. A None value for this argument indicates that + the default from the configuration should be used. This is + used when the + num_microbatches_with_partial_activation_checkpoints is used. + + For example: + + def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + def forward_step(data_iterator, model): + data, loss_mask = next(data_iterator) + output = model(data) + return output, partial(loss_func, loss_mask) + + + forward_backward_func(forward_step_func=forward_step, ...) + + + data_iterator (required): an iterator over the data, will be + passed as is to forward_step_func. Expected to be a list of + iterators in the case of interleaved pipeline parallelism. + + model (required): the actual model. Expected to be a list of modules in the case of interleaved + pipeline parallelism. Must be a (potentially wrapped) megatron.core.models.MegatronModule. + + num_microbatches (int, required): + The number of microbatches to go through + + seq_length (int, required): Sequence length of the current global batch. If this is a dual-stack + transformer, this is the encoder's sequence length. This is ignored if variable_seq_lengths + in the config is True. Otherwise, each microbatch in the current global batch size must use + this sequence length. + + micro_batch_size (int, required): The number of sequences in a microbatch. + + decoder_seq_length (int, optional): The sequence length for the decoder in a dual-stack + transformer. This is ignored for a single-stack transformer. + + forward_only (optional, default = False): Perform only the forward step + + collect_non_loss_data (optional, bool, default=False): TODO + + first_val_step (bool, optional): Is the first step of the validation phase. Used by + Transformer Engine modules to only update their fp8 weights only on the first validation step. + + """ + pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size() + if pipeline_model_parallel_size > 1: + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + forward_backward_func = forward_backward_pipelining_with_interleaving + else: + forward_backward_func = forward_backward_pipelining_without_interleaving + else: + forward_backward_func = forward_backward_no_pipelining + return forward_backward_func + + +def deallocate_output_tensor(out, deallocate_pipeline_outputs=False): + '''Pseudo-deallocate (i.e., set to scalar) the output tensor's '.data' field. + + This method should be called right after the output tensor has been + sent to the next pipeline stage. At this point, the output tensor is + only useful for its '.grad_fn' field, and not its '.data'. + ''' + if (out is None) or (not deallocate_pipeline_outputs): + return + assert isinstance(out, torch.Tensor), "expected Tensor, found %s." % type(out).__name__ + assert out._base is None, "counter-productive to free a view of another tensor." + out.data = torch.empty( + (1,), + device=out.device, + dtype=out.dtype, + ) + + +def custom_backward(output, grad_output): + '''Directly call C++ autograd engine. + + To make the 'deallocate_output_tensor' (above) optimization work, the C++ + autograd engine must be called directly, bypassing Pytorch's + torch.autograd.backward. Pytorch's 'backward' checks that the output and + grad have the same shape, while C++'s 'backward' does not. + ''' + + assert output.numel() == 1, "output should be pseudo-'freed' in schedule, to optimize memory" + assert isinstance(output, torch.Tensor), "output == '%s'." % type(output).__name__ + assert isinstance(grad_output, (torch.Tensor, type(None))), ( + "grad_output == '%s'." % type(grad_output).__name__ + ) + + # Handle scalar output + if grad_output is None: + assert output.numel() == 1, "implicit grad requires scalar output." + grad_output = torch.ones_like( + output, + memory_format=torch.preserve_format, + ) + + # Call c++ engine [ see torch/csrc/autograd/python_engine.cpp ] + Variable._execution_engine.run_backward( + tensors=(output,), + grad_tensors=(grad_output,), + keep_graph=False, + create_graph=False, + inputs=tuple(), + allow_unreachable=True, + accumulate_grad=True, + ) + + +def set_current_microbatch(model, microbatch_id): + decoder_exists = True + decoder = None + try: + decoder = get_attr_wrapped_model(model, "decoder") + except RuntimeError: + decoder_exists = False + if decoder_exists and decoder is not None: + decoder.current_microbatch = microbatch_id + + +def forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data=False, + checkpoint_activations_microbatch=None, + is_first_microbatch=False, + current_microbatch=None, +): + """Forward step for passed-in model. + + If it is the first stage, the input tensor is obtained from the data_iterator. + Otherwise, the passed-in input_tensor is used. + + Args: + forward_step_func (callable): The forward step function for the model that takes the + data iterator as the first argument, and model as the second. + This user's forward step is expected to output a tuple of two elements: + 1. The output object from the forward step. This output object needs to be a + tensor or some kind of collection of tensors. The only hard requirement + for this object is that it needs to be acceptible as input into the second + function. + 2. A function to reduce (optionally) the output from the forward step. This + could be a reduction over the loss from the model, it could be a function that + grabs the output from the model and reformats, it could be a function that just + passes through the model output. This function must have one of the following + patterns, and depending on the pattern different things happen internally. + a. A tuple of reduced loss and some other data. Note that in this case + the first argument is divided by the number of global microbatches, + assuming it is a loss, so that the loss is stable as a function of + the number of devices the step is split across. + b. A triple of reduced loss, number of tokens, and some other data. This + is similar to case (a), but the loss is further averaged across the + number of tokens in the batch. If the user is not already averaging + across the number of tokens, this pattern is useful to use. + c. Any arbitrary data the user wants (eg a dictionary of tensors, a list + of tensors, etc in the case of inference). To trigger case 3 you need + to specify `collect_non_loss_data=True` and you may also want to + specify `forward_only=True` in the call to the parent forward_backward + function. + data_iterator (iterator): The data iterator. + model (nn.Module): The model to perform the forward step on. + num_microbatches (int): The number of microbatches. + input_tensor (Tensor or list[Tensor]): The input tensor(s) for the forward step. + forward_data_store (list): The list to store the forward data. If you go down path 2.a or + 2.b for the return of your forward reduction function then this will store only the + final dimension of the output, for example the metadata output by the loss function. + If you go down the path of 2.c then this will store the entire output of the forward + reduction function applied to the model output. + config (object): The configuration object. + collect_non_loss_data (bool, optional): Whether to collect non-loss data. Defaults to False. + This is the path to use if you want to collect arbitrary output from the model forward, + such as with inference use cases. Defaults to False. + checkpoint_activations_microbatch (int, optional): The microbatch to checkpoint activations. + Defaults to None. + is_first_microbatch (bool, optional): Whether it is the first microbatch. Defaults to False. + current_microbatch (int, optional): The current microbatch. Defaults to None. + + Returns: + Tensor or list[Tensor]: The output object(s) from the forward step. + Tensor: The number of tokens. + """ + if config.timers is not None: + config.timers('forward-compute', log_level=2).start() + + if is_first_microbatch and hasattr(model, 'set_is_first_microbatch'): + model.set_is_first_microbatch() + if current_microbatch is not None: + set_current_microbatch(model, current_microbatch) + + unwrap_output_tensor = False + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + unwrap_output_tensor = True + + set_input_tensor = get_attr_wrapped_model(model, "set_input_tensor") + set_input_tensor(input_tensor) + + if config.enable_autocast: + context_manager = torch.autocast("cuda", dtype=config.autocast_dtype) + else: + context_manager = contextlib.nullcontext() + with context_manager: + if checkpoint_activations_microbatch is None: + output_tensor, loss_func = forward_step_func(data_iterator, model) + else: + output_tensor, loss_func = forward_step_func( + data_iterator, model, checkpoint_activations_microbatch + ) + + num_tokens = torch.tensor(0, dtype=torch.int) + if parallel_state.is_pipeline_last_stage(): + if not collect_non_loss_data: + outputs = loss_func(output_tensor) + if len(outputs) == 3: + output_tensor, num_tokens, loss_reduced = outputs + if not config.calculate_per_token_loss: + output_tensor /= num_tokens + output_tensor /= num_microbatches + else: + # preserve legacy loss averaging behavior (ie, over the number of microbatches) + assert len(outputs) == 2 + output_tensor, loss_reduced = outputs + output_tensor /= num_microbatches + forward_data_store.append(loss_reduced) + else: + data = loss_func(output_tensor, non_loss_data=True) + forward_data_store.append(data) + + if config.timers is not None: + config.timers('forward-compute').stop() + + # Set the loss scale for the auxiliary loss of the MoE layer. + # Since we use a trick to do backward on the auxiliary loss, we need to set the scale explicitly. + if hasattr(config, 'num_moe_experts') and config.num_moe_experts is not None: + # Calculate the loss scale based on the grad_scale_func if available, else default to 1. + loss_scale = ( + config.grad_scale_func(torch.ones(1, device=output_tensor.device)) + if config.grad_scale_func is not None + else torch.tensor(1.0) + ) + # Set the loss scale + MoEAuxLossAutoScaler.set_loss_scale(loss_scale / num_microbatches) + + # If T5 model (or other model with encoder and decoder) + # and in decoder stack, then send encoder_hidden_state + # downstream as well. + model_type = get_model_type(model) + if ( + parallel_state.is_pipeline_stage_after_split() + and model_type == ModelType.encoder_and_decoder + ): + return [output_tensor, input_tensor[-1]], num_tokens + + if unwrap_output_tensor: + return output_tensor, num_tokens + return [output_tensor], num_tokens + + +def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config): + """Backward step through passed-in output tensor. + + If last stage, output_tensor_grad is None, otherwise gradient of loss + with respect to stage's output tensor. + + Returns gradient of loss with respect to input tensor (None if first + stage).""" + + # NOTE: This code currently can handle at most one skip connection. It + # needs to be modified slightly to support arbitrary numbers of skip + # connections. + + if config.timers is not None: + config.timers('backward-compute', log_level=2).start() + + # Retain the grad on the input_tensor. + unwrap_input_tensor_grad = False + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + unwrap_input_tensor_grad = True + for x in input_tensor: + if x is not None: + x.retain_grad() + + if not isinstance(output_tensor, list): + output_tensor = [output_tensor] + if not isinstance(output_tensor_grad, list): + output_tensor_grad = [output_tensor_grad] + + # Backward pass. + if output_tensor_grad[0] is None and config.grad_scale_func is not None: + output_tensor[0] = config.grad_scale_func(output_tensor[0]) + + if config.deallocate_pipeline_outputs: + custom_backward(output_tensor[0], output_tensor_grad[0]) + else: + torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0]) + + # Collect the grad of the input_tensor. + input_tensor_grad = [None] + if input_tensor is not None: + input_tensor_grad = [] + for x in input_tensor: + if x is None: + input_tensor_grad.append(None) + else: + input_tensor_grad.append(x.grad) + + # Handle single skip connection if it exists (encoder_hidden_state in + # model with encoder and decoder). + if ( + parallel_state.get_pipeline_model_parallel_world_size() > 1 + and parallel_state.is_pipeline_stage_after_split() + and model_type == ModelType.encoder_and_decoder + ): + if output_tensor_grad[1] is not None: + input_tensor_grad[-1].add_(output_tensor_grad[1]) + if unwrap_input_tensor_grad: + input_tensor_grad = input_tensor_grad[0] + + if config.timers is not None: + config.timers('backward-compute').stop() + + return input_tensor_grad + + +def check_first_val_step(first_val_step, forward_only, cond): + if (first_val_step is not None) and forward_only: + return first_val_step and cond + else: + return cond + + +def forward_backward_no_pipelining( + *, + forward_step_func, + data_iterator: Union[Iterator, List[Iterator]], + model: Union[torch.nn.Module, List[torch.nn.Module]], + num_microbatches: int, + seq_length: int, # unused + micro_batch_size: int, # unused + decoder_seq_length: int = None, # unused + forward_only: bool = False, + collect_non_loss_data: bool = False, + first_val_step: bool = None, +): + """Run forward and backward passes with no pipeline parallelism + (no inter-stage communication). + + Returns dictionary with losses. + + + See get_forward_backward_func() for argument details + """ + + if isinstance(model, list): + assert len(model) == 1, "non-pipeline-parallel schedule does not support model chunking" + model = model[0] + if isinstance(data_iterator, list): + assert ( + len(data_iterator) == 1 + ), "non-pipeline-parallel schedule does not support model chunking" + data_iterator = data_iterator[0] + + config = get_model_config(model) + if config.timers is not None: + config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) + + no_sync_func = config.no_sync_func + if no_sync_func is None: + no_sync_func = contextlib.nullcontext + + model_type = get_model_type(model) + + forward_data_store = [] + input_tensor, output_tensor_grad = None, None + total_num_tokens = torch.zeros([], dtype=torch.int, device="cuda") + with no_sync_func(): + for i in range(num_microbatches - 1): + output_tensor, num_tokens = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + is_first_microbatch=check_first_val_step(first_val_step, forward_only, i == 0), + current_microbatch=i, + ) + total_num_tokens += num_tokens.item() + if not forward_only: + backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config) + + # Run computation for last microbatch out of context handler (want to + # synchronize gradients). + output_tensor, num_tokens = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + is_first_microbatch=check_first_val_step( + first_val_step, forward_only, num_microbatches == 1 + ), + current_microbatch=num_microbatches - 1, + ) + total_num_tokens += num_tokens.item() + + if not forward_only: + backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config) + + if config.finalize_model_grads_func is not None and not forward_only: + # Finalize model grads (perform full grad all-reduce / reduce-scatter for + # data parallelism and layernorm all-reduce for sequence parallelism). + config.finalize_model_grads_func( + [model], total_num_tokens if config.calculate_per_token_loss else None + ) + + if config.timers is not None: + config.timers('forward-backward').stop() + + return forward_data_store + + +def clear_embedding_activation_buffer(config, model): + + if ( + parallel_state.is_pipeline_last_stage(ignore_virtual=True) + and config.defer_embedding_wgrad_compute + ): + if isinstance(model, list): + embedding_module = get_attr_wrapped_model( + model[-1], 'post_process', return_model_obj=True + ) + else: + embedding_module = get_attr_wrapped_model(model, 'post_process', return_model_obj=True) + + # Need to ensure no stray activations exists in this buffer + embedding_module.embedding_activation_buffer.clear() + + return embedding_module + else: + return None + + +def finish_embedding_wgrad_compute(config, embedding_module): + if ( + parallel_state.is_pipeline_last_stage(ignore_virtual=True) + and config.defer_embedding_wgrad_compute + ): + embedding_activation_buffer = embedding_module.embedding_activation_buffer + grad_output_buffer = embedding_module.grad_output_buffer + weight = ( + embedding_module.output_layer.weight + if embedding_module.share_embeddings_and_output_weights + else embedding_module.shared_embedding_or_output_weight() + ) + + drain_embedding_wgrad_compute( + config, embedding_activation_buffer, grad_output_buffer, weight + ) + + +def forward_backward_pipelining_with_interleaving( + *, + forward_step_func, + data_iterator: Union[Iterator, List[Iterator]], + model: Union[torch.nn.Module, List[torch.nn.Module]], + num_microbatches: int, + seq_length: int, + micro_batch_size: int, + decoder_seq_length: int = None, + forward_only: bool = False, + collect_non_loss_data: bool = False, + first_val_step: bool = None, +): + """Run interleaved 1F1B schedule (model split into model chunks), with + communication between pipeline stages as needed. + + Returns dictionary with losses if the last stage, empty dict otherwise.""" + assert isinstance(model, list), "interleaved pipeline parallelism expected model chunking" + assert all(isinstance(chunk, torch.nn.Module) for chunk in model), "invalid model chunking" + assert isinstance( + data_iterator, list + ), "interleaved pipeline parallelism expected each model chunk to have a data iterator" + + config = get_model_config(model[0]) + if config.overlap_p2p_comm and config.batch_p2p_comm: + raise ValueError("Can not use both overlap_p2p_comm and batch_p2p_comm") + + # Needed only when gradients are finalized in M-Core + if config.finalize_model_grads_func is not None and not forward_only: + embedding_module = clear_embedding_activation_buffer(config, model) + + if config.timers is not None: + config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) + + # Disable async grad reductions + no_sync_func = config.no_sync_func + if isinstance(no_sync_func, list): + + def multi_no_sync(): + stack = contextlib.ExitStack() + for model_chunk_no_sync_func in config.no_sync_func: + stack.enter_context(model_chunk_no_sync_func()) + return stack + + no_sync_func = multi_no_sync + if no_sync_func is None: + no_sync_func = contextlib.nullcontext + no_sync_context = None + + if config.grad_sync_func is not None and not isinstance(config.grad_sync_func, list): + config.grad_sync_func = [config.grad_sync_func for _ in model] + + if config.param_sync_func is not None and not isinstance(config.param_sync_func, list): + config.param_sync_func = [config.param_sync_func for _ in model] + + def disable_grad_sync(): + """Disable asynchronous grad reductions""" + nonlocal no_sync_context + if no_sync_context is None: + no_sync_context = no_sync_func() + no_sync_context.__enter__() + + def enable_grad_sync(): + """Enable asynchronous grad reductions""" + nonlocal no_sync_context + if no_sync_context is not None: + no_sync_context.__exit__(None, None, None) + no_sync_context = None + + disable_grad_sync() + + # Model chunk IDs with synchronized grads + synchronized_model_chunks = set() + + input_tensors = [[] for _ in range(len(model))] + output_tensors = [[] for _ in range(len(model))] + total_num_tokens = torch.tensor(0, dtype=torch.int).cuda() + + forward_data_store = [] + if not forward_only: + output_tensor_grads = [[] for _ in range(len(model))] + + pipeline_parallel_size = parallel_state.get_pipeline_model_parallel_world_size() + pipeline_parallel_rank = parallel_state.get_pipeline_model_parallel_rank() + + if num_microbatches % pipeline_parallel_size != 0: + msg = f'number of microbatches ({num_microbatches}) is not divisible by ' + msg += f'pipeline-model-parallel-size ({pipeline_parallel_size}) ' + msg += 'when using interleaved schedule' + raise RuntimeError(msg) + + model_type = get_model_type(model[0]) + if model_type == ModelType.encoder_and_decoder: + raise RuntimeError("Interleaving is not supported with an encoder and decoder model.") + + if decoder_seq_length is not None and decoder_seq_length != seq_length: + raise RuntimeError( + "Interleaving is not supported with a different decoder sequence length." + ) + + tensor_shape = [seq_length, micro_batch_size, config.hidden_size] + tensor_shape[0] = tensor_shape[0] // parallel_state.get_context_parallel_world_size() + if config.sequence_parallel: + tensor_shape[0] = tensor_shape[0] // parallel_state.get_tensor_model_parallel_world_size() + + # Compute number of warmup and remaining microbatches. + num_model_chunks = len(model) + total_num_microbatches = num_microbatches * num_model_chunks + all_warmup_microbatches = False + if forward_only: + num_warmup_microbatches = total_num_microbatches + else: + # Run all forward passes and then all backward passes if number of + # microbatches is just the number of pipeline stages. + # Otherwise, perform (num_model_chunks-1)*pipeline_parallel_size on + # all workers, followed by more microbatches after depending on + # stage ID (more forward passes for earlier stages, later stages can + # immediately start with 1F1B). + if num_microbatches == pipeline_parallel_size: + num_warmup_microbatches = total_num_microbatches + all_warmup_microbatches = True + else: + num_warmup_microbatches = (pipeline_parallel_size - pipeline_parallel_rank - 1) * 2 + num_warmup_microbatches += (num_model_chunks - 1) * pipeline_parallel_size + num_warmup_microbatches = min(num_warmup_microbatches, total_num_microbatches) + num_microbatches_remaining = total_num_microbatches - num_warmup_microbatches + + # Checkpoint the activations of partial Transformer layers in a number of micro-batches + # within the maximum outstanding micro-batch backpropagations. + # Micro-batches with the ids less than 'num_microbatches_with_partial_activation_checkpoints' + # checkpoint partial Transformer layers (or skip checkpointing) and + # the rest of micro-batches within a window of micro-batches checkpoint + # all Transformer layers. The window of micro-batches is set by the maximum + # outstanding backpropagations and becomes smaller at later pipeline stages. + # Please refer the appendix C in https://arxiv.org/pdf/2205.05198.pdf + max_outstanding_backprops = None + if config.num_microbatches_with_partial_activation_checkpoints is not None: + max_outstanding_backprops = num_warmup_microbatches + 1 + + # Synchronize params for first two model chunks + if config.param_sync_func is not None: + config.param_sync_func[0](model[0].parameters()) + config.param_sync_func[1](model[1].parameters()) + + def get_model_chunk_id(microbatch_id, forward): + """Helper method to get the model chunk ID given the iteration number.""" + microbatch_id_in_group = microbatch_id % (pipeline_parallel_size * num_model_chunks) + model_chunk_id = microbatch_id_in_group // pipeline_parallel_size + if not forward: + model_chunk_id = num_model_chunks - model_chunk_id - 1 + return model_chunk_id + + def get_microbatch_id_in_model_chunk(iteration_id, forward): + """Helper method to get the microbatch_id within model chunk given the iteration number.""" + assert forward + iteration_group_id = iteration_id // (pipeline_parallel_size * num_model_chunks) + microbatch_id_in_model_chunk = (iteration_group_id * pipeline_parallel_size) + ( + iteration_id % pipeline_parallel_size + ) + return microbatch_id_in_model_chunk + + def is_first_microbatch_for_model_chunk(microbatch_id: int) -> bool: + """Check if an iteration is the first for a model chunk.""" + microbatch_group_size = pipeline_parallel_size * num_model_chunks + num_microbatch_groups = total_num_microbatches // microbatch_group_size + microbatch_group_id = microbatch_id // microbatch_group_size + microbatch_id_in_group = microbatch_id % microbatch_group_size + if microbatch_group_id == 0: + return microbatch_id_in_group % pipeline_parallel_size == 0 + else: + return False + + def is_last_microbatch_for_model_chunk(microbatch_id: int) -> bool: + """Check if an iteration is the last for a model chunk.""" + microbatch_group_size = pipeline_parallel_size * num_model_chunks + num_microbatch_groups = total_num_microbatches // microbatch_group_size + microbatch_group_id = microbatch_id // microbatch_group_size + microbatch_id_in_group = microbatch_id % microbatch_group_size + if microbatch_group_id == num_microbatch_groups - 1: + return microbatch_id_in_group % pipeline_parallel_size == pipeline_parallel_size - 1 + else: + return False + + def forward_step_helper(microbatch_id, current_microbatch, checkpoint_activations_microbatch): + """Helper method to run forward step with model split into chunks + (run set_virtual_pipeline_model_parallel_rank() before calling + forward_step()).""" + model_chunk_id = get_model_chunk_id(microbatch_id, forward=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(model_chunk_id) + + # launch param synchronization for next model chunk + # Note: Asynchronous communication tends to slow down compute. + # To reduce idling from mismatched microbatch times, we launch + # asynchronous communication at the same time across the + # pipeline-parallel group. + if config.param_sync_func is not None: + param_sync_microbatch_id = microbatch_id + pipeline_parallel_rank + if ( + param_sync_microbatch_id < total_num_microbatches + and is_first_microbatch_for_model_chunk(param_sync_microbatch_id) + ): + param_sync_chunk_id = get_model_chunk_id(param_sync_microbatch_id, forward=True) + 1 + if 1 < param_sync_chunk_id < num_model_chunks: + config.param_sync_func[param_sync_chunk_id]( + model[param_sync_chunk_id].parameters() + ) + + # forward step + if parallel_state.is_pipeline_first_stage(): + if len(input_tensors[model_chunk_id]) == len(output_tensors[model_chunk_id]): + input_tensors[model_chunk_id].append(None) + input_tensor = input_tensors[model_chunk_id][-1] + + output_tensor, num_tokens = forward_step( + forward_step_func, + data_iterator[model_chunk_id], + model[model_chunk_id], + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + checkpoint_activations_microbatch, + check_first_val_step( + first_val_step, + forward_only, + is_first_microbatch_for_model_chunk(microbatch_id), + ), + current_microbatch=current_microbatch, + ) + output_tensors[model_chunk_id].append(output_tensor) + + nonlocal total_num_tokens + total_num_tokens += num_tokens.item() + + # if forward-only, no need to save tensors for a backward pass + if forward_only: + input_tensors[model_chunk_id].pop() + output_tensors[model_chunk_id].pop() + + return output_tensor + + def backward_step_helper(microbatch_id): + """Helper method to run backward step with model split into chunks + (run set_virtual_pipeline_model_parallel_rank() before calling + backward_step()).""" + model_chunk_id = get_model_chunk_id(microbatch_id, forward=False) + parallel_state.set_virtual_pipeline_model_parallel_rank(model_chunk_id) + + # launch grad synchronization (default) + if config.grad_sync_func is None and is_last_microbatch_for_model_chunk(microbatch_id): + enable_grad_sync() + synchronized_model_chunks.add(model_chunk_id) + + if parallel_state.is_pipeline_last_stage(): + if len(output_tensor_grads[model_chunk_id]) == 0: + output_tensor_grads[model_chunk_id].append(None) + input_tensor = input_tensors[model_chunk_id].pop(0) + output_tensor = output_tensors[model_chunk_id].pop(0) + output_tensor_grad = output_tensor_grads[model_chunk_id].pop(0) + input_tensor_grad = backward_step( + input_tensor, output_tensor, output_tensor_grad, model_type, config + ) + + # launch grad synchronization (custom grad sync) + # Note: Asynchronous communication tends to slow down compute. + # To reduce idling from mismatched microbatch times, we launch + # asynchronous communication at the same time across the + # pipeline-parallel group. + if config.grad_sync_func is not None: + grad_sync_microbatch_id = microbatch_id - pipeline_parallel_rank + if grad_sync_microbatch_id >= 0 and is_last_microbatch_for_model_chunk( + grad_sync_microbatch_id + ): + grad_sync_chunk_id = get_model_chunk_id(grad_sync_microbatch_id, forward=False) + enable_grad_sync() + config.grad_sync_func[grad_sync_chunk_id](model[grad_sync_chunk_id].parameters()) + synchronized_model_chunks.add(grad_sync_chunk_id) + disable_grad_sync() + + return input_tensor_grad + + # Run warmup forward passes. + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + input_tensors[0].append(p2p_communication.recv_forward(tensor_shape, config)) + + fwd_wait_handles = None + bwd_wait_handles = None + + for k in range(num_warmup_microbatches): + + if fwd_wait_handles is not None: + for req in fwd_wait_handles: + req.wait() + + cur_model_chunk_id = get_model_chunk_id(k, forward=True) + # Decide to checkpoint all layers' activations of the current micro-batch + if max_outstanding_backprops is not None: + checkpoint_activations_microbatch = ( + k % max_outstanding_backprops + >= config.num_microbatches_with_partial_activation_checkpoints + ) + else: + checkpoint_activations_microbatch = None + + current_microbatch = get_microbatch_id_in_model_chunk(k, forward=True) + output_tensor = forward_step_helper( + k, current_microbatch, checkpoint_activations_microbatch + ) + + # Determine if tensor should be received from previous stage. + next_forward_model_chunk_id = get_model_chunk_id(k + 1, forward=True) + recv_prev = True + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + if next_forward_model_chunk_id == 0: + recv_prev = False + if k == (total_num_microbatches - 1): + recv_prev = False + + # Don't send tensor downstream if on last stage. + if parallel_state.is_pipeline_last_stage(): + output_tensor = None + + # Send and receive tensors as appropriate (send tensors computed + # in this iteration; receive tensors for next iteration). + if not config.overlap_p2p_comm: + if ( + k == (num_warmup_microbatches - 1) + and not forward_only + and not all_warmup_microbatches + ): + input_tensor_grad = None + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + recv_next = False + ( + input_tensor, + output_tensor_grad, + ) = p2p_communication.send_forward_backward_recv_forward_backward( + output_tensor, + input_tensor_grad, + recv_prev=recv_prev, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + ) + output_tensor_grads[num_model_chunks - 1].append(output_tensor_grad) + else: + input_tensor = p2p_communication.send_forward_recv_forward( + output_tensor, recv_prev=recv_prev, tensor_shape=tensor_shape, config=config + ) + input_tensors[next_forward_model_chunk_id].append(input_tensor) + else: + input_tensor, fwd_wait_handles = p2p_communication.send_forward_recv_forward( + output_tensor, + recv_prev=recv_prev, + tensor_shape=tensor_shape, + config=config, + overlap_p2p_comm=True, + ) + + if ( + k == (num_warmup_microbatches - 1) + and not forward_only + and not all_warmup_microbatches + ): + input_tensor_grad = None + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + recv_next = False + + ( + output_tensor_grad, + bwd_wait_handles, + ) = p2p_communication.send_backward_recv_backward( + input_tensor_grad, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + overlap_p2p_comm=True, + ) + + output_tensor_grads[num_model_chunks - 1].append(output_tensor_grad) + input_tensors[next_forward_model_chunk_id].append(input_tensor) + + deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs) + + # Run 1F1B in steady state. + for k in range(num_microbatches_remaining): + # Forward pass. + forward_k = k + num_warmup_microbatches + + # Decide to checkpoint all layers' activations of the current micro-batch + if max_outstanding_backprops is not None: + checkpoint_activations_microbatch = ( + forward_k % max_outstanding_backprops + >= config.num_microbatches_with_partial_activation_checkpoints + ) + else: + checkpoint_activations_microbatch = None + + cur_model_chunk_id = get_model_chunk_id(forward_k, forward=True) + current_microbatch = get_microbatch_id_in_model_chunk(forward_k, forward=True) + if config.overlap_p2p_comm: + if fwd_wait_handles is not None: + for req in fwd_wait_handles: + req.wait() + + deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs) + + output_tensor = forward_step_helper( + forward_k, current_microbatch, checkpoint_activations_microbatch + ) + + # Determine if current stage has anything to send in either direction, + # otherwise set tensor to None. + forward_model_chunk_id = get_model_chunk_id(forward_k, forward=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(forward_model_chunk_id) + + # Last virtual stage no activation tensor to send + if parallel_state.is_pipeline_last_stage(): + output_tensor = None + + # Determine if peers are sending, and where in data structure to put + # received tensors. + recv_prev = True + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + # First stage is ahead of last stage by (pipeline_parallel_size - 1). + next_forward_model_chunk_id = get_model_chunk_id( + forward_k - (pipeline_parallel_size - 1), forward=True + ) + if next_forward_model_chunk_id == (num_model_chunks - 1): + recv_prev = False + next_forward_model_chunk_id += 1 + else: + next_forward_model_chunk_id = get_model_chunk_id(forward_k + 1, forward=True) + + # If last iteration, don't receive; we already received one extra + # before the start of the for loop. + if k == (num_microbatches_remaining - 1): + recv_prev = False + + # Send activation tensor to the next stage and receive activation tensor from the + # previous stage + input_tensor, fwd_wait_handles = p2p_communication.send_forward_recv_forward( + output_tensor, + recv_prev=recv_prev, + tensor_shape=tensor_shape, + config=config, + overlap_p2p_comm=True, + ) + # assert fwd_wait_handles is not None + + if bwd_wait_handles is not None: + for req in bwd_wait_handles: + req.wait() + + # Backward pass. + backward_k = k + input_tensor_grad = backward_step_helper(backward_k) + + backward_model_chunk_id = get_model_chunk_id(backward_k, forward=False) + parallel_state.set_virtual_pipeline_model_parallel_rank(backward_model_chunk_id) + + # First virtual stage no activation gradient tensor to send + if parallel_state.is_pipeline_first_stage(): + input_tensor_grad = None + + # Determine if the current virtual stage has an activation gradient tensor to receive + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + # Last stage is ahead of first stage by (pipeline_parallel_size - 1). + next_backward_model_chunk_id = get_model_chunk_id( + backward_k - (pipeline_parallel_size - 1), forward=False + ) + if next_backward_model_chunk_id == 0: + recv_next = False + next_backward_model_chunk_id -= 1 + else: + next_backward_model_chunk_id = get_model_chunk_id(backward_k + 1, forward=False) + + output_tensor_grad, bwd_wait_handles = p2p_communication.send_backward_recv_backward( + input_tensor_grad, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + overlap_p2p_comm=True, + ) + + else: # no p2p overlap + output_tensor = forward_step_helper( + forward_k, current_microbatch, checkpoint_activations_microbatch + ) + + # Backward pass. + backward_k = k + input_tensor_grad = backward_step_helper(backward_k) + + # Send output_tensor and input_tensor_grad, receive input_tensor + # and output_tensor_grad. + + # Determine if current stage has anything to send in either direction, + # otherwise set tensor to None. + forward_model_chunk_id = get_model_chunk_id(forward_k, forward=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(forward_model_chunk_id) + if parallel_state.is_pipeline_last_stage(): + output_tensor = None + + backward_model_chunk_id = get_model_chunk_id(backward_k, forward=False) + parallel_state.set_virtual_pipeline_model_parallel_rank(backward_model_chunk_id) + if parallel_state.is_pipeline_first_stage(): + input_tensor_grad = None + + # Determine if peers are sending, and where in data structure to put + # received tensors. + recv_prev = True + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + # First stage is ahead of last stage by (pipeline_parallel_size - 1). + next_forward_model_chunk_id = get_model_chunk_id( + forward_k - (pipeline_parallel_size - 1), forward=True + ) + if next_forward_model_chunk_id == (num_model_chunks - 1): + recv_prev = False + next_forward_model_chunk_id += 1 + else: + next_forward_model_chunk_id = get_model_chunk_id(forward_k + 1, forward=True) + + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + # Last stage is ahead of first stage by (pipeline_parallel_size - 1). + next_backward_model_chunk_id = get_model_chunk_id( + backward_k - (pipeline_parallel_size - 1), forward=False + ) + if next_backward_model_chunk_id == 0: + recv_next = False + next_backward_model_chunk_id -= 1 + else: + next_backward_model_chunk_id = get_model_chunk_id(backward_k + 1, forward=False) + + # If last iteration, don't receive; we already received one extra + # before the start of the for loop. + if k == (num_microbatches_remaining - 1): + recv_prev = False + + # Communicate tensors. + ( + input_tensor, + output_tensor_grad, + ) = p2p_communication.send_forward_backward_recv_forward_backward( + output_tensor, + input_tensor_grad, + recv_prev=recv_prev, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + ) + deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs) + + # Put input_tensor and output_tensor_grad in data structures in the + # right location. + if recv_prev: + input_tensors[next_forward_model_chunk_id].append(input_tensor) + if recv_next: + output_tensor_grads[next_backward_model_chunk_id].append(output_tensor_grad) + + deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs) + + # Run cooldown backward passes (flush out pipeline). + if not forward_only: + if config.overlap_p2p_comm and bwd_wait_handles is not None: + for wait_handle in bwd_wait_handles: + wait_handle.wait() + + if all_warmup_microbatches: + output_tensor_grads[num_model_chunks - 1].append( + p2p_communication.recv_backward(tensor_shape, config=config) + ) + for k in range(num_microbatches_remaining, total_num_microbatches): + input_tensor_grad = backward_step_helper(k) + next_backward_model_chunk_id = get_model_chunk_id(k + 1, forward=False) + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + if next_backward_model_chunk_id == (num_model_chunks - 1): + recv_next = False + if k == (total_num_microbatches - 1): + recv_next = False + output_tensor_grads[next_backward_model_chunk_id].append( + p2p_communication.send_backward_recv_backward( + input_tensor_grad, recv_next=recv_next, tensor_shape=tensor_shape, config=config + ) + ) + + # Launch any remaining grad reductions. + enable_grad_sync() + if config.grad_sync_func is not None: + for model_chunk_id in range(num_model_chunks): + if model_chunk_id not in synchronized_model_chunks: + config.grad_sync_func[model_chunk_id](model[model_chunk_id].parameters()) + synchronized_model_chunks.add(model_chunk_id) + + if config.finalize_model_grads_func is not None and not forward_only: + + # If defer_embedding_wgrad_compute is enabled we need to do the + # weight gradient GEMM's here. + finish_embedding_wgrad_compute(config, embedding_module) + + # Finalize model grads (perform full grad all-reduce / reduce-scatter for + # data parallelism, layernorm all-reduce for sequence parallelism, and + # embedding all-reduce for pipeline parallelism). + config.finalize_model_grads_func( + model, total_num_tokens if config.calculate_per_token_loss else None + ) + + if config.timers is not None: + config.timers('forward-backward').stop() + + return forward_data_store + + +def get_tensor_shapes( + *, + rank: int, + model_type: ModelType, + seq_length: int, + micro_batch_size: int, + decoder_seq_length: int, + config, +): + # Determine right tensor sizes (based on position of rank with respect to split + # rank) and model size. + # Send two tensors if model is T5 and rank is in decoder stage: + # first tensor is decoder (pre-transpose), + # second tensor is encoder (post-transpose). + # If model is T5 and rank is at the boundary: + # send one tensor (post-transpose from encoder). + # Otherwise, send one tensor (pre-transpose). + tensor_shapes = [] + + seq_length = seq_length // parallel_state.get_context_parallel_world_size() + if model_type == ModelType.encoder_and_decoder: + decoder_seq_length = decoder_seq_length // parallel_state.get_context_parallel_world_size() + + if config.sequence_parallel: + seq_length = seq_length // parallel_state.get_tensor_model_parallel_world_size() + if model_type == ModelType.encoder_and_decoder: + decoder_seq_length = ( + decoder_seq_length // parallel_state.get_tensor_model_parallel_world_size() + ) + + if model_type == ModelType.encoder_and_decoder: + if parallel_state.is_pipeline_stage_before_split(rank): + tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size)) + else: + tensor_shapes.append((decoder_seq_length, micro_batch_size, config.hidden_size)) + tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size)) + else: + tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size)) + return tensor_shapes + + +def recv_forward(tensor_shapes, config): + input_tensors = [] + for tensor_shape in tensor_shapes: + if tensor_shape is None: + input_tensors.append(None) + else: + input_tensors.append(p2p_communication.recv_forward(tensor_shape, config)) + return input_tensors + + +def recv_backward(tensor_shapes, config): + output_tensor_grads = [] + for tensor_shape in tensor_shapes: + if tensor_shape is None: + output_tensor_grads.append(None) + else: + output_tensor_grads.append(p2p_communication.recv_backward(tensor_shape, config)) + return output_tensor_grads + + +def send_forward(output_tensors, tensor_shapes, config): + if not isinstance(output_tensors, list): + output_tensors = [output_tensors] + for output_tensor, tensor_shape in zip(output_tensors, tensor_shapes): + if tensor_shape is None: + continue + p2p_communication.send_forward(output_tensor, config) + + +def send_backward(input_tensor_grads, tensor_shapes, config): + if not isinstance(input_tensor_grads, list): + input_tensor_grads = [input_tensor_grads] + for input_tensor_grad, tensor_shape in zip(input_tensor_grads, tensor_shapes): + if tensor_shape is None: + continue + p2p_communication.send_backward(input_tensor_grad, config) + + +def send_forward_recv_backward(output_tensors, tensor_shapes, config): + if not isinstance(output_tensors, list): + output_tensors = [output_tensors] + output_tensor_grads = [] + for output_tensor, tensor_shape in zip(output_tensors, tensor_shapes): + if tensor_shape is None: + output_tensor_grads.append(None) + continue + output_tensor_grad = p2p_communication.send_forward_recv_backward( + output_tensor, tensor_shape, config + ) + output_tensor_grads.append(output_tensor_grad) + return output_tensor_grads + + +def send_backward_recv_forward(input_tensor_grads, tensor_shapes, config): + if not isinstance(input_tensor_grads, list): + input_tensor_grads = [input_tensor_grads] + input_tensors = [] + for input_tensor_grad, tensor_shape in zip(input_tensor_grads, tensor_shapes): + if tensor_shape is None: + input_tensors.append(None) + continue + input_tensor = p2p_communication.send_backward_recv_forward( + input_tensor_grad, tensor_shape, config + ) + input_tensors.append(input_tensor) + return input_tensors + + +def forward_backward_pipelining_without_interleaving( + *, + forward_step_func, + data_iterator: Union[Iterator, List[Iterator]], + model: Union[torch.nn.Module, List[torch.nn.Module]], + num_microbatches: int, + seq_length: int, + micro_batch_size: int, + decoder_seq_length: int = None, + forward_only: bool = False, + collect_non_loss_data: bool = False, + first_val_step: bool = None, +): + """Run non-interleaved 1F1B schedule, with communication between pipeline + stages. + + Returns dictionary with losses if the last stage, empty dict otherwise.""" + + if isinstance(model, list): + assert ( + len(model) == 1 + ), "non-interleaved pipeline parallelism does not support model chunking" + model = model[0] + if isinstance(data_iterator, list): + assert ( + len(data_iterator) == 1 + ), "non-pipeline-parallel schedule does not support model chunking" + data_iterator = data_iterator[0] + + config = get_model_config(model) + if config.overlap_p2p_comm: + raise ValueError( + "Non-interleaved pipeline parallelism does not support overlapping p2p communication" + ) + + # Needed only when gradients are finalized in M-Core + if config.finalize_model_grads_func is not None and not forward_only: + embedding_module = clear_embedding_activation_buffer(config, model) + + if config.timers is not None: + config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) + + # Disable async grad reductions + no_sync_func = config.no_sync_func + if no_sync_func is None: + no_sync_func = contextlib.nullcontext + no_sync_context = None + + def disable_grad_sync(): + """Disable asynchronous grad reductions""" + nonlocal no_sync_context + if no_sync_context is None: + no_sync_context = no_sync_func() + no_sync_context.__enter__() + + def enable_grad_sync(): + """Enable asynchronous grad reductions""" + nonlocal no_sync_context + if no_sync_context is not None: + no_sync_context.__exit__(None, None, None) + no_sync_context = None + + disable_grad_sync() + + # Compute number of warmup microbatches. + num_warmup_microbatches = ( + parallel_state.get_pipeline_model_parallel_world_size() + - parallel_state.get_pipeline_model_parallel_rank() + - 1 + ) + num_warmup_microbatches = min(num_warmup_microbatches, num_microbatches) + num_microbatches_remaining = num_microbatches - num_warmup_microbatches + + # Checkpoint the activations of partial Transformer layers in a number of micro-batches + # within the maximum outstanding micro-batch backpropagations. + # Micro-batches with the ids less than 'num_microbatches_with_partial_activation_checkpoints' + # checkpoint partial Transformer layers (or skip checkpointing) and + # the rest of micro-batches within a window of micro-batches checkpoint + # all Transformer layers. The window of micro-batches is set by the maximum + # outstanding backpropagations and becomes smaller at later pipeline stages. + # Please refer the appendix C in https://arxiv.org/pdf/2205.05198.pdf + max_outstanding_backprops = None + if config.num_microbatches_with_partial_activation_checkpoints is not None: + max_outstanding_backprops = num_warmup_microbatches + 1 + + model_type = get_model_type(model) + + rank = parallel_state.get_pipeline_model_parallel_rank() + recv_tensor_shapes = get_tensor_shapes( + rank=rank - 1, + model_type=model_type, + seq_length=seq_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=decoder_seq_length, + config=config, + ) + send_tensor_shapes = get_tensor_shapes( + rank=rank, + model_type=model_type, + seq_length=seq_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=decoder_seq_length, + config=config, + ) + + # Input, output tensors only need to be saved when doing backward passes + input_tensors = None + output_tensors = None + total_num_tokens = torch.tensor(0, dtype=torch.int).cuda() + + if not forward_only: + input_tensors = [] + output_tensors = [] + forward_data_store = [] + + # Run warmup forward passes. + for i in range(num_warmup_microbatches): + # Decide to checkpoint all layers' activations of the current micro-batch + if max_outstanding_backprops is not None: + checkpoint_activations_microbatch = ( + i % max_outstanding_backprops + >= config.num_microbatches_with_partial_activation_checkpoints + ) + else: + checkpoint_activations_microbatch = None + + input_tensor = recv_forward(recv_tensor_shapes, config) + output_tensor, num_tokens = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + checkpoint_activations_microbatch, + check_first_val_step(first_val_step, forward_only, i == 0), + current_microbatch=i, + ) + send_forward(output_tensor, send_tensor_shapes, config) + total_num_tokens += num_tokens.item() + + if not forward_only: + input_tensors.append(input_tensor) + output_tensors.append(output_tensor) + deallocate_output_tensor(output_tensor[0], config.deallocate_pipeline_outputs) + + # Before running 1F1B, need to receive first forward tensor. + # If all microbatches are run in warmup / cooldown phase, then no need to + # receive this tensor here. + if num_microbatches_remaining > 0: + input_tensor = recv_forward(recv_tensor_shapes, config) + + # Run 1F1B in steady state. + for i in range(num_microbatches_remaining): + last_iteration = i == (num_microbatches_remaining - 1) + + # Decide to checkpoint all layers' activations of the current micro-batch + if max_outstanding_backprops is not None: + checkpoint_activations_microbatch = ( + (i + num_warmup_microbatches) % max_outstanding_backprops + ) >= config.num_microbatches_with_partial_activation_checkpoints + else: + checkpoint_activations_microbatch = None + + output_tensor, num_tokens = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + checkpoint_activations_microbatch, + check_first_val_step( + first_val_step, forward_only, (i == 0) and (num_warmup_microbatches == 0) + ), + current_microbatch=i + num_warmup_microbatches, + ) + total_num_tokens += num_tokens.item() + + if forward_only: + send_forward(output_tensor, send_tensor_shapes, config) + + if not last_iteration: + input_tensor = recv_forward(recv_tensor_shapes, config) + + else: + output_tensor_grad = send_forward_recv_backward( + output_tensor, send_tensor_shapes, config + ) + + # Add input_tensor and output_tensor to end of list. + input_tensors.append(input_tensor) + output_tensors.append(output_tensor) + deallocate_output_tensor(output_tensor[0], config.deallocate_pipeline_outputs) + + # Pop input_tensor and output_tensor from the start of the list for + # the backward pass. + input_tensor = input_tensors.pop(0) + output_tensor = output_tensors.pop(0) + + # Enable grad sync for the last microbatch in the batch if the full + # backward pass completes in the 1F1B stage. + if num_warmup_microbatches == 0 and last_iteration: + if config.grad_sync_func is None or rank == 0: + enable_grad_sync() + + input_tensor_grad = backward_step( + input_tensor, output_tensor, output_tensor_grad, model_type, config + ) + + if last_iteration: + input_tensor = None + send_backward(input_tensor_grad, recv_tensor_shapes, config) + else: + input_tensor = send_backward_recv_forward( + input_tensor_grad, recv_tensor_shapes, config + ) + + # Run cooldown backward passes. + if not forward_only: + for i in range(num_warmup_microbatches): + + # Enable async grad reduction in the last backward pass + # Note: If grad sync function is provided, only enable + # async grad reduction in first pipeline stage. Other + # pipeline stages do grad reduction during pipeline + # bubble. + if i == num_warmup_microbatches - 1: + if config.grad_sync_func is None or rank == 0: + enable_grad_sync() + + input_tensor = input_tensors.pop(0) + output_tensor = output_tensors.pop(0) + + output_tensor_grad = recv_backward(send_tensor_shapes, config) + + input_tensor_grad = backward_step( + input_tensor, output_tensor, output_tensor_grad, model_type, config + ) + + send_backward(input_tensor_grad, recv_tensor_shapes, config) + + # Launch any remaining grad reductions. + if no_sync_context is not None: + enable_grad_sync() + if config.grad_sync_func is not None: + config.grad_sync_func(model.parameters()) + + if config.finalize_model_grads_func is not None and not forward_only: + + # If defer_embedding_wgrad_compute is enabled we need to do the + # weight gradient GEMM's here. + finish_embedding_wgrad_compute(config, embedding_module) + + # Finalize model grads (perform full grad all-reduce / reduce-scatter for + # data parallelism, layernorm all-reduce for sequence parallelism, and + # embedding all-reduce for pipeline parallelism). + config.finalize_model_grads_func( + [model], total_num_tokens if config.calculate_per_token_loss else None + ) + + if config.timers is not None: + config.timers('forward-backward').stop() + + return forward_data_store diff --git a/megatron/core/requirements.txt b/megatron/core/requirements.txt new file mode 100644 index 0000000..08ed5ee --- /dev/null +++ b/megatron/core/requirements.txt @@ -0,0 +1 @@ +torch \ No newline at end of file diff --git a/megatron/core/ssm/__init__.py b/megatron/core/ssm/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py new file mode 100644 index 0000000..9b18554 --- /dev/null +++ b/megatron/core/ssm/mamba_block.py @@ -0,0 +1,228 @@ +# Copyright (c) 2024, Tri Dao, Albert Gu. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +# Some of this code was adopted from https://github.com/state-spaces/mamba/ +# This source code is licensed under the Apache license found in the +# LICENSE file in the root directory of this source tree. + +import math +from dataclasses import dataclass +from functools import partial +from typing import Union + +from torch import Tensor, nn + +from megatron.core import parallel_state +from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols as LayerSymbols +from megatron.core.ssm.mamba_hybrid_layer_allocation import allocate_layers +from megatron.core.tensor_parallel import get_cuda_rng_tracker +from megatron.core.transformer.custom_layers.transformer_engine import TENorm +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_viewless_tensor + + +# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454 +def _init_weights( + module, + n_layer, + initializer_range=0.02, # Now only used for embedding layer. + rescale_prenorm_residual=True, + n_residuals_per_layer=1, # Change to 2 if we have MLP +): + with get_cuda_rng_tracker().fork(): + if isinstance(module, nn.Linear): + if not getattr(module.weight, "_no_reinit", False): + nn.init.normal_(module.weight, std=initializer_range) + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=initializer_range) + + for name, p in module.named_parameters(): + if name in ["in_proj.weight", "x_proj.weight", "conv1d.weight", "out_proj.weight"]: + nn.init.kaiming_uniform(p, a=math.sqrt(5)) + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight", "fc2.weight"]: + # Special Scaled Initialization + nn.init.normal_( + p, + mean=0.0, + std=initializer_range / math.sqrt(n_residuals_per_layer * n_layer), + ) + + +@dataclass +class MambaStackSubmodules: + mamba_layer: Union[ModuleSpec, type] = IdentityOp + attention_layer: Union[ModuleSpec, type] = IdentityOp + mlp_layer: Union[ModuleSpec, type] = IdentityOp + + +class MambaStack(MegatronModule): + def __init__( + self, + config: TransformerConfig, + submodules: MambaStackSubmodules, + mamba_ssm_ngroups: int = 8, + residual_in_fp32=False, + pre_process: bool = True, + hybrid_attention_ratio: float = 0.0, + hybrid_mlp_ratio: float = 0.0, + hybrid_override_pattern: str = None, + post_layer_norm: bool = True, + post_process: bool = True, + device=None, + dtype=None, + ) -> None: + super().__init__(config=config) + self.residual_in_fp32 = residual_in_fp32 + self.pre_process = pre_process + self.post_layer_norm = post_layer_norm + self.post_process = post_process + + # Required for pipeline parallel schedules + self.input_tensor = None + + self.hybrid_attention_ratio = hybrid_attention_ratio + self.hybrid_mlp_ratio = hybrid_mlp_ratio + self.hybrid_override_pattern = hybrid_override_pattern + + layer_type_list = allocate_layers( + self.config.num_layers, + self.hybrid_attention_ratio, + self.hybrid_mlp_ratio, + self.hybrid_override_pattern, + ) + + pp_layer_offset = 0 + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + pp_layer_offset, layer_type_list = self._select_layers_for_pipeline_parallel( + layer_type_list + ) + + self.layers = nn.ModuleList() + for i, layer_type in enumerate(layer_type_list): + if layer_type == LayerSymbols.MAMBA: + layer = build_module( + submodules.mamba_layer, + config=self.config, + mamba_ssm_ngroups=mamba_ssm_ngroups, + residual_in_fp32=residual_in_fp32, + layer_number=i + 1 + pp_layer_offset, + ) + elif layer_type == LayerSymbols.ATTENTION: + # Transformer layers apply their own pp_layer_offset + layer = build_module( + submodules.attention_layer, config=self.config, layer_number=i + 1 + ) + elif layer_type == LayerSymbols.MLP: + # Transformer layers apply their own pp_layer_offset + layer = build_module(submodules.mlp_layer, config=self.config, layer_number=i + 1) + else: + assert True, "unexpected layer_type" + self.layers.append(layer) + + # Required for activation recomputation + self.num_layers_per_pipeline_rank = len(self.layers) + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. + self.final_norm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + + self.apply( + partial( + _init_weights, + n_layer=self.config.num_layers, + ) + ) + + def _select_layers_for_pipeline_parallel(self, layer_type_list): + pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() + num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + assert parallel_state.get_virtual_pipeline_model_parallel_world_size() is None, ( + "The Mamba hybrid model does not currently support " + "virtual/interleaved pipeline parallelism" + ) + + offset = pipeline_rank * num_layers_per_pipeline_rank + selected_list = layer_type_list[offset : offset + num_layers_per_pipeline_rank] + + return offset, selected_list + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None): + return { + i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype) + for i, layer in enumerate(self.layers) + } + + def set_input_tensor(self, input_tensor: Tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + inference_params=None, + rotary_pos_emb: Tensor = None, + ): + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + if inference_params: + # NOTE(bnorick): match InferenceParams attributes for mamba_ssm.utils.generation.InferenceParams, + # this hack supports eval + inference_params.max_seqlen = inference_params.max_sequence_length + inference_params.seqlen_offset = inference_params.sequence_len_offset + + for layer in self.layers: + hidden_states = layer( + hidden_states, + attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + # The attention layer (currently a simplified transformer layer) + # outputs a tuple of (hidden_states, context). Context is intended + # for cross-attention, and is not needed in our model. + if isinstance(hidden_states, tuple): + hidden_states = hidden_states[0] + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_norm(hidden_states) + + # Ensure that the tensor passed between pipeline parallel stages is + # viewless. See related notes in TransformerBlock and TransformerLayer + output = make_viewless_tensor( + inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True + ) + + return hidden_states diff --git a/megatron/core/ssm/mamba_hybrid_layer_allocation.py b/megatron/core/ssm/mamba_hybrid_layer_allocation.py new file mode 100644 index 0000000..abfa2ae --- /dev/null +++ b/megatron/core/ssm/mamba_hybrid_layer_allocation.py @@ -0,0 +1,191 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import logging + +if __name__ != "__main__": + from megatron.core.utils import log_single_rank +else: + from typing import Any + + def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, **kwargs: Any): + print(*args[1:], **kwargs) + + +logger = logging.getLogger(__name__) + + +class Symbols: + MAMBA = 'M' + ATTENTION = '*' + MLP = '-' + VALID = {MAMBA, ATTENTION, MLP} + + +def _allocate_auto( + total_layers_count: int, target_attention_ratio: float, target_mlp_ratio: float +) -> list: + # First, allocate attention (evenly spaced, starting and ending with mamba) + attention_layers_count: int = round(total_layers_count * target_attention_ratio) + mamba_layers_count: int = total_layers_count - attention_layers_count + mamba_sections_count: int = attention_layers_count + 1 + mamba_section_length: float = mamba_layers_count / mamba_sections_count + + layer_type_list = [Symbols.MAMBA] * total_layers_count + x: float = mamba_section_length + for l in range(total_layers_count): + if x < 0.5: + layer_type_list[l] = Symbols.ATTENTION + x += mamba_section_length + else: + x -= 1 + + # Next, allocate mlp + # (evenly distributed, but right-justified, not replacing attention) + mlp_layers_count: int = round(total_layers_count * target_mlp_ratio) + if mlp_layers_count > 0: + mamba_layers_count -= mlp_layers_count + mamba_to_mlp_ratio: float = mamba_layers_count / mlp_layers_count + + x: float = mamba_to_mlp_ratio + for l in range(total_layers_count): + if layer_type_list[l] == Symbols.MAMBA: + if x < 0.5: + layer_type_list[l] = Symbols.MLP + x += mamba_to_mlp_ratio + else: + x -= 1 + + return layer_type_list + + +def _allocate_override(total_layers_count: int, override_pattern: str) -> list: + layer_type_list = list(override_pattern) + override_pattern_length = len(layer_type_list) + if override_pattern_length != total_layers_count: + raise ValueError( + "The hybrid override pattern is the wrong " + f"length: got {override_pattern_length}, expected " + f"{total_layers_count}" + ) + for l in layer_type_list: + if l not in Symbols.VALID: + raise ValueError(f"In hybrid override pattern, '{l}' is not " f"one of {Symbols.VALID}") + + return layer_type_list + + +def _layer_counts_match(a: list, b: list) -> bool: + for s in Symbols.VALID: + if a.count(s) != b.count(s): + return False + return True + + +def allocate_layers( + total_layers_count: int, + target_attention_ratio: float, + target_mlp_ratio: float, + override_pattern: str = None, +) -> list: + assert total_layers_count > 0 + assert target_attention_ratio >= 0.0 and target_attention_ratio <= 1.0 + assert target_mlp_ratio >= 0.0 and target_mlp_ratio <= 1.0 + assert target_attention_ratio + target_mlp_ratio <= 1.0 + # Note: target_mamba_ratio = 1.0 - target_attention_ratio - target_mlp_ratio + + layer_type_list = _allocate_auto(total_layers_count, target_attention_ratio, target_mlp_ratio) + + if override_pattern is not None: + layer_type_list_override = _allocate_override(total_layers_count, override_pattern) + log_single_rank(logger, logging.INFO, "Using hybrid override pattern") + if (target_attention_ratio > 0.0 or target_mlp_ratio > 0.0) and not _layer_counts_match( + layer_type_list_override, layer_type_list + ): + raise ValueError( + "The number of each type of layer in the override " + "pattern must match the number in the overridden " + "pattern." + ) + if layer_type_list_override == layer_type_list: + log_single_rank( + logger, logging.INFO, "The override pattern matches the overridden pattern" + ) + else: + log_single_rank(logger, logging.INFO, "Warning: overriding pattern A with pattern B") + log_single_rank(logger, logging.INFO, f"A: {''.join(layer_type_list)}") + log_single_rank(logger, logging.INFO, f"B: {''.join(layer_type_list_override)}") + layer_type_list = layer_type_list_override + + if target_attention_ratio > 0.0 or target_mlp_ratio > 0.0 or override_pattern is not None: + actual_attention_layers_count = layer_type_list.count(Symbols.ATTENTION) + actual_attention_ratio = actual_attention_layers_count / total_layers_count + actual_mlp_layers_count = layer_type_list.count(Symbols.MLP) + actual_mlp_ratio = actual_mlp_layers_count / total_layers_count + allocation_string = ''.join(layer_type_list) + log_single_rank( + logger, + logging.INFO, + f"Hybrid allocation ({Symbols.MAMBA} is mamba, " + f"{Symbols.ATTENTION} is attention, " + f"{Symbols.MLP} is mlp):", + ) + log_single_rank(logger, logging.INFO, allocation_string) + log_single_rank( + logger, + logging.INFO, + f"{actual_attention_layers_count} attention layers in " + f"{total_layers_count} total layers.", + ) + log_single_rank( + logger, + logging.INFO, + f"Target attention ratio: {target_attention_ratio:.2f}. " + f"Actual attention ratio: {actual_attention_ratio:.2f}.", + ) + log_single_rank( + logger, + logging.INFO, + f"{actual_mlp_layers_count} mlp layers in " f"{total_layers_count} total layers.", + ) + log_single_rank( + logger, + logging.INFO, + f"Target mlp ratio: {target_mlp_ratio:.2f}. " + f"Actual mlp ratio: {actual_mlp_ratio:.2f}.", + ) + return layer_type_list + + +if __name__ == "__main__": + test_cases = [ + # (10, 0.2, 0.0), + # (48, 0.0, 0.0), # will not print anything + # (48, 0.1, 0.0), + # 48, 0.3, 0.0), + # (48, 0.5, 0.0), + # (48, 0.6, 0.0), + # (48, 0.7, 0.0), + # (10, 0.0, 0.1), + # (10, 0.0, 0.3), + # (10, 0.0, 0.5), + # (10, 0.1, 0.1), + # (10, 0.2, 0.2), + # (10, 0.3, 0.3), + # (10, 0.5, 0.5), + # (48, 0.2, 0.3), + # (48, 0.5, 0.2), + # (48, 0.5, 0.2, "MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-"), + # (48, 0.25, 0.25, "MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-"), + # (48, 0.25, 0.25, "MM-*MM-*MM*-MM*-MM*-MM*-M*M-M*M-M*M-M*M-*MM-*MM-"), + # (48, 0.0, 0.2, "MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-"), + # (48, 0.2, 0.0, "MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-"), + # (48, 0.0, 0.0, "MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-"), + # (48, 0.5, 0.5), + # (10, 0.3, 0.2, "MMM*-*M*M-"), + # (10, 0.3, 0.2, "MM*M-*M*M-"), + (9, 0.0, 0.0, "M*-M*-M*-"), + (9, 0.0, 0.0, "MMMMMMMMM"), + ] + for t in test_cases: + print("") + allocate_layers(*t) diff --git a/megatron/core/ssm/mamba_layer.py b/megatron/core/ssm/mamba_layer.py new file mode 100644 index 0000000..96ec81a --- /dev/null +++ b/megatron/core/ssm/mamba_layer.py @@ -0,0 +1,80 @@ +# Copyright (c) 2024, Tri Dao, Albert Gu. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +# Some of this code was adopted from https://github.com/state-spaces/mamba/ +# This source code is licensed under the Apache license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass +from typing import Union + +import torch +from torch import Tensor + +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + + +@dataclass +class MambaLayerSubmodules: + norm: Union[ModuleSpec, type] = IdentityOp + mixer: Union[ModuleSpec, type] = IdentityOp + mamba_bda: Union[ModuleSpec, type] = IdentityOp + + +class MambaLayer(MegatronModule): + def __init__( + self, + config: TransformerConfig, + submodules: MambaLayerSubmodules, + mamba_ssm_ngroups=8, + layer_number: int = 1, + residual_in_fp32=False, + ): + """ + Top level Mamba Layer + """ + super().__init__(config) + self.config = config + self.layer_number = layer_number + self.residual_in_fp32 = residual_in_fp32 + self.hidden_dropout = config.hidden_dropout + self.mixer = build_module( + submodules.mixer, + self.config, + d_model=self.config.hidden_size, + ngroups=mamba_ssm_ngroups, + layer_number=layer_number, + ) + self.norm = build_module(submodules.norm, self.config, self.config.hidden_size) + self.mamba_bda = build_module(submodules.mamba_bda) + self.bias_dropout_add_exec_handler = torch.enable_grad + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, # Not used in MambaLayer + inference_params=None, + rotary_pos_emb: Tensor = None, # Not used in MambaLayer + ): + + residual = hidden_states + if self.residual_in_fp32: + residual = residual.to(torch.float32) + + hidden_states = hidden_states.to(dtype=self.config.params_dtype) + hidden_states = self.norm(hidden_states) + + mixer_out_with_bias = self.mixer(hidden_states, inference_params=inference_params) + + with self.bias_dropout_add_exec_handler(): + hidden_states = self.mamba_bda(self.training, self.config.bias_dropout_fusion)( + mixer_out_with_bias, residual, self.hidden_dropout + ) + + return hidden_states + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None): + return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype) diff --git a/megatron/core/ssm/mamba_mixer.py b/megatron/core/ssm/mamba_mixer.py new file mode 100644 index 0000000..6a6f89a --- /dev/null +++ b/megatron/core/ssm/mamba_mixer.py @@ -0,0 +1,518 @@ +# Copyright (c) 2024, Tri Dao, Albert Gu. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +# Some of this code was adopted from https://github.com/state-spaces/mamba/ +# This source code is licensed under the Apache license found in the +# LICENSE file in the root directory of this source tree. + +import math +from dataclasses import dataclass +from typing import Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from megatron.core.parallel_state import get_tensor_model_parallel_world_size +from megatron.core.tensor_parallel import get_cuda_rng_tracker +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + +try: + from mamba_ssm.ops.triton.selective_state_update import selective_state_update +except ImportError: + selective_state_update = None + +try: + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +except ImportError: + causal_conv1d_fn = None + causal_conv1d_update = None + +try: + from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated + from mamba_ssm.ops.triton.ssd_combined import ( + mamba_chunk_scan_combined, + mamba_split_conv1d_scan_combined, + ) +except ImportError: + raise ImportError("mamba-ssm is required by the Mamba model but cannot be imported") + +try: + from einops import rearrange, repeat +except ImportError: + raise ImportError("einops is required by the Mamba model but cannot be imported") + + +@dataclass +class MambaMixerSubmodules: + in_proj: Union[ModuleSpec, type] = None + out_proj: Union[ModuleSpec, type] = None + + +class MambaMixer(MegatronModule): + def __init__( + self, + config: TransformerConfig, + submodules: MambaMixerSubmodules, + d_model, + d_state=128, + d_conv=4, + conv_init=None, + expand=2, + headdim=64, + ngroups=8, + A_init_range=(1, 16), + D_has_hdim=False, + rmsnorm=True, + norm_before_gate=False, + dt_min=0.001, + dt_max=0.1, + dt_init="random", + dt_scale=1.0, + dt_init_floor=1e-4, + bias=False, + conv_bias=True, + # Fused kernel and sharding options + chunk_size=128, + use_mem_eff_path=True, + layer_number=None, + ): + super().__init__(config) + self.config = config + self.d_model = d_model + self.d_state = d_state + self.d_conv = d_conv + self.conv_init = conv_init + self.expand = expand + self.d_inner = int(self.expand * self.d_model) + self.headdim = headdim + self.ngroups = ngroups + assert self.d_inner % self.headdim == 0 + self.nheads = self.d_inner // self.headdim + self.D_has_hdim = D_has_hdim + self.rmsnorm = rmsnorm + self.norm_before_gate = norm_before_gate + self.chunk_size = chunk_size + self.use_mem_eff_path = use_mem_eff_path + self.layer_number = layer_number + + self.tensor_model_parallel_size = get_tensor_model_parallel_world_size() + assert self.d_inner % self.tensor_model_parallel_size == 0 + assert self.ngroups % self.tensor_model_parallel_size == 0 + assert self.nheads % self.tensor_model_parallel_size == 0 + assert not bias + assert not self.norm_before_gate + + self.d_inner_local = self.d_inner // self.tensor_model_parallel_size + self.ngroups_local = self.ngroups // self.tensor_model_parallel_size + self.nheads_local = self.nheads // self.tensor_model_parallel_size + + assert self.d_inner_local % self.ngroups_local == 0 + + # Assume sequence parallelism: input is already partitioned along the + # sequence dimension + self.in_proj = build_module( + submodules.in_proj, + self.d_model, + self.d_inner * 2 + 2 * self.ngroups * self.d_state + self.nheads, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='fc1', + ) + + conv_dim = self.d_inner_local + 2 * self.ngroups_local * self.d_state + with get_cuda_rng_tracker().fork(): + self.conv1d = nn.Conv1d( + in_channels=conv_dim, + out_channels=conv_dim, + bias=conv_bias, + kernel_size=d_conv, + groups=conv_dim, + padding=d_conv - 1, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + setattr(self.conv1d.weight, 'tensor_model_parallel', True) + setattr(self.conv1d.bias, 'tensor_model_parallel', True) + + if self.conv_init is not None: + nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init) + + self.activation = "silu" + self.act = nn.SiLU() + + with get_cuda_rng_tracker().fork(): + # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max + dt = torch.exp( + torch.rand( + self.nheads_local, device=torch.cuda.current_device(), dtype=config.params_dtype + ) + * (math.log(dt_max) - math.log(dt_min)) + + math.log(dt_min) + ).clamp(min=dt_init_floor) + # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + torch.log(-torch.expm1(-dt)) + with torch.no_grad(): + self.dt_bias = nn.Parameter(inv_dt) + # Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit + self.dt_bias._no_reinit = True + # Just to be explicit. Without this we already don't put wd on dt_bias because of the check + # name.endswith("bias") in param_grouping.py + self.dt_bias._no_weight_decay = True + + assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0] + A = torch.empty( + self.nheads_local, dtype=torch.float32, device=torch.cuda.current_device() + ).uniform_(*A_init_range) + A_log = torch.log(A) # Keep A_log in fp32 + self.A_log = nn.Parameter(A_log) + self.A_log._no_weight_decay = True + setattr(self.A_log, 'tensor_model_parallel', True) + + # D "skip" parameter + self.D = nn.Parameter( + torch.ones( + self.d_inner_local if self.D_has_hdim else self.nheads_local, + device=torch.cuda.current_device(), + ) + ) # Keep in fp32 + self.D._no_weight_decay = True + setattr(self.D, 'tensor_model_parallel', True) + + if self.rmsnorm: + assert RMSNormGated is not None + self.norm = RMSNormGated( + self.d_inner_local, + eps=1e-5, + group_size=self.d_inner_local // self.ngroups_local, + norm_before_gate=self.norm_before_gate, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + + # Assume sequence parallelism: input is partitioned along d_inner and + # output is partitioned along the sequence dimension + self.out_proj = build_module( + submodules.out_proj, + self.d_inner, + self.d_model, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=bias, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='fc2', + ) + + def forward(self, hidden_states, inference_params=None): + """ + hidden_states: (nL, B, D) / (L B D) + Returns: same shape as hidden_states + """ + _, batch, dim = hidden_states.shape + + conv_state, ssm_state = None, None + if inference_params is not None: + assert not self.config.sequence_parallel + conv_state, ssm_state = self._get_states_from_cache(inference_params, batch) + if inference_params.seqlen_offset > 0: + # The states are updated inplace + out, out_bias, _, _ = self.step(hidden_states, conv_state, ssm_state) + return out, out_bias + + # (nheads_local) + A = -torch.exp(self.A_log.float()) + + xz, _ = self.in_proj(hidden_states) + + # transpose: l b pd --> b l pd + xz = rearrange(xz, "l b d -> b l d").contiguous() + + if self.use_mem_eff_path and inference_params is None: + assert ssm_state is None + + if self.conv1d.bias is not None: + self.conv1d.bias.data_ptr() + + y = mamba_split_conv1d_scan_combined( + xz, + rearrange(self.conv1d.weight, "d 1 w -> d w"), + self.conv1d.bias, + self.dt_bias.float(), + A, + D=( + rearrange(self.D.float(), "(h p) -> h p", p=self.headdim) + if self.D_has_hdim + else self.D + ), + chunk_size=self.chunk_size, + activation=self.activation, + headdim=None if self.D_has_hdim else self.headdim, + ngroups=self.ngroups_local, + norm_before_gate=self.norm_before_gate, + ) + + if self.rmsnorm: + y = self.norm(y) + else: + z, xBC, dt = torch.split( + xz, + [ + self.d_inner_local, + self.d_inner_local + 2 * self.ngroups_local * self.d_state, + self.nheads_local, + ], + dim=-1, + ) + + # transpose: b l pd --> b pd l + xBC = rearrange(xBC, "b l d -> b d l").contiguous() + + # Compute short convolution + if conv_state is not None: + # If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv + # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise. + conv_state.copy_( + F.pad(xBC, (self.d_conv - xBC.shape[-1], 0)) + ) # Update state (B D W) + + seqlen = xBC.size(2) + if causal_conv1d_fn is None: + xBC = self.act(self.conv1d(xBC)[..., :seqlen]) + else: + assert self.activation in ["silu", "swish"] + xBC = causal_conv1d_fn( + x=xBC, + weight=rearrange(self.conv1d.weight, "d 1 w -> d w"), + bias=self.conv1d.bias, + activation=self.activation, + ) + + # transpose b pd l --> b l pd + xBC = rearrange(xBC, "b d l -> b l d").contiguous() + + x, B, C = torch.split( + xBC, + [ + self.d_inner_local, + self.ngroups_local * self.d_state, + self.ngroups_local * self.d_state, + ], + dim=-1, + ) + + # TODO Vijay: fuse most of the transposes with the GEMMS + x = rearrange(x, "b l (h p) -> b l h p", p=self.headdim).contiguous() + dt = dt.contiguous() + B = rearrange(B, "b l (g n) -> b l g n", n=self.d_state).contiguous() + C = rearrange(C, "b l (g n) -> b l g n", n=self.d_state).contiguous() + z = rearrange(z, "b l (h p) -> b l h p", p=self.headdim).contiguous() + y = mamba_chunk_scan_combined( + x, + dt, + A, + B, + C, + self.chunk_size, + D=( + rearrange(self.D.float(), "(h p) -> h p", p=self.headdim) + if self.D_has_hdim + else self.D + ), + z=z if not self.rmsnorm else None, + dt_bias=self.dt_bias.float(), + dt_softplus=True, + return_final_states=ssm_state is not None, + ) + + if ssm_state is not None: + y, last_state = y + ssm_state.copy_(last_state) + + if self.rmsnorm: + y = rearrange(y, "b l h p -> b l (h p)").contiguous() + z = rearrange(z, "b l h p -> b l (h p)").contiguous() + y = self.norm(y, z) + else: + y = rearrange(y, "b l h p -> b l (h p)").contiguous() + + y = rearrange(y, "b l d -> l b d").contiguous() + out, out_bias = self.out_proj(y) + + return out, out_bias + + def step(self, hidden_states, conv_state, ssm_state): + # assert self.ngroups_local == 1, "Only support ngroups=1 for inference for now" + dtype = hidden_states.dtype + assert hidden_states.shape[0] == 1, "Only support decoding with 1 token at a time for now" + + # l b d --> b d + hidden_states = hidden_states.squeeze(0) + + # b d_model --> b p(2d) + xz, _ = self.in_proj(hidden_states) + + z, xBC, dt = torch.split( + xz, + [ + self.d_inner_local, + self.d_inner_local + 2 * self.ngroups_local * self.d_state, + self.nheads_local, + ], + dim=-1, + ) + + # Conv step + if causal_conv1d_update is None: + conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W) + conv_state[:, :, -1] = xBC + xBC = torch.sum( + conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1 + ) # (B D) + if self.conv1d.bias is not None: + xBC = xBC + self.conv1d.bias + xBC = self.act(xBC).to(dtype=dtype) + else: + xBC = causal_conv1d_update( + xBC, + conv_state, + rearrange(self.conv1d.weight, "d 1 w -> d w"), + self.conv1d.bias, + self.activation, + ) + + x, B, C = torch.split( + xBC, + [ + self.d_inner_local, + self.ngroups_local * self.d_state, + self.ngroups_local * self.d_state, + ], + dim=-1, + ) + A = -torch.exp(self.A_log.float()) + + # SSM step + if selective_state_update is None: + if self.ngroups_local > 1: + B = rearrange(B, "b (g n) -> b g n", n=self.d_state) + C = rearrange(C, "b (g n) -> b g n", n=self.d_state) + B = repeat(B, "b g n -> b (g h) n", h=self.d_inner_local // self.ngroups_local) + C = repeat(C, "b g n -> b (g h) n", h=self.d_inner_local // self.ngroups_local) + + dt = repeat(dt, "b h -> b (h p)", p=self.headdim) + dt_bias = repeat(self.dt_bias, "h -> (h p)", p=self.headdim) + A = repeat(A, "h -> (h p) n", p=self.headdim, n=self.d_state) + D = repeat(self.D, "h -> (h p)", p=self.headdim) + + dt = F.softplus(dt + dt_bias.to(dtype=dt.dtype)) + dA = torch.exp(torch.einsum("bd,dn->bdn", dt, A)) + + dB_x = torch.einsum('bd,bdn,bd->bdn', dt, B, x) + ssm_state.copy_( + ssm_state * rearrange(dA, "b (h p) n -> b h p n", p=self.headdim) + + rearrange(dB_x, "b (h p) n -> b h p n", p=self.headdim) + ) + + y = torch.einsum( + "bdn,bdn->bd", + rearrange(ssm_state.to(dtype), "b h p n -> b (h p) n", p=self.headdim), + C, + ) + y = y + D.to(dtype) * x + if not self.rmsnorm: + y = y * self.act(z) # (B D) + else: + # Discretize A and B (b (g n)) + dt = F.softplus(dt + self.dt_bias.to(dtype=dt.dtype)) # (batch, nheads) + dA = torch.exp(dt * A) + x = rearrange(x, "b (h p) -> b h p", p=self.headdim) + dBx = torch.einsum("bh,bn,bhp->bhpn", dt, B, x) + ssm_state.copy_(ssm_state * rearrange(dA, "b h -> b h 1 1") + dBx) + y = torch.einsum("bhpn,bn->bhp", ssm_state.to(dtype), C) + y = y + rearrange(self.D.to(dtype), "h -> h 1") * x + y = rearrange(y, "b h p -> b (h p)") + if not self.rmsnorm: + y = y * self.act(z) # (B D) + else: + A = repeat(A, "h -> h p n", p=self.headdim, n=self.d_state).to(dtype=torch.float32) + dt = repeat(dt, "b h -> b h p", p=self.headdim) + dt_bias = repeat(self.dt_bias, "h -> h p", p=self.headdim) + D = repeat(self.D, "h -> h p", p=self.headdim) + B = rearrange(B, "b (g n) -> b g n", g=self.ngroups_local) + C = rearrange(C, "b (g n) -> b g n", g=self.ngroups_local) + x_reshaped = rearrange(x, "b (h p) -> b h p", p=self.headdim) + if not self.rmsnorm: + z = rearrange(z, "b (h p) -> b h p", p=self.headdim) + y = selective_state_update( + ssm_state, + x_reshaped, + dt, + A, + B, + C, + D, + z=z if not self.rmsnorm else None, + dt_bias=dt_bias, + dt_softplus=True, + ) + y = rearrange(y, "b h p -> b (h p)") + + if self.rmsnorm: + y = self.norm(y, z) + + # b pd --> b d + out, out_bias = self.out_proj(y) + return out.unsqueeze(0), out_bias, conv_state, ssm_state + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None): + device = self.out_proj.weight.device + conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype + conv_state = torch.zeros( + batch_size, self.conv1d.weight.shape[0], self.d_conv, device=device, dtype=conv_dtype + ) + ssm_dtype = self.in_proj.weight.dtype if dtype is None else dtype + # ssm_dtype = torch.float32 + ssm_state = torch.zeros( + batch_size, + self.nheads_local, + self.headdim, + self.d_state, + device=device, + dtype=ssm_dtype, + ) + return conv_state, ssm_state + + def _get_states_from_cache(self, inference_params, batch_size, initialize_states=False): + assert self.layer_number is not None + if self.layer_number not in inference_params.key_value_memory_dict: + conv_state = torch.zeros( + batch_size, + self.conv1d.weight.shape[0], + self.d_conv, + device=self.conv1d.weight.device, + dtype=self.conv1d.weight.dtype, + ) + ssm_state = torch.zeros( + batch_size, + self.nheads_local, + self.headdim, + self.d_state, + device=self.in_proj.weight.device, + dtype=self.in_proj.weight.dtype, + ) + inference_params.key_value_memory_dict[self.layer_number] = (conv_state, ssm_state) + else: + conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_number] + # TODO: What if batch size changes between generation, and we reuse the same states? + if initialize_states: + conv_state.zero_() + ssm_state.zero_() + return conv_state, ssm_state diff --git a/megatron/core/ssm/triton_cache_manager.py b/megatron/core/ssm/triton_cache_manager.py new file mode 100644 index 0000000..43b5b34 --- /dev/null +++ b/megatron/core/ssm/triton_cache_manager.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import os +import socket +from pathlib import Path + +import torch + +try: + from triton.runtime.cache import FileCacheManager +except ImportError: + raise ImportError("triton is required by the Mamba model but cannot be imported") + + +def get_rank(): + return torch.distributed.get_rank() + + +def default_cache_dir(): + return os.path.join(Path.home(), ".triton", "cache") + + +class ParallelFileCacheManager(FileCacheManager): + + # See https://github.com/triton-lang/triton/blob/main/python/triton/runtime/cache.py + + # When running Triton with multiple ranks, they each create their own cache manager. Their input + # keys to that class are mostly (but not entirely) the same across ranks, which leads many ranks + # to write to the same 'key' directories in the cache dir at the same time during compilation, + # leading to conflicts. This works around that by making each cache dir be rank specific by + # adding "rank__" to the cache directory. + + def __init__(self, key): + self.key = key + self.lock_path = None + # create cache directory if it doesn't exist + self.cache_dir = os.environ.get('TRITON_CACHE_DIR', default_cache_dir()) + self.cache_dir = os.path.join( + self.cache_dir, "rank_{}_{}".format(socket.gethostname(), os.getpid()) + ) + if self.cache_dir: + self.cache_dir = os.path.join(self.cache_dir, self.key) + self.lock_path = os.path.join(self.cache_dir, "lock") + os.makedirs(self.cache_dir, exist_ok=True) diff --git a/megatron/core/tensor_parallel/__init__.py b/megatron/core/tensor_parallel/__init__.py new file mode 100644 index 0000000..e7da888 --- /dev/null +++ b/megatron/core/tensor_parallel/__init__.py @@ -0,0 +1,75 @@ +from .cross_entropy import vocab_parallel_cross_entropy +from .data import broadcast_data +from .layers import ( + ColumnParallelLinear, + RowParallelLinear, + VocabParallelEmbedding, + copy_tensor_model_parallel_attributes, + linear_with_grad_accumulation_and_async_allreduce, + param_is_not_tensor_parallel_duplicate, + set_defaults_if_not_set_tensor_model_parallel_attributes, + set_tensor_model_parallel_attributes, +) +from .mappings import ( + all_gather_last_dim_from_tensor_parallel_region, + all_to_all, + all_to_all_hp2sp, + all_to_all_sp2hp, + copy_to_tensor_model_parallel_region, + gather_from_sequence_parallel_region, + gather_from_sequence_parallel_region_to_moe, + gather_from_tensor_model_parallel_region, + reduce_from_tensor_model_parallel_region, + reduce_scatter_last_dim_to_tensor_parallel_region, + reduce_scatter_to_sequence_parallel_region, + reduce_scatter_to_sequence_parallel_region_from_moe, + scatter_to_sequence_parallel_region, + scatter_to_tensor_model_parallel_region, +) +from .random import ( + checkpoint, + get_cuda_rng_tracker, + get_data_parallel_rng_tracker_name, + get_expert_parallel_rng_tracker_name, + model_parallel_cuda_manual_seed, +) +from .utils import ( + gather_split_1d_tensor, + split_tensor_along_last_dim, + split_tensor_into_1d_equal_chunks, +) + +__all__ = [ + # cross_entropy.py + "vocab_parallel_cross_entropy", + # data.py + "broadcast_data", + # layers.py + "ColumnParallelLinear", + "RowParallelLinear", + "VocabParallelEmbedding", + "set_tensor_model_parallel_attributes", + "set_defaults_if_not_set_tensor_model_parallel_attributes", + "copy_tensor_model_parallel_attributes", + "param_is_not_tensor_parallel_duplicate", + "linear_with_grad_accumulation_and_async_allreduce", + # mappings.py + "copy_to_tensor_model_parallel_region", + "gather_from_tensor_model_parallel_region", + "gather_from_sequence_parallel_region", + "reduce_from_tensor_model_parallel_region", + "reduce_scatter_to_sequence_parallel_region", + "scatter_to_tensor_model_parallel_region", + "scatter_to_sequence_parallel_region", + # random.py + "checkpoint", + "get_cuda_rng_tracker", + "model_parallel_cuda_manual_seed", + "get_expert_parallel_rng_tracker_name", + # utils.py + "split_tensor_along_last_dim", + "split_tensor_into_1d_equal_chunks", + "gather_split_1d_tensor", + "gather_from_sequence_parallel_region_to_moe", + "reduce_scatter_to_sequence_parallel_region_from_moe", +] diff --git a/megatron/core/tensor_parallel/cross_entropy.py b/megatron/core/tensor_parallel/cross_entropy.py new file mode 100644 index 0000000..45fa075 --- /dev/null +++ b/megatron/core/tensor_parallel/cross_entropy.py @@ -0,0 +1,232 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from typing import Tuple + +import torch + +from megatron.core.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) + +from .utils import VocabUtility + + +class VocabParallelCrossEntropy: + """ + Computes the Cross Entropy Loss splitting the Vocab size across tensor parallel + ranks. This implementation is used in both fused and unfused cross entropy implementations + """ + + @staticmethod + def calculate_logits_max( + vocab_parallel_logits: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + vocab_parallel_logits = vocab_parallel_logits.float() + # Maximum value along vocab dimension across all GPUs. + logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] + + return vocab_parallel_logits, logits_max + + @staticmethod + def calculate_predicted_logits( + vocab_parallel_logits: torch.Tensor, + target: torch.Tensor, + logits_max: torch.Tensor, + vocab_start_index: int, + vocab_end_index: int, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + # In-place subtraction reduces memory pressure. + vocab_parallel_logits -= logits_max.unsqueeze(dim=-1) + + # Create a mask of valid vocab ids (1 means it needs to be masked). + target_mask = (target < vocab_start_index) | (target >= vocab_end_index) + masked_target = target.clone() - vocab_start_index + masked_target[target_mask] = 0 + + # Get predicted-logits = logits[target]. + # For Simplicity, we convert logits to a 2-D tensor with size + # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. + partition_vocab_size = vocab_parallel_logits.size()[-1] + logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) + masked_target_1d = masked_target.view(-1) + arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device) + predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] + predicted_logits_1d = predicted_logits_1d.clone().contiguous() + predicted_logits = predicted_logits_1d.view_as(target) + predicted_logits[target_mask] = 0.0 + + exp_logits = vocab_parallel_logits + torch.exp(vocab_parallel_logits, out=exp_logits) + sum_exp_logits = exp_logits.sum(dim=-1) + + return target_mask, masked_target_1d, predicted_logits, sum_exp_logits, exp_logits + + @staticmethod + def calculate_cross_entropy_loss( + exp_logits: torch.Tensor, predicted_logits: torch.Tensor, sum_exp_logits: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + + # Loss = log(sum(exp(logits))) - predicted-logit. + loss = torch.log(sum_exp_logits) - predicted_logits + + # Normalize and optionally smooth logits + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + + return exp_logits, loss + + @staticmethod + def prepare_gradient_calculation_operands( + softmax: torch.Tensor, + target_mask: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + # All the inputs have softmax as thier gradient. + grad_input = softmax + # For simplicity, work with the 2D gradient. + partition_vocab_size = softmax.size()[-1] + grad_2d = grad_input.view(-1, partition_vocab_size) + + # Add the gradient from matching classes. + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + + softmax_update = 1.0 - target_mask.view(-1).float() + + return grad_2d, arange_1d, softmax_update, grad_input + + @staticmethod + def calculate_gradients( + grad_2d: torch.Tensor, + arange_1d: torch.Tensor, + masked_target_1d: torch.Tensor, + softmax_update: torch.Tensor, + grad_input: torch.Tensor, + grad_output: torch.Tensor, + ) -> torch.Tensor: + + grad_2d[arange_1d, masked_target_1d] -= softmax_update + + # Finally elementwise multiplication with the output gradients. + grad_input.mul_(grad_output.unsqueeze(dim=-1)) + + return grad_input + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + @staticmethod + def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): + + vocab_parallel_logits, logits_max = VocabParallelCrossEntropy.calculate_logits_max( + vocab_parallel_logits + ) + torch.distributed.all_reduce( + logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group() + ) + + # Get the partition's vocab indices + get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size + partition_vocab_size = vocab_parallel_logits.size()[-1] + rank = get_tensor_model_parallel_rank() + world_size = get_tensor_model_parallel_world_size() + vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size) + + ( + target_mask, + masked_target_1d, + predicted_logits, + sum_exp_logits, + exp_logits, + ) = VocabParallelCrossEntropy.calculate_predicted_logits( + vocab_parallel_logits, target, logits_max, vocab_start_index, vocab_end_index + ) + + # All reduce is needed to get the chunks from other GPUs. + torch.distributed.all_reduce( + predicted_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_tensor_model_parallel_group(), + ) + + torch.distributed.all_reduce( + sum_exp_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_tensor_model_parallel_group(), + ) + + exp_logits, loss = VocabParallelCrossEntropy.calculate_cross_entropy_loss( + exp_logits, predicted_logits, sum_exp_logits + ) + + vocab_size = exp_logits.size(-1) + if label_smoothing > 0: + """ + We'd like to assign 1 / (K - 1) probability mass to every index that is not the ground truth. + = (1 - alpha) * y_gt + alpha * mean(y_{i for i != gt}) + = (1 - alpha) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i + = ((K - 1) * (1 - alpha) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i + = (K * (1 - alpha) - 1) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i} y_i + = (1 - (alpha * K) / (K - 1)) * y_gt + ( (alpha * K) / (K - 1) ) * \sum_{i} y_i / K + From: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/losses/smoothed_cross_entropy.py + """ + assert 1.0 > label_smoothing > 0.0 + smoothing = label_smoothing * vocab_size / (vocab_size - 1) + + # Exp logits at this point are normalized probabilities. So we can just take the log to get log-probs. + log_probs = torch.log(exp_logits) + mean_log_probs = log_probs.mean(dim=-1) + loss = (1.0 - smoothing) * loss - smoothing * mean_log_probs + + ctx.label_smoothing, ctx.vocab_size = label_smoothing, vocab_size + + # Store softmax, target-mask and masked-target for backward pass. + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) + + return loss + + @staticmethod + def backward(ctx, grad_output): + + # Retreive tensors from the forward path. + softmax, target_mask, masked_target_1d = ctx.saved_tensors + label_smoothing, vocab_size = ctx.label_smoothing, ctx.vocab_size + + ( + grad_2d, + arange_1d, + softmax_update, + grad_input, + ) = VocabParallelCrossEntropy.prepare_gradient_calculation_operands(softmax, target_mask) + + if label_smoothing > 0: + smoothing = label_smoothing * vocab_size / (vocab_size - 1) + grad_2d[arange_1d, masked_target_1d] -= (1.0 - smoothing) * softmax_update + average_grad = 1 / vocab_size + grad_2d[arange_1d, :] -= smoothing * average_grad + + # Finally elementwise multiplication with the output gradients. + grad_input.mul_(grad_output.unsqueeze(dim=-1)) + else: + grad_input = VocabParallelCrossEntropy.calculate_gradients( + grad_2d, arange_1d, masked_target_1d, softmax_update, grad_input, grad_output + ) + + return grad_input, None, None + + +def vocab_parallel_cross_entropy(vocab_parallel_logits, target, label_smoothing=0.0): + """ + Performs cross entropy loss when logits are split across tensor parallel ranks + + Args: + vocab_parallel_logits: logits split across tensor parallel ranks + dimension is [sequence_length, batch_size, vocab_size/num_parallel_ranks] + + target: correct vocab ids of dimseion [sequence_length, micro_batch_size] + + lobal_smoothing: smoothing factor, must be in range [0.0, 1.0) + default is no smoothing (=0.0) + """ + return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target, label_smoothing) diff --git a/megatron/core/tensor_parallel/data.py b/megatron/core/tensor_parallel/data.py new file mode 100644 index 0000000..01dd90d --- /dev/null +++ b/megatron/core/tensor_parallel/data.py @@ -0,0 +1,104 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch + +from megatron.core.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_src_rank, +) + +_MAX_DATA_DIM = 5 + + +def _check_data_types(keys, data, target_dtype): + """Check that all the keys have the same target data type.""" + for key in keys: + assert data[key].dtype == target_dtype, ( + '{} has data type {} which ' + 'is different than {}'.format(key, data[key].dtype, target_dtype) + ) + + +def _build_key_size_numel_dictionaries(keys, data): + """Build the size on rank 0 and broadcast.""" + max_dim = _MAX_DATA_DIM + sizes = [0 for _ in range(max_dim) for _ in keys] + + # Pack the sizes on rank zero. + if get_tensor_model_parallel_rank() == 0: + offset = 0 + for key in keys: + assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM' + size = data[key].size() + for i, s in enumerate(size): + sizes[i + offset] = s + offset += max_dim + + # Move to GPU and broadcast. + sizes_cuda = torch.tensor(sizes, dtype=torch.long, device='cuda') + torch.distributed.broadcast( + sizes_cuda, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group() + ) + + # Move back to cpu and unpack. + sizes_cpu = sizes_cuda.cpu() + key_size = {} + key_numel = {} + total_numel = 0 + offset = 0 + for key in keys: + i = 0 + size = [] + numel = 1 + while sizes_cpu[offset + i] > 0: + this_size = sizes_cpu[offset + i] + size.append(this_size) + numel *= this_size + i += 1 + key_size[key] = size + key_numel[key] = numel + total_numel += numel + offset += max_dim + + return key_size, key_numel, total_numel + + +def broadcast_data(keys, data, datatype): + """Broadcast data from rank zero of each model parallel group to the + members of the same model parallel group. + + Args: + keys: list of keys in the data disctionary to be broadcasted + data: data dictionary of string keys and cpu tensor values. + datatype: torch data type of all tensors in data associated + with keys. + """ + # Build (key, size) and (key, number of elements) dictionaries along + # with the total number of elements on all ranks. + key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data) + + # Pack on rank zero. + if get_tensor_model_parallel_rank() == 0: + # Check that all keys have the same data type. + _check_data_types(keys, data, datatype) + # Flatten the data associated with the keys + flatten_data = torch.cat([data[key].contiguous().view(-1) for key in keys], dim=0).cuda() + else: + flatten_data = torch.empty(total_numel, device=torch.cuda.current_device(), dtype=datatype) + + # Broadcast + torch.distributed.broadcast( + flatten_data, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group() + ) + + # Unpack + output = {} + offset = 0 + for key in keys: + size = key_size[key] + numel = key_numel[key] + output[key] = flatten_data.narrow(0, offset, numel).view(size) + offset += numel + + return output diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py new file mode 100644 index 0000000..d644eb8 --- /dev/null +++ b/megatron/core/tensor_parallel/layers.py @@ -0,0 +1,1147 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + +import io +import math +import os +import warnings +from typing import Any, Callable, List, Optional, Tuple + +import torch +import torch.nn.functional as F +import torch.nn.init as init +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.nn.parameter import Parameter + +from megatron.core.model_parallel_config import ModelParallelConfig +from megatron.core.parallel_state import ( + get_global_memory_buffer, + get_tensor_and_expert_parallel_rank, + get_tensor_and_expert_parallel_world_size, + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) + +from ..dist_checkpointing.mapping import ShardedStateDict +from ..transformer.utils import make_sharded_tensors_for_checkpoint +from ..utils import make_tp_sharded_tensor_for_checkpoint, prepare_input_tensors_for_wgrad_compute +from .mappings import ( + copy_to_tensor_model_parallel_region, + gather_from_sequence_parallel_region, + gather_from_tensor_model_parallel_region, + reduce_from_tensor_model_parallel_region, + reduce_scatter_to_sequence_parallel_region, + scatter_to_tensor_model_parallel_region, +) +from .random import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name +from .utils import VocabUtility, divide, split_tensor_along_last_dim + +_grad_accum_fusion_available = True +try: + import fused_weight_gradient_mlp_cuda +except ImportError: + _grad_accum_fusion_available = False + +_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = { + 'tensor_model_parallel': False, + 'partition_dim': -1, + 'partition_stride': 1, +} + + +def param_is_not_tensor_parallel_duplicate(param): + return (hasattr(param, 'tensor_model_parallel') and param.tensor_model_parallel) or ( + get_tensor_model_parallel_rank() == 0 + ) + + +def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride): + # Make sure the attributes are not set. + for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS: + assert not hasattr(tensor, attribute) + # Set the attributes. + setattr(tensor, 'tensor_model_parallel', is_parallel) + setattr(tensor, 'partition_dim', dim) + setattr(tensor, 'partition_stride', stride) + + +def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor): + def maybe_set(attribute, value): + if not hasattr(tensor, attribute): + setattr(tensor, attribute, value) + + for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS: + maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute]) + + +def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor): + def maybe_copy(attribute): + if hasattr(source_tensor, attribute): + setattr(destination_tensor, attribute, getattr(source_tensor, attribute)) + + for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS: + maybe_copy(attribute) + + +def _initialize_affine_weight_gpu( + weight, init_method, partition_dim, stride=1, expert_parallel=False +): + """Initialize affine weight for model parallel on GPU.""" + + set_tensor_model_parallel_attributes( + tensor=weight, is_parallel=True, dim=partition_dim, stride=stride + ) + + if not expert_parallel: + with get_cuda_rng_tracker().fork(): + init_method(weight) + else: + with get_cuda_rng_tracker().fork(get_expert_parallel_rng_tracker_name()): + init_method(weight) + + +def _initialize_affine_weight_cpu( + weight, + output_size, + input_size, + per_partition_size, + partition_dim, + init_method, + stride=1, + return_master_weight=False, + *, + params_dtype=torch.float32, + rank=None, + world_size=None, +): + """Initialize affine weight for model parallel. + + Build the master weight on all processes and scatter + the relevant chunk.""" + + set_tensor_model_parallel_attributes( + tensor=weight, is_parallel=True, dim=partition_dim, stride=stride + ) + + # Initialize master weight + master_weight = torch.empty(output_size, input_size, dtype=torch.float, requires_grad=False) + init_method(master_weight) + master_weight = master_weight.to(dtype=params_dtype) + + # Split and copy + per_partition_per_stride_size = divide(per_partition_size, stride) + weight_list = torch.split(master_weight, per_partition_per_stride_size, dim=partition_dim) + if rank is None: + rank = get_tensor_model_parallel_rank() + world_size = get_tensor_model_parallel_world_size() + my_weight_list = weight_list[rank::world_size] + + with torch.no_grad(): + # all tensors must live on the same device + cpu_weight = torch.cat(my_weight_list, dim=partition_dim).to_dense() + weight.data.copy_(cpu_weight) + if return_master_weight: + return master_weight + return None + + +class VocabParallelEmbedding(torch.nn.Module): + """Embedding parallelized in the vocabulary dimension. + + This is mainly adapted from torch.nn.Embedding and all the default + values are kept. + + Args: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + reduce_scatter_embeddings: Decides whether to perform ReduceScatter after embedding lookup + + Keyword Args: + config: A megatron.core.ModelParallelConfig object + """ + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + *, + init_method: Callable, + reduce_scatter_embeddings: bool = False, + config: ModelParallelConfig, + ): + super(VocabParallelEmbedding, self).__init__() + # Keep the input dimensions. + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.reduce_scatter_embeddings = reduce_scatter_embeddings + self.tensor_model_parallel_size = get_tensor_model_parallel_world_size() + # Divide the weight matrix along the vocaburaly dimension. + ( + self.vocab_start_index, + self.vocab_end_index, + ) = VocabUtility.vocab_range_from_global_vocab_size( + self.num_embeddings, get_tensor_model_parallel_rank(), self.tensor_model_parallel_size + ) + self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index + self.deterministic_mode = config.deterministic_mode + + # Allocate weights and initialize. + if config.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.num_embeddings_per_partition, self.embedding_dim, dtype=config.params_dtype + ) + ) + if config.perform_initialization: + _initialize_affine_weight_cpu( + self.weight, + self.num_embeddings, + self.embedding_dim, + self.num_embeddings_per_partition, + 0, + init_method, + params_dtype=config.params_dtype, + ) + else: + self.weight = Parameter( + torch.empty( + self.num_embeddings_per_partition, + self.embedding_dim, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1) + + def forward(self, input_): + if self.tensor_model_parallel_size > 1: + # Build the mask. + input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index) + # Mask the input. + masked_input = input_.clone() - self.vocab_start_index + masked_input[input_mask] = 0 + else: + masked_input = input_ + # Get the embeddings. + if self.deterministic_mode: + output_parallel = self.weight[masked_input] + else: + # F.embedding currently has a non-deterministic backward function + output_parallel = F.embedding(masked_input, self.weight) + # Mask the output embedding. + if self.tensor_model_parallel_size > 1: + output_parallel[input_mask, :] = 0.0 + + if self.reduce_scatter_embeddings: + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + output_parallel = output_parallel.transpose(0, 1).contiguous() + output = reduce_scatter_to_sequence_parallel_region(output_parallel) + else: + # Reduce across all the model parallel GPUs. + output = reduce_from_tensor_model_parallel_region(output_parallel) + return output + + def sharded_state_dict( + self, + prefix: str = '', + sharded_offsets: Tuple[Tuple[int, int, int]] = (), + metadata: Optional[dict] = None, + ) -> ShardedStateDict: + """Non-default implementation for embeddings due to `allow_shape_mismatch` param""" + state_dict = self.state_dict(prefix='', keep_vars=True) + + weight_prefix = f'{prefix}weight' + return { + weight_prefix: make_tp_sharded_tensor_for_checkpoint( + tensor=state_dict['weight'], + key=weight_prefix, + allow_shape_mismatch=True, + prepend_offsets=sharded_offsets, + ) + } + + +class LinearWithFrozenWeight(torch.autograd.Function): + """Linear operator that does not calculate gradient for weight. + This op and LinearWithGradAccumulationAndAsyncCommunication performs + mathematically-identical forward and DGRAD. + + Conceptually this op is the same as torch.nn.functional.linear with + weight.requires_grad==False, but in experiments they are not identical + mathematically.""" + + @staticmethod + @custom_fwd + def forward( + ctx, + input, + weight, + bias, + allreduce_dgrad, + ): + ctx.save_for_backward(weight) + ctx.allreduce_dgrad = allreduce_dgrad + output = torch.matmul(input, weight.t()) + if bias is not None: + output = output + bias + return output + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + (weight,) = ctx.saved_tensors + grad_input = grad_output.matmul(weight) + + if ctx.allreduce_dgrad: + # All-reduce. Note: here async and sync are effectively the same. + torch.distributed.all_reduce(grad_input, group=get_tensor_model_parallel_group()) + + return grad_input, None, None, None + + +def linear_with_frozen_weight( + input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + gradient_accumulation_fusion: bool, + async_grad_allreduce: bool, + sequence_parallel: bool, + grad_output_buffer: Optional[List[torch.Tensor]] = None, + wgrad_deferral_limit: Optional[int] = None, + allreduce_dgrad: bool = None, +) -> torch.Tensor: + """Linear layer execution with weight.requires_grad == False. + + This function handles linear layers with weight frozen (untrainable). + In the forward, it only saves weight and does not save input activations. + In the backward, it does not perform weight gradient calculation, or + weight gradient allreduce. + + Args: + + input (torch.Tensor required): input like torch.nn.functional.linear + + weight (torch.Tensor required): weight like torch.nn.functional.linear + + bias (torch.Tensor optional): bias like torch.nn.functional.linear + + gradient_accumulation_fusion (bool required): dummy argument, used to + keep the API unified between all forward implementation functions. + + async_grad_allreduce (bool required): dummy argument, used to + keep the API unified between all forward implementation functions. + + sequence_parallel (bool required): Indicates that sequence + parallelism is used and thus in the forward pass the input is + all gathered, and the backward pass the input gradients are + reduce scattered. + + grad_output_buffer (List[torch.Tensor] optional): dummy argument, used to + keep the API unified between all forward implementation functions. + + wgrad_deferral_limit (int optional): dummy argument, used to + keep the API unified between all forward implementation functions. + + allreduce_dgrad (bool): Do the allreduce of input gradients. + Here, async and sync allreduce are the same. If sequence_parallel is + True, this must be False, as no all reduce is performed. + + """ + + assert grad_output_buffer is None, ( + "grad_output_buffer kwarg is only supported with " + "linear_with_grad_accumulation_and_async_allreduce" + ) + + assert wgrad_deferral_limit is None, ( + "This arg is only supported with " "linear_with_grad_accumulation_and_async_allreduce" + ) + + if sequence_parallel: + input = gather_from_sequence_parallel_region(input, tensor_parallel_output_grad=True) + else: + input = input + + if allreduce_dgrad is None: + warnings.warn( + "async_grad_allreduce is deprecated and will be removed in a future release. use allreduce_dgrad instead." + ) + allreduce_dgrad = async_grad_allreduce + + args = [ + input, + weight, + bias, + allreduce_dgrad, + ] + + return LinearWithFrozenWeight.apply(*args) + + +class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function): + """See linear_with_grad_accumulation_and_async_allreduce""" + + @staticmethod + @custom_fwd + def forward( + ctx, + input, + weight, + bias, + gradient_accumulation_fusion, + allreduce_dgrad, + sequence_parallel, + grad_output_buffer, + wgrad_deferral_limit, + ): + ctx.save_for_backward(input, weight) + ctx.use_bias = bias is not None + ctx.gradient_accumulation_fusion = gradient_accumulation_fusion + ctx.allreduce_dgrad = allreduce_dgrad + ctx.sequence_parallel = sequence_parallel + ctx.wgrad_deferral_limit = wgrad_deferral_limit + ctx.grad_output_buffer = grad_output_buffer + + if sequence_parallel: + world_size = get_tensor_model_parallel_world_size() + dim_size = list(input.size()) + dim_size[0] = dim_size[0] * world_size + + all_gather_buffer = get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu") + torch.distributed._all_gather_base( + all_gather_buffer, input, group=get_tensor_model_parallel_group() + ) + total_input = all_gather_buffer + else: + total_input = input + + output = torch.matmul(total_input, weight.t()) + if bias is not None: + output = output + bias + return output + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + input, weight = ctx.saved_tensors + use_bias = ctx.use_bias + grad_output_buffer = ctx.grad_output_buffer + wgrad_deferral_limit = ctx.wgrad_deferral_limit + + wgrad_compute = True + if grad_output_buffer is not None: + if wgrad_deferral_limit == 0 or len(grad_output_buffer) < wgrad_deferral_limit: + grad_output_buffer.append(grad_output) + wgrad_compute = False + + if wgrad_compute: + if ctx.sequence_parallel: + world_size = get_tensor_model_parallel_world_size() + dim_size = list(input.size()) + dim_size[0] = dim_size[0] * world_size + + all_gather_buffer = get_global_memory_buffer().get_tensor( + dim_size, input.dtype, "mpu" + ) + handle = torch.distributed._all_gather_base( + all_gather_buffer, input, group=get_tensor_model_parallel_group(), async_op=True + ) + + # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the + # gather is scheduled before the input gradient computation + total_input = all_gather_buffer + else: + total_input = input + grad_input = grad_output.matmul(weight) + + if ctx.sequence_parallel and wgrad_compute: + handle.wait() + + if wgrad_compute: + grad_output, total_input = prepare_input_tensors_for_wgrad_compute( + grad_output, total_input + ) + + if ctx.allreduce_dgrad: + # Asynchronous all-reduce + handle = torch.distributed.all_reduce( + grad_input, group=get_tensor_model_parallel_group(), async_op=True + ) + # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the + # all-reduce is scheduled before the weight gradient computation + + if ctx.sequence_parallel: + assert not ctx.allreduce_dgrad + dim_size = list(input.size()) + sub_grad_input = torch.empty( + dim_size, dtype=input.dtype, device=torch.cuda.current_device(), requires_grad=False + ) + # reduce_scatter + handle = torch.distributed._reduce_scatter_base( + sub_grad_input, grad_input, group=get_tensor_model_parallel_group(), async_op=True + ) + # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the + # reduce scatter is scheduled before the weight gradient computation + + if ctx.gradient_accumulation_fusion: + if wgrad_compute: + if weight.main_grad.dtype == torch.float32: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32( + total_input, grad_output, weight.main_grad + ) + elif weight.main_grad.dtype in (torch.float16, torch.bfloat16): + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16( + total_input, grad_output, weight.main_grad + ) + else: + raise RuntimeError("Unsupported gradient type for gradient accumulation fusion") + + if hasattr(weight, 'grad_added_to_main_grad'): + # When overlap_grad_reduce is True, need to ensure that backward hooks + # are all run on the main backprop thread to prevent deadlocks. Setup + # dummy grad_weight tensor to prevent backward hooks from being run + # in a background thread. + if getattr(weight, 'zero_out_wgrad', False): + grad_weight = torch.zeros( + weight.main_grad.shape, + dtype=input.dtype, + device=torch.cuda.current_device(), + requires_grad=False, + ) + else: + grad_weight = torch.empty( + weight.main_grad.shape, + dtype=input.dtype, + device=torch.cuda.current_device(), + requires_grad=False, + ) + weight.grad_added_to_main_grad = True + else: + grad_weight = None + else: + grad_weight = grad_output.t().matmul(total_input) + grad_bias = grad_output.sum(dim=0) if use_bias else None + + if ctx.sequence_parallel: + handle.wait() + # Need to return None's as gradient has to flow for all the input arguments + # provided during forward + return sub_grad_input, grad_weight, grad_bias, None, None, None, None, None + + if ctx.allreduce_dgrad: + handle.wait() + + return grad_input, grad_weight, grad_bias, None, None, None, None, None + + +def linear_with_grad_accumulation_and_async_allreduce( + input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + gradient_accumulation_fusion: bool, + async_grad_allreduce: bool, + sequence_parallel: bool, + grad_output_buffer: Optional[List[torch.Tensor]] = None, + wgrad_deferral_limit: Optional[int] = 0, + allreduce_dgrad: bool = None, +) -> torch.Tensor: + """Linear layer execution with asynchronous communication and + gradient accumulation fusion in backprop. + + This has the option to accumulate the result of backprop + calculation into an existing gradient buffer, preventing the need + to do an additional addition kernel after the gradient + calculation. + + Additionally, the tensor parallel all reduce of the input + gradients can be done asynchronously with the calculation of + the weight gradients. + + In the case of sequence parallelism, the reduce scatter of the + input gradients is done asynchronously with the calcluation of the + weight gradients. + + Use of this module requires that the environment variable + CUDA_DEVICE_MAX_CONNECTIONS=1. There are a few collective + operations, noted in the code, that should be scheduled before + compute kernels to overlap the communication with the computation, + which is necessary for a speedup but not for correctness so that + ordering isn't imposed by the scheduler. Setting + CUDA_DEVICE_MAX_CONNECTIONS=1 forces the kernels to be scheduled + in the order they are called. + + Args: + input (torch.Tensor required): input like torch.nn.functional.linear + + weight (torch.Tensor required): weight like torch.nn.functional.linear + + bias (torch.Tensor optional): bias like torch.nn.functional.linear + + gradient_accumulation_fusion (bool required): Perform the gradient + accumulation fusion, requires the custom CUDA extension + fused_weight_gradient_mlp_cuda module. To use + gradient_accumulation_fusion you must install APEX with + --cpp_ext and --cuda_ext. For example: "pip install + --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" + " Note that the extension requires CUDA>=11. Otherwise, you + must turn off gradient accumulation fusion." + + + async_grad_allreduce (bool required): Do the allreduce of input + gradients asyncronously with the computation of weight + gradients. If sequence_parallel is True, this must be + False, as no all reduce is performed. + + + sequence_parallel (bool required): Indicates that sequence + parallelism is used and thus in the forward pass the input is + all gathered, and the backward pass the input gradients are + reduce scattered. + + grad_output_buffer (List[torch.Tensor] optional): Buffer used to save + output gradients when embedding table wgrad compute is deferred. + Defaults to None. + + wgrad_deferral_limit (int optional): Limit on the number of + micro-batches for which embedding weight gradient GEMM should be + deferred. Defaults to 0. + + allreduce_dgrad (bool): Do the allreduce of input gradients. + The allreduce is done asynchronously with the computation of weight + gradients. If sequence_parallel is True, this must be + False, as no all reduce is performed. + """ + if allreduce_dgrad is None: + warnings.warn( + "async_grad_allreduce is deprecated and will be removed in a future release. use allreduce_dgrad instead." + ) + allreduce_dgrad = async_grad_allreduce + + args = [ + input, + weight, + bias, + gradient_accumulation_fusion, + allreduce_dgrad, + sequence_parallel, + grad_output_buffer, + wgrad_deferral_limit, + ] + + if not linear_with_grad_accumulation_and_async_allreduce.warned: + if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": + if sequence_parallel: + warnings.warn( + "When using sequence parallelism it is recommended to set the " + "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for " + "maximum speedup" + ) + linear_with_grad_accumulation_and_async_allreduce.warned = True + + if allreduce_dgrad: + warnings.warn( + "When using async grad allreduce it is recommended to set the " + "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for " + "maximum speedup" + ) + linear_with_grad_accumulation_and_async_allreduce.warned = True + + return LinearWithGradAccumulationAndAsyncCommunication.apply(*args) + + +linear_with_grad_accumulation_and_async_allreduce.warned = False + + +class ColumnParallelLinear(torch.nn.Module): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + + Args: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias + gather_output: If true, call all-gather on output and make Y available to all GPUs, otherwise, every GPU will have its output which is Y_i = XA_i + init_method: method to initialize weights. Note that bias is always set to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be set to False. It returns the master weights used for initialization. + skip_bias_add: If True, do not add the bias term, instead return it to be added by the caller. This enables performance optimations where bias can be fused with other elementwise operations. + skip_weight_param_allocation: If True, weight parameter is not allocated and must be passed as a keyword argument `weight` during the forward pass. Note that this does not affect bias, which will be allocated if bias is True. Defaults to False. + embedding_activation_buffer: This buffer holds the input activations of the final embedding linear layer on the last pipeline stage when defer_embedding_wgrad_compute is enabled. + grad_output_buffer: This buffer holds the gradient outputs of the final embedding linear layer on the last pipeline stage when defer_embedding_wgrad_compute is enabled. + is_expert: If True, the layer is treated as an MoE expert layer. + config: ModelParallelConfig object + tp_comm_buffer_name: Communication buffer name is not used in non-Transformer-Engine modules. + disable_grad_reduce: If True, reduction of output gradients across tensor-parallel ranks will be disabled. Defaults to False. This feature is used by Lora Adapter in Nemo to delay and fuse reduction along with other gradients for performance optimization. + """ + + def __init__( + self, + input_size, + output_size, + *, + config: ModelParallelConfig, + init_method: Callable, + bias=True, + gather_output=False, + stride=1, + keep_master_weight_for_test=False, + skip_bias_add=False, + skip_weight_param_allocation: bool = False, + embedding_activation_buffer: Optional[List[torch.Tensor]] = None, + grad_output_buffer: Optional[List[torch.Tensor]] = None, + is_expert: bool = False, + tp_comm_buffer_name: str = None, # Not used + disable_grad_reduce: bool = False, + ): + super(ColumnParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.gather_output = gather_output + # Divide the weight matrix along the last dimension. + self.skip_bias_add = skip_bias_add + self.is_expert = is_expert + self.expert_parallel = config.expert_model_parallel_size > 1 + self.embedding_activation_buffer = embedding_activation_buffer + self.grad_output_buffer = grad_output_buffer + self.config = config + self.disable_grad_reduce = disable_grad_reduce + + self.explicit_expert_comm = self.is_expert and ( + config.tensor_model_parallel_size > 1 or self.expert_parallel + ) + if self.explicit_expert_comm and config.moe_extended_tp: + world_size = get_tensor_and_expert_parallel_world_size() + rank = get_tensor_and_expert_parallel_rank() + else: + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + + self.output_size_per_partition = divide(output_size, world_size) + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + # Initialize weight. + if not skip_weight_param_allocation: + if config.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.output_size_per_partition, self.input_size, dtype=config.params_dtype + ) + ) + if config.perform_initialization: + self.master_weight = _initialize_affine_weight_cpu( + self.weight, + self.output_size, + self.input_size, + self.output_size_per_partition, + 0, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test, + rank=rank, + world_size=world_size, + ) + else: + self.weight = Parameter( + torch.empty( + self.output_size_per_partition, + self.input_size, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu( + self.weight, + init_method, + partition_dim=0, + stride=stride, + expert_parallel=(self.is_expert and self.expert_parallel), + ) + + setattr(self.weight, 'allreduce', not (self.is_expert and self.expert_parallel)) + else: + self.weight = None + + if bias: + if config.use_cpu_initialization: + self.bias = Parameter( + torch.empty(self.output_size_per_partition, dtype=config.params_dtype) + ) + else: + self.bias = Parameter( + torch.empty( + self.output_size_per_partition, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + set_tensor_model_parallel_attributes(self.bias, True, 0, stride) + if config.perform_initialization: + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', not (self.is_expert and self.expert_parallel)) + else: + self.register_parameter('bias', None) + + self.sequence_parallel = config.sequence_parallel + if self.sequence_parallel and world_size <= 1: + warnings.warn( + f"`sequence_parallel` is set to `True`, but tensor model parallel size is {world_size}. " + f"Disabling sequence parallel." + ) + self.sequence_parallel = False + + self.allreduce_dgrad = ( + world_size > 1 and not self.sequence_parallel and not self.disable_grad_reduce + ) + + if config.gradient_accumulation_fusion and not _grad_accum_fusion_available: + raise RuntimeError( + "ColumnParallelLinear was called with gradient_accumulation_fusion set " + "to True but the custom CUDA extension fused_weight_gradient_mlp_cuda " + "module is not found. To use gradient_accumulation_fusion you must " + "install APEX with --cpp_ext and --cuda_ext. For example: " + "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" " + "Note that the extension requires CUDA>=11. Otherwise, you must turn off " + "gradient accumulation fusion." + ) + self.gradient_accumulation_fusion = config.gradient_accumulation_fusion + + if self.allreduce_dgrad and self.sequence_parallel: + raise RuntimeError( + "`allreduce_dgrad` and `sequence_parallel` cannot be enabled at the same time." + ) + + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + + # Hook adding a default empty _extra_state for state dict + self._register_load_state_dict_pre_hook( + lambda state_dict, prefix, *args, **kwargs: state_dict.setdefault( + f'{prefix}_extra_state' + ) + ) + + def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): + """Forward of ColumnParallelLinear + + Args: + input_: 3D tensor whose order of dimension is [sequence, batch, hidden] + + weight (optional): weight tensor to use, compulsory when + skip_weight_param_allocation is True. + + Returns: + - output + - bias + + """ + if weight is None: + if self.weight is None: + raise RuntimeError( + "weight was not supplied to ColumnParallelLinear forward pass " + "and skip_weight_param_allocation is True." + ) + weight = self.weight + else: + # Check the weight passed in is the correct shape + expected_shape = (self.output_size_per_partition, self.input_size) + if weight.shape != expected_shape: + raise RuntimeError( + f"supplied weight's shape is {tuple(weight.shape)}, " + f"not {expected_shape} as expected" + ) + + if self.config._cpu_offloading_context is not None: + if self.config._cpu_offloading_context.inside_context == True: + assert ( + self.config.cpu_offloading == False + ), "CPU Offloading cannot be enabled while using non-TE modules" + + bias = self.bias if not self.skip_bias_add else None + + if ( + self.allreduce_dgrad + or self.sequence_parallel + or self.explicit_expert_comm + or self.disable_grad_reduce + ): + input_parallel = input_ + else: + input_parallel = copy_to_tensor_model_parallel_region(input_) + + if self.config.defer_embedding_wgrad_compute: + if ( + self.config.wgrad_deferral_limit == 0 + or len(self.embedding_activation_buffer) < self.config.wgrad_deferral_limit + ): + self.embedding_activation_buffer.append(input_parallel) + + # Matrix multiply. + if not weight.requires_grad: + self._forward_impl = linear_with_frozen_weight + else: + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + + allreduce_dgrad = False if self.explicit_expert_comm else self.allreduce_dgrad + + output_parallel = self._forward_impl( + input=input_parallel, + weight=weight, + bias=bias, + gradient_accumulation_fusion=self.gradient_accumulation_fusion, + async_grad_allreduce=allreduce_dgrad, + sequence_parallel=False if self.explicit_expert_comm else self.sequence_parallel, + grad_output_buffer=( + self.grad_output_buffer if self.config.defer_embedding_wgrad_compute else None + ), + wgrad_deferral_limit=( + self.config.wgrad_deferral_limit + if self.config.defer_embedding_wgrad_compute + else None + ), + allreduce_dgrad=allreduce_dgrad, + ) + if self.gather_output: + # All-gather across the partitions. + assert not self.sequence_parallel + output = gather_from_tensor_model_parallel_region(output_parallel) + else: + output = output_parallel + output_bias = self.bias if self.skip_bias_add else None + return output, output_bias + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Sharding along axis 0, bias sharded""" + state_dict = self.state_dict(prefix='', keep_vars=True) + return make_sharded_tensors_for_checkpoint( + state_dict, prefix, {'weight': 0, 'bias': 0}, sharded_offsets + ) + + def set_extra_state(self, state: Any): + """Extra state is ignored""" + + def get_extra_state(self) -> None: + """Keep compatibility with TE state dict.""" + return None + + +class RowParallelLinear(torch.nn.Module): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along its first dimension and X along its second dimension. A = transpose([A_1 .. A_p]) X = [X_1, ..., X_p] + + Args: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already split across the GPUs and we do not split again. + init_method: method to initialize weights. Note that bias is always set to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be set to False. It returns the master weights used for initialization. + skip_bias_add: If True, do not add the bias term, instead return it to be added by the caller. This enables performance optimations where bias can be fused with other elementwise operations. + is_expert: If True, the layer is treated as an MoE expert layer + tp_comm_buffer_name: Communication buffer name. Not used in + non-Transformer-Engine modules. + config: ModelParallelConfig object + + """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + input_is_parallel: bool, + skip_bias_add: bool, + stride: int = 1, + keep_master_weight_for_test: bool = False, + is_expert: bool = False, + tp_comm_buffer_name: str = None, # Not used + ): + super(RowParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.input_is_parallel = input_is_parallel + self.skip_bias_add = skip_bias_add + self.config = config + self.is_expert = is_expert + self.expert_parallel = config.expert_model_parallel_size > 1 + self.gradient_accumulation_fusion = config.gradient_accumulation_fusion + self.sequence_parallel = config.sequence_parallel + if self.sequence_parallel and not self.input_is_parallel: + raise RuntimeError("To enable `sequence_parallel`, `input_is_parallel` must be `True`") + + self.explicit_expert_comm = self.is_expert and ( + config.tensor_model_parallel_size > 1 or self.expert_parallel + ) + + # Divide the weight matrix along the last dimension. + if self.explicit_expert_comm and config.moe_extended_tp: + world_size = get_tensor_and_expert_parallel_world_size() + rank = get_tensor_and_expert_parallel_rank() + else: + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + + self.input_size_per_partition = divide(input_size, world_size) + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + # Initialize weight. + if config.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.output_size, self.input_size_per_partition, dtype=config.params_dtype + ) + ) + if config.perform_initialization: + self.master_weight = _initialize_affine_weight_cpu( + self.weight, + self.output_size, + self.input_size, + self.input_size_per_partition, + 1, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test, + params_dtype=config.params_dtype, + rank=rank, + world_size=world_size, + ) + else: + self.weight = Parameter( + torch.empty( + self.output_size, + self.input_size_per_partition, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu( + self.weight, + init_method, + partition_dim=1, + stride=stride, + expert_parallel=(self.is_expert and self.expert_parallel), + ) + setattr(self.weight, 'allreduce', not (self.is_expert and self.expert_parallel)) + + if bias: + if config.use_cpu_initialization: + self.bias = Parameter(torch.empty(self.output_size, dtype=config.params_dtype)) + else: + self.bias = Parameter( + torch.empty( + self.output_size, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + + if config.perform_initialization: + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', not (self.is_expert and self.expert_parallel)) + setattr(self.bias, 'sequence_parallel', self.sequence_parallel) + else: + self.register_parameter('bias', None) + + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + + # Hook adding a default empty _extra_state for state dict + self._register_load_state_dict_pre_hook( + lambda state_dict, prefix, *args, **kwargs: state_dict.setdefault( + f'{prefix}_extra_state' + ) + ) + + def forward(self, input_): + """Forward of RowParallelLinear + + Args: + input_: 3D tensor whose order of dimension is [sequence, batch, hidden] + + Returns: + - output + - bias + """ + + if self.config._cpu_offloading_context is not None: + if self.config._cpu_offloading_context.inside_context == True: + assert ( + self.config.cpu_offloading == False + ), "CPU Offloading cannot be enabled while using non-TE modules" + + # Set up backprop all-reduce. + if self.input_is_parallel: + input_parallel = input_ + else: + assert not self.sequence_parallel + input_parallel = scatter_to_tensor_model_parallel_region(input_) + # Matrix multiply. + if not self.weight.requires_grad: + self._forward_impl = linear_with_frozen_weight + else: + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + + allreduce_dgrad = False + + output_parallel = self._forward_impl( + input=input_parallel, + weight=self.weight, + bias=None, + gradient_accumulation_fusion=self.gradient_accumulation_fusion, + async_grad_allreduce=allreduce_dgrad, + sequence_parallel=False, + grad_output_buffer=None, + allreduce_dgrad=allreduce_dgrad, + ) + + # All-reduce across all the partitions. + if self.explicit_expert_comm: + assert self.skip_bias_add + output_ = output_parallel + elif self.sequence_parallel: + output_ = reduce_scatter_to_sequence_parallel_region(output_parallel) + else: + output_ = reduce_from_tensor_model_parallel_region(output_parallel) + if not self.skip_bias_add: + output = (output_ + self.bias) if self.bias is not None else output_ + output_bias = None + else: + output = output_ + output_bias = self.bias + return output, output_bias + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Sharding along axis 1, bias not sharded""" + state_dict = self.state_dict(prefix='', keep_vars=True) + return make_sharded_tensors_for_checkpoint( + state_dict, prefix, {'weight': 1}, sharded_offsets + ) + + def set_extra_state(self, state: Any): + """Extra state is ignored""" + + def get_extra_state(self) -> None: + """Keep compatibility with TE state dict.""" + return None diff --git a/megatron/core/tensor_parallel/mappings.py b/megatron/core/tensor_parallel/mappings.py new file mode 100644 index 0000000..88e7754 --- /dev/null +++ b/megatron/core/tensor_parallel/mappings.py @@ -0,0 +1,530 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch + +from megatron.core.parallel_state import ( + get_expert_model_parallel_group, + get_global_memory_buffer, + get_tensor_and_expert_parallel_group, + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) + +from .utils import split_tensor_along_last_dim + + +def _reduce(input_): + """All-reduce the input tensor across model parallel group.""" + + # Bypass the function if we are using only 1 GPU. + if get_tensor_model_parallel_world_size() == 1: + return input_ + + # All-reduce. + torch.distributed.all_reduce(input_.contiguous(), group=get_tensor_model_parallel_group()) + + return input_ + + +def _split_along_last_dim(input_): + """Split the tensor along its last dimension and keep the + corresponding slice.""" + + world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + # Split along last dimension. + input_list = split_tensor_along_last_dim(input_, world_size) + + # Note: torch.split does not create contiguous tensors by default. + rank = get_tensor_model_parallel_rank() + output = input_list[rank].contiguous() + + return output + + +def _split_along_first_dim(input_): + """Split the tensor along its first dimension and keep the + corresponding slice.""" + + world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + # Split along first dimension. + dim_size = input_.size()[0] + assert ( + dim_size % world_size == 0 + ), "First dimension of the tensor should be divisible by tensor parallel size" + local_dim_size = dim_size // world_size + rank = get_tensor_model_parallel_rank() + dim_offset = rank * local_dim_size + + output = input_[dim_offset : dim_offset + local_dim_size].contiguous() + + return output + + +def _gather_along_last_dim(input_): + """Gather tensors and concatinate along the last dimension.""" + + world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + dim_size = list(input_.size()) + dim_size[0] = dim_size[0] * world_size + + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed.all_gather_into_tensor( + output, input_.contiguous(), group=get_tensor_model_parallel_group() + ) + tensor_list = output.chunk(world_size, dim=0) + output = torch.cat(tensor_list, dim=-1).contiguous() + + return output + + +def _reduce_scatter_along_last_dim(input_): + """Reduce-scatter tensors on the last dimension.""" + world_size = get_tensor_model_parallel_world_size() + target_shape = list(input_.size()) + target_shape[-1] = target_shape[-1] // world_size + input_ = input_.reshape(-1, input_.shape[-1]) + split_tensors = torch.split( + input_, split_size_or_sections=input_.shape[-1] // world_size, dim=1 + ) + concat_tensor = torch.cat(split_tensors, dim=0) + output = _reduce_scatter_along_first_dim(concat_tensor).reshape(target_shape) + return output + + +def _gather_along_first_dim(input_): + """Gather tensors and concatinate along the first dimension.""" + + world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + dim_size = list(input_.size()) + dim_size[0] = dim_size[0] * world_size + + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed._all_gather_base( + output, input_.contiguous(), group=get_tensor_model_parallel_group() + ) + + return output + + +def _reduce_scatter_along_first_dim(input_): + """Reduce-scatter the input tensor across model parallel group.""" + world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + dim_size = list(input_.size()) + assert ( + dim_size[0] % world_size == 0 + ), "First dimension of the tensor should be divisible by tensor parallel size" + + dim_size[0] = dim_size[0] // world_size + + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed._reduce_scatter_base( + output, input_.contiguous(), group=get_tensor_model_parallel_group() + ) + return output + + +def _gather_along_first_dim_moe(input_, use_global_buffer=False): + """Gather tensors and concatenate along the first dimension.""" + group = get_tensor_and_expert_parallel_group() + world_size = torch.distributed.get_world_size(group=group) + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + dim_size = list(input_.size()) + dim_size[0] = dim_size[0] * world_size + + if use_global_buffer: + output = get_global_memory_buffer().get_tensor(dim_size, input_.dtype, "mpu") + else: + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed._all_gather_base(output, input_.contiguous(), group=group) + + return output + + +def _reduce_scatter_along_first_dim_moe(input_, use_global_buffer=False): + """Reduce-scatter the input tensor across model parallel group.""" + group = get_tensor_and_expert_parallel_group() + world_size = torch.distributed.get_world_size(group=group) + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + dim_size = list(input_.size()) + assert dim_size[0] % world_size == 0 + dim_size[0] = dim_size[0] // world_size + + if use_global_buffer: + output = get_global_memory_buffer().get_tensor(dim_size, input_.dtype, "mpu") + else: + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed._reduce_scatter_base(output, input_.contiguous(), group=group) + return output + + +def _gather_along_first_dim_expert_parallel(input_): + """Gather tensors and concatenate along the first dimension.""" + group = get_expert_model_parallel_group() + world_size = torch.distributed.get_world_size(group=group) + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + dim_size = list(input_.size()) + dim_size[0] = dim_size[0] * world_size + + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed._all_gather_base(output, input_.contiguous(), group=group) + + return output + + +class _CopyToModelParallelRegion(torch.autograd.Function): + """Pass the input to the model parallel region.""" + + @staticmethod + def symbolic(graph, input_): + return input_ + + @staticmethod + def forward(ctx, input_): + return input_ + + @staticmethod + def backward(ctx, grad_output): + return _reduce(grad_output) + + +class _ReduceFromModelParallelRegion(torch.autograd.Function): + """All-reduce the input from the model parallel region.""" + + @staticmethod + def symbolic(graph, input_): + return _reduce(input_) + + @staticmethod + def forward(ctx, input_): + return _reduce(input_) + + @staticmethod + def backward(ctx, grad_output): + return grad_output + + +class _ScatterToModelParallelRegion(torch.autograd.Function): + """Split the input and keep only the corresponding chuck to the rank.""" + + @staticmethod + def symbolic(graph, input_): + return _split_along_last_dim(input_) + + @staticmethod + def forward(ctx, input_): + return _split_along_last_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + return _gather_along_last_dim(grad_output) + + +class _GatherFromModelParallelRegion(torch.autograd.Function): + """Gather the input from model parallel region and concatinate.""" + + @staticmethod + def symbolic(graph, input_): + return _gather_along_last_dim(input_) + + @staticmethod + def forward(ctx, input_): + return _gather_along_last_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + return _split_along_last_dim(grad_output) + + +class _ScatterToSequenceParallelRegion(torch.autograd.Function): + """Split the input and keep only the corresponding chuck to the rank.""" + + @staticmethod + def symbolic(graph, input_): + return _split_along_first_dim(input_) + + @staticmethod + def forward(ctx, input_): + return _split_along_first_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + return _gather_along_first_dim(grad_output) + + +class _GatherFromSequenceParallelRegion(torch.autograd.Function): + """Gather the input from sequence parallel region and concatinate.""" + + @staticmethod + def symbolic(graph, input_, tensor_parallel_output_grad=True): + return _gather_along_first_dim(input_) + + @staticmethod + def forward(ctx, input_, tensor_parallel_output_grad=True): + ctx.tensor_parallel_output_grad = tensor_parallel_output_grad + return _gather_along_first_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + tensor_parallel_output_grad = ctx.tensor_parallel_output_grad + + # If the computation graph after the gather operation is + # in the tensor parallel mode, output gradients need to reduce + # scattered and whereas if the computation is duplicated, + # output gradients need to be scattered. + if tensor_parallel_output_grad: + return _reduce_scatter_along_first_dim(grad_output), None + else: + return _split_along_first_dim(grad_output), None + + +class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function): + """Reduce scatter the input from the model parallel region.""" + + @staticmethod + def symbolic(graph, input_): + return _reduce_scatter_along_first_dim(input_) + + @staticmethod + def forward(ctx, input_): + return _reduce_scatter_along_first_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + return _gather_along_first_dim(grad_output) + + +class _GatherFromSequenceParallelRegionToMOE(torch.autograd.Function): + """Gather the input from model parallel region and concatenate.""" # TODO + + @staticmethod + def symbolic(graph, input_, use_global_buffer=False): + return _gather_along_first_dim_moe(input_, use_global_buffer) + + @staticmethod + def forward(ctx, input_, use_global_buffer=False): + ctx.use_global_buffer = use_global_buffer + return _gather_along_first_dim_moe(input_, use_global_buffer) + + @staticmethod + def backward(ctx, grad_output): + use_global_buffer = ctx.use_global_buffer + return _reduce_scatter_along_first_dim_moe(grad_output, use_global_buffer), None + + +class _ReduceScatterToSequenceParallelRegionFromMOE(torch.autograd.Function): + """Reduce scatter the input from the model parallel region.""" + + @staticmethod + def symbolic(graph, input_, use_global_buffer=False): + return _reduce_scatter_along_first_dim_moe(input_, use_global_buffer) + + @staticmethod + def forward(ctx, input_, use_global_buffer=False): + ctx.use_global_buffer = use_global_buffer + return _reduce_scatter_along_first_dim_moe(input_, use_global_buffer) + + @staticmethod + def backward(ctx, grad_output): + use_global_buffer = ctx.use_global_buffer + return _gather_along_first_dim_moe(grad_output, use_global_buffer), None + + +class _AllGatherFromTensorParallelRegion(torch.autograd.Function): + """Gather the input from model parallel region and concatenate.""" + + @staticmethod + def symbolic(graph, input_): + return _gather_along_last_dim(input_) + + @staticmethod + def forward(ctx, input_): + return _gather_along_last_dim(input_,) + + @staticmethod + def backward(ctx, grad_output): + return _reduce_scatter_along_last_dim(grad_output) + + +class _ReduceScatterToTensorParallelRegion(torch.autograd.Function): + """Reduce scatter the input from the model parallel region.""" + + @staticmethod + def symbolic(graph, input_): + return _reduce_scatter_along_last_dim(input_) + + @staticmethod + def forward(ctx, input_): + return _reduce_scatter_along_last_dim(input_,) + + @staticmethod + def backward(ctx, grad_output): + return _gather_along_last_dim(grad_output) + + +class _AllToAll(torch.autograd.Function): + @staticmethod + def forward(ctx, group, input, output_split_sizes, input_split_sizes): + ctx.group = group + ctx.output_split_sizes = output_split_sizes + ctx.input_split_sizes = input_split_sizes + + world_size = torch.distributed.get_world_size(group=group) + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input + + input = input.contiguous() + if output_split_sizes is None: + # Equal split (all2all) + output = torch.empty_like(input) + else: + # Unequal split (all2all-v) + output = input.new_empty( + size=[sum(output_split_sizes)] + list(input.size()[1:]), + dtype=input.dtype, + device=torch.cuda.current_device(), + ) + torch.distributed.all_to_all_single( + output, + input, + output_split_sizes=output_split_sizes, + input_split_sizes=input_split_sizes, + group=group, + ) + return output + + @staticmethod + def backward(ctx, *grad_output): + return ( + None, + _AllToAll.apply(ctx.group, *grad_output, ctx.input_split_sizes, ctx.output_split_sizes), + None, + None, + ) + + +# ----------------- +# Helper functions. +# ----------------- + + +def copy_to_tensor_model_parallel_region(input_): + return _CopyToModelParallelRegion.apply(input_) + + +def reduce_from_tensor_model_parallel_region(input_): + return _ReduceFromModelParallelRegion.apply(input_) + + +def scatter_to_tensor_model_parallel_region(input_): + return _ScatterToModelParallelRegion.apply(input_) + + +def gather_from_tensor_model_parallel_region(input_): + return _GatherFromModelParallelRegion.apply(input_) + + +def scatter_to_sequence_parallel_region(input_): + return _ScatterToSequenceParallelRegion.apply(input_) + + +def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True): + return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad) + + +def reduce_scatter_to_sequence_parallel_region(input_): + return _ReduceScatterToSequenceParallelRegion.apply(input_) + + +def gather_from_sequence_parallel_region_to_moe(input_, use_global_buffer=False): + return _GatherFromSequenceParallelRegionToMOE.apply(input_, use_global_buffer) + + +def reduce_scatter_to_sequence_parallel_region_from_moe(input_, use_global_buffer=False): + return _ReduceScatterToSequenceParallelRegionFromMOE.apply(input_, use_global_buffer) + + +def all_gather_last_dim_from_tensor_parallel_region(input_): + return _AllGatherFromTensorParallelRegion.apply(input_) + + +def reduce_scatter_last_dim_to_tensor_parallel_region(input_): + return _ReduceScatterToTensorParallelRegion.apply(input_) + + +def all_to_all(group, input_, output_split_sizes_=None, input_split_sizes_=None): + return _AllToAll.apply(group, input_, output_split_sizes_, input_split_sizes_) + + +def all_to_all_sp2hp(input_): + """ + Perform AlltoAll communication on tensor parallel group, transform the input tensor from shape [num_tokens/TP, H] to [num_tokens, H/TP]. + + Args: + input_ (torch.Tensor): The input tensor which has been distributed along the sequence dimension. + + Returns: + torch.Tensor: The output tensor with shape [num_tokens, H/TP]. + + """ + world_size = get_tensor_model_parallel_world_size() + tp_group = get_tensor_model_parallel_group() + input_ = input_.reshape(-1, input_.shape[-1]) + split_tensors = torch.split( + input_, split_size_or_sections=input_.shape[-1] // world_size, dim=1 + ) + concat_tensor = torch.cat(split_tensors, dim=0) + output = all_to_all(tp_group, concat_tensor) + return output + + +def all_to_all_hp2sp(input_): + """ + Perform AlltoAll communication on tensor parallel group, transform the input tensor from shape [num_tokens, H/TP] to [num_tokens/TP, H]. + + Args: + input_ (torch.Tensor): The input tensor which has been distributed along the hidden dimension. + + Returns: + torch.Tensor: The output tensor with shape [num_tokens/TP, H]. + """ + world_size = get_tensor_model_parallel_world_size() + input_ = input_.reshape(-1, input_.shape[-1]) + tp_group = get_tensor_model_parallel_group() + input_exchanged = all_to_all(tp_group, input_) + input_reshaped = input_exchanged.reshape(-1, input_exchanged.shape[-1]) + split_tensors = torch.split( + input_reshaped, split_size_or_sections=input_reshaped.shape[0] // world_size, dim=0 + ) + output = torch.cat(split_tensors, dim=-1) + return output diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py new file mode 100644 index 0000000..20a2720 --- /dev/null +++ b/megatron/core/tensor_parallel/random.py @@ -0,0 +1,301 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + +import contextlib +from importlib.metadata import version + +import torch +from pkg_resources import packaging +from torch import _C +from torch.cuda import _lazy_call +from torch.cuda import device as device_ctx_manager +from torch.utils.checkpoint import detach_variable + +from megatron.core.parallel_state import ( + get_data_parallel_rank, + get_expert_model_parallel_rank, + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from megatron.core.utils import safely_set_viewless_tensor_data + +from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks + +# Default name for the model parallel rng tracker. +_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' +_EXPERT_PARALLEL_RNG_TRACKER_NAME = 'expert-parallel-rng' +_DATA_PARALLEL_RNG_TRACKER_NAME = 'data-parallel-rng' + + +def _set_cuda_rng_state(new_state, device=-1): + """Sets the random number generator state of the current GPU. + + Argumentss: + new_state (torch.ByteTensor): The desired state + This function is adapted from PyTorch repo (torch.cuda.set_rng_state) + with a single change: the input state is not cloned. Cloning caused + major performance issues for +4 GPU cases. + """ + if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): + # older PyTorch + def cb(): + with device_ctx_manager(device): + _C._cuda_setRNGState(new_state) + + else: + # newer PyTorch + if device == -1: + device = torch.device('cuda') + elif isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device('cuda', device) + + def cb(): + idx = device.index + if idx is None: + idx = torch.cuda.current_device() + default_generator = torch.cuda.default_generators[idx] + default_generator.set_state(new_state) + + _lazy_call(cb) + + +def get_expert_parallel_rng_tracker_name(): + global _EXPERT_PARALLEL_RNG_TRACKER_NAME + return _EXPERT_PARALLEL_RNG_TRACKER_NAME + + +def get_data_parallel_rng_tracker_name(): + global _DATA_PARALLEL_RNG_TRACKER_NAME + return _DATA_PARALLEL_RNG_TRACKER_NAME + + +class CudaRNGStatesTracker: + """Tracker for the cuda RNG states. + + Using the `add` method, a cuda rng state is initialized based on + the input `seed` and is assigned to `name`. Later, by forking the + rng state, we can perform operations and return to our starting + cuda state. + """ + + def __init__(self): + self.reset() + + def is_initialized(self): + return self._is_initialized + + def reset(self): + """Set to the initial state (no tracker).""" + + # Track if initialized. + self._is_initialized = False + + # Map from a string name to the cuda rng state. + self.states_ = {} + + # Seeds are just for book keeping and ensure no seed is set twice. + self.seeds_ = set() + + def get_states(self): + """Get rng states. Copy the dictionary so we have direct + pointers to the states, not just a pointer to the dictionary.""" + states = {} + for name in self.states_: + states[name] = self.states_[name] + return states + + def set_states(self, states): + """Set the rng states. For efficiency purposes, we do not check + the size of seed for compatibility.""" + self._is_initialized = True + self.states_ = states + + def add(self, name, seed): + """Track the rng state.""" + self._is_initialized = True + # Check seed is not already used. + if seed in self.seeds_: + raise Exception('seed {} already exists'.format(seed)) + self.seeds_.add(seed) + # Check that state is not already defined. + if name in self.states_: + raise Exception('cuda rng state {} already exists'.format(name)) + # Get the current rng state. + orig_rng_state = torch.cuda.get_rng_state() + # Set the new state and store it. + torch.cuda.manual_seed(seed) + self.states_[name] = torch.cuda.get_rng_state() + # Reset rng state to what it was. + _set_cuda_rng_state(orig_rng_state) + + @contextlib.contextmanager + def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): + """Fork the cuda rng state, perform operations, and exit with + the original state.""" + # Check if we have added the state + if name not in self.states_: + raise Exception('cuda rng state {} is not added'.format(name)) + # Store current rng state. + orig_cuda_rng_state = torch.cuda.get_rng_state() + # Set rng state to the desired one + _set_cuda_rng_state(self.states_[name]) + # Do the stuff we wanted to do. + try: + yield + finally: + # Update the current rng state for later use. + self.states_[name] = torch.cuda.get_rng_state() + # And set the state to the original state we started with. + _set_cuda_rng_state(orig_cuda_rng_state) + + +# RNG tracker object. +_CUDA_RNG_STATE_TRACKER = None +_CUDA_RNG_STATE_TRACKER_INITIALIZED = False + + +def initialize_rng_tracker(use_te_rng_tracker: bool = False): + global _CUDA_RNG_STATE_TRACKER + global _CUDA_RNG_STATE_TRACKER_INITIALIZED + if _CUDA_RNG_STATE_TRACKER_INITIALIZED: + return + if use_te_rng_tracker: + try: + import transformer_engine.pytorch as te + + _te_version = packaging.version.Version(version("transformer-engine")) + if _te_version < packaging.version.Version("1.5.0"): + raise RuntimeError("use_te_rng_tracker requires TransformerEngine version >= 1.5") + except: + raise RuntimeError("use_te_rng_tracker requires TransformerEngine, but not installed") + if use_te_rng_tracker: + _CUDA_RNG_STATE_TRACKER = te.distributed.CudaRNGStatesTracker() + else: + _CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + _CUDA_RNG_STATE_TRACKER_INITIALIZED = True + + +def get_cuda_rng_tracker(): + """Get cuda rng tracker.""" + initialize_rng_tracker() + return _CUDA_RNG_STATE_TRACKER + + +def model_parallel_cuda_manual_seed(seed): + """Initialize model parallel cuda seed. + + This function should be called after the model parallel is + initialized. Also, no torch.cuda.manual_seed should be called + after this function. Basically, this is replacement for that + function. + Two set of RNG states are tracked: + default state: This is for data parallelism and is the same among a set of model parallel GPUs but different across different model paralle groups. This is used for example for dropout in the non-tensor-model-parallel regions. + tensor-model-parallel state: This state is different among a set of model parallel GPUs, but the same across data parallel groups. This is used for example for dropout in model parallel regions. + """ + # 2718 is just for fun and any POSITIVE value will work. + offset = seed + 2718 + tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank() + # Data parallel gets the original seed. + data_parallel_seed = seed + + initialize_rng_tracker() + _CUDA_RNG_STATE_TRACKER.reset() + # Set the default state. + torch.cuda.manual_seed(data_parallel_seed) + _CUDA_RNG_STATE_TRACKER.add(_DATA_PARALLEL_RNG_TRACKER_NAME, data_parallel_seed) + + # and model parallel state. + _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, tensor_model_parallel_seed) + + expert_parallel_seed = ( + seed + 1024 + 100 * get_expert_model_parallel_rank() + get_tensor_model_parallel_rank() + ) + _CUDA_RNG_STATE_TRACKER.add(_EXPERT_PARALLEL_RNG_TRACKER_NAME, expert_parallel_seed) + + +class CheckpointFunction(torch.autograd.Function): + """Checkpoint Function + + This function is adapted from torch.utils.checkpoint with two main changes: + 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` + 2) the states in the model parallel tracker are also properly tracked/set/reset. + """ + + @staticmethod + def forward(ctx, run_function, distribute_saved_activations, *args): + ctx.run_function = run_function + ctx.distribute_saved_activations = distribute_saved_activations + + # Copy the rng states. + ctx.fwd_cpu_rng_state = torch.get_rng_state() + ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + with torch.no_grad(): + outputs = run_function(*args) + + # Divide hidden states across model parallel group and only keep + # the chunk corresponding to the current rank. + if distribute_saved_activations: + ctx.input_0_shape = args[0].data.shape + safely_set_viewless_tensor_data( + args[0], split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True) + ) + + # Store everything. + ctx.save_for_backward(*args) + + return outputs + + @staticmethod + def backward(ctx, *args): + if not torch.autograd._is_checkpoint_valid(): + raise RuntimeError( + "Checkpointing is not compatible with .grad(), " + "please use .backward() if possible" + ) + inputs = ctx.saved_tensors + if ctx.distribute_saved_activations: + safely_set_viewless_tensor_data( + inputs[0], gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape) + ) + + # Store the current states. + bwd_cpu_rng_state = torch.get_rng_state() + bwd_cuda_rng_state = torch.cuda.get_rng_state() + bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + # Set the states to what it used to be before the forward pass. + torch.set_rng_state(ctx.fwd_cpu_rng_state) + _set_cuda_rng_state(ctx.fwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) + + # Compute the forward pass. + detached_inputs = detach_variable(inputs) + with torch.enable_grad(): + outputs = ctx.run_function(*detached_inputs) + + # Set the states back to what it was at the start of this function. + torch.set_rng_state(bwd_cpu_rng_state) + _set_cuda_rng_state(bwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) + + if isinstance(outputs, torch.Tensor): + outputs = (outputs,) + + # filter out non tensor outputs for backward pass + outputs, args = zip(*filter(lambda x: torch.is_tensor(x[0]), zip(outputs, args))) + torch.autograd.backward(outputs, args) + grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in detached_inputs) + return (None, None) + grads + + +def checkpoint(function, distribute_saved_activations, *args): + """Checkpoint a model or part of the model. + This has been directly copied from torch.utils.checkpoint.""" + return CheckpointFunction.apply(function, distribute_saved_activations, *args) diff --git a/megatron/core/tensor_parallel/utils.py b/megatron/core/tensor_parallel/utils.py new file mode 100644 index 0000000..53f0d60 --- /dev/null +++ b/megatron/core/tensor_parallel/utils.py @@ -0,0 +1,118 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from typing import List, Sequence + +import torch + +from megatron.core import parallel_state +from megatron.core.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from megatron.core.utils import divide + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, num_partitions: int, contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """ Split a tensor along its last dimension. + + Args: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = divide(tensor.size()[last_dim], num_partitions) + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False): + """ Break a tensor into equal 1D chunks across tensor parallel ranks. + + Returns a Tensor or View with this rank's portion of the data. + + Args: + tensor: The tensor to split + + Keyword Args: + new_buffer (bool): If True, returns a new Tensor. + If False, returns a view into the existing Tensor. + Default is False + + """ + partition_size = torch.numel(tensor) // parallel_state.get_tensor_model_parallel_world_size() + start_index = partition_size * parallel_state.get_tensor_model_parallel_rank() + end_index = start_index + partition_size + if new_buffer: + data = torch.empty( + partition_size, + dtype=tensor.dtype, + device=torch.cuda.current_device(), + requires_grad=False, + ) + data.copy_(tensor.view(-1)[start_index:end_index]) + else: + data = tensor.view(-1)[start_index:end_index] + return data + + +def gather_split_1d_tensor(tensor): + """ Opposite of split_tensor_into_1d_equal_chunks. Gather values from tensor + model parallel ranks. + + Returns a new Tensor with the gathered data. + + Args: + tensor: A Tensor or view of this rank's portion of the data. + """ + numel_gathered = torch.numel(tensor) * parallel_state.get_tensor_model_parallel_world_size() + gathered = torch.empty( + numel_gathered, dtype=tensor.dtype, device=torch.cuda.current_device(), requires_grad=False + ) + # TODO: This API is experimental in pytorch (as of Feb 2022) and + # this might break in future pytorch releases. We chose this API + # as opposed to torch.distributed.all_gather for efficiency reasons. + # This API calls directly NCCL all-gather versus the former does + # internal copies and can potentially cause slow down. + torch.distributed._all_gather_base( + gathered, tensor, group=parallel_state.get_tensor_model_parallel_group() + ) + return gathered + + +class VocabUtility: + """ Split the vocabulary into `world_size` chunks and return the first + and last index of the vocabulary belonging to the `rank` + partition: Note that indices in [fist, last) + + """ + + @staticmethod + def vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size: int, rank, world_size: int + ) -> Sequence[int]: + index_f = rank * per_partition_vocab_size + index_l = index_f + per_partition_vocab_size + return index_f, index_l + + @staticmethod + def vocab_range_from_global_vocab_size( + global_vocab_size: int, rank: int, world_size: int + ) -> Sequence[int]: + per_partition_vocab_size = divide(global_vocab_size, world_size) + return VocabUtility.vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size, rank, world_size + ) diff --git a/megatron/core/timers.py b/megatron/core/timers.py new file mode 100644 index 0000000..b61eb4e --- /dev/null +++ b/megatron/core/timers.py @@ -0,0 +1,398 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron timers.""" + +import time +from abc import ABC, abstractmethod +from typing import List + +import torch + + +class TimerBase(ABC): + def __init__(self, name): + self.name = name + + @abstractmethod + def start(self, barrier=False): + pass + + @abstractmethod + def stop(self, barrier=False): + pass + + @abstractmethod + def reset(self): + pass + + @abstractmethod + def elapsed(self, reset=True, barrier=False): + pass + + +class DummyTimer(TimerBase): + def __init__(self): + super().__init__('dummy timer') + + def start(self, barrier=False): + return + + def stop(self, barrier=False): + return + + def reset(self): + return + + def elapsed(self, reset=True, barrier=False): + raise Exception('dummy timer should not be used to calculate elapsed time') + + +class Timer(TimerBase): + """ + Timer class with ability to start/stop. + + Comment on using `barrier`: If this flag is passed, then all + the caller processes will wait till all reach the timing routine. + It is up to the user to make sure all the ranks in `barrier_group` + call it otherwise, it will result in a hang. + Comment on `barrier_group`: By default it is set to None which + in torch distributed land, it will result in the global communicator. + """ + + def __init__(self, name): + """Initialize Timer. + + Args: + name (str): Name of the timer. + """ + super().__init__(name) + self._elapsed = 0.0 + self._active_time = 0.0 + self._started = False + # Note that None will default to the global process group + self._barrier_group = None + self._start_time = time.time() + + def set_barrier_group(self, barrier_group): + """Sets barrier group. + + Args: + barrier_group (ProcessGroup): Torch ProcessGroup for barrier. + """ + self._barrier_group = barrier_group + + def start(self, barrier=False): + """Start the timer. + + Args: + barrier (bool, optional): Synchronizes ranks before starting. Defaults to False. + """ + assert not self._started, 'timer has already been started' + if barrier: + torch.distributed.barrier(group=self._barrier_group) + torch.cuda.synchronize() + self._start_time = time.time() + self._started = True + + def stop(self, barrier=False): + """Stop the timer. + + Args: + barrier (bool, optional): Synchronizes ranks before stopping. Defaults to False. + """ + assert self._started, 'timer is not started' + if barrier: + torch.distributed.barrier(group=self._barrier_group) + torch.cuda.synchronize() + elapsed = time.time() - self._start_time + self._elapsed += elapsed + self._active_time += elapsed + self._started = False + + def reset(self): + """Reset timer. + """ + # Don't reset _active_time + self._elapsed = 0.0 + self._started = False + + def elapsed(self, reset=True, barrier=False): + """Calculates the elapsed time and restarts timer. + + Args: + reset (bool, optional): Resets timer before restarting. Defaults to True. + barrier (bool, optional): Synchronizes ranks before stopping. Defaults to False. + + Returns: + float: Elapsed time. + """ + _started = self._started + # If the timing in progress, end it first. + if self._started: + self.stop(barrier=barrier) + # Get the elapsed time. + _elapsed = self._elapsed + # Reset the elapsed time + if reset: + self.reset() + # If timing was in progress, set it back. + if _started: + self.start(barrier=barrier) + return _elapsed + + def active_time(self): + return self._active_time + + +class Timers: + """Class for a group of Timers. + """ + + def __init__(self, log_level, log_option): + """Initialize group of timers. + + Args: + log_level (int): Log level to control what timers are enabled. + log_option (str): Setting for logging statistics over ranks for all the timers. Allowed: ['max', 'minmax', 'all']. + """ + self._log_level = log_level + allowed_log_options = set(['max', 'minmax', 'all']) + assert ( + log_option in allowed_log_options + ), 'input log option {} is invalid. It must be one of {}'.format( + log_option, allowed_log_options + ) + self._log_option = log_option + self._timers = {} + self._log_levels = {} + self._dummy_timer = DummyTimer() + self._max_log_level = 2 + + def __call__(self, name, log_level=None): + """Call timer with name and log level.""" + # If the timer has already been set, then check if the log-level + # is provided, it matches the one that the timer was created with. + if name in self._timers: + if log_level is not None: + assert log_level == self._log_levels[name], ( + 'input log level {} does not match already existing ' + 'log level {} for {} timer'.format(log_level, self._log_levels[name], name) + ) + return self._timers[name] + # If timer does not exist and no log level is provided, + # set it to the max log level which is 2. + if log_level is None: + log_level = self._max_log_level + assert ( + log_level <= self._max_log_level + ), 'log level {} is larger than max supported log level {}'.format( + log_level, self._max_log_level + ) + # Now if the input log level is larger than the one set for + # the timers class, just ignore it and return a dummy timer. + if log_level > self._log_level: + return self._dummy_timer + # Otherwise, initalize the timer and set the level. + self._timers[name] = Timer(name) + self._log_levels[name] = log_level + return self._timers[name] + + def _get_elapsed_time_all_ranks(self, names, reset, barrier): + """Returns elapsed times of timers in names. + Assumptions: + - All the ranks call this function. + - `names` are identical on all ranks. + If the above assumptions are not met, calling this function will + result in hang. + + Args: + names (List[str]): list of timer names + reset (bool): reset the timer after recording the elapsed time + barrier (bool): if set, do a global barrier before time measurments + + Returns: + torch.tensor: Tensor of size [world_size, len(names)] with times in float. + """ + + # First make sure all the callers are in sync. + if barrier: + torch.distributed.barrier() + + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + + # Here we can use gather on the rank we want to print the + # timing, however, there is no gather_base support in + # pytorch yet. It is simpler to deal with a single tensor + # and since we are only gathering a small amount of data, + # it should be ok to use all-gather instead of gather. + rank_name_to_time = torch.zeros( + (world_size, len(names)), dtype=torch.float, device=torch.cuda.current_device() + ) + for i, name in enumerate(names): + if name in self._timers: + # Here we don't need to pass the barrier flag as all + # the processes are already in sync. This avoids the + # issue of different timers having different barrier + # groups inside their class. + rank_name_to_time[rank, i] = self._timers[name].elapsed(reset=reset) + + # See the note above for why we are not using gather. + torch.distributed._all_gather_base( + rank_name_to_time.view(-1), rank_name_to_time[rank, :].view(-1) + ) + + return rank_name_to_time + + def _get_global_min_max_time(self, names, reset, barrier, normalizer): + """Report only min and max times across all ranks.""" + + rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, barrier) + name_to_min_max_time = {} + for i, name in enumerate(names): + rank_to_time = rank_name_to_time[:, i] + # filter out the ones we did not have any timings for + rank_to_time = rank_to_time[rank_to_time > 0.0] + # If the timer exists: + if rank_to_time.numel() > 0: + name_to_min_max_time[name] = ( + rank_to_time.min().item() / normalizer, + rank_to_time.max().item() / normalizer, + ) + return name_to_min_max_time + + def _get_global_min_max_time_string(self, names, reset, barrier, normalizer, max_only): + """Report strings for max/minmax times across all ranks.""" + name_to_min_max_time = self._get_global_min_max_time(names, reset, barrier, normalizer) + if not name_to_min_max_time: + return None + if max_only: + output_string = 'max time across ranks (ms):' + else: + output_string = '(min, max) time across ranks (ms):' + for name in name_to_min_max_time: + min_time, max_time = name_to_min_max_time[name] + if max_only: + output_string += '\n {}: {:.2f}'.format((name + ' ').ljust(48, '.'), max_time) + else: + output_string += '\n {}: ({:.2f}, {:.2f})'.format( + (name + ' ').ljust(48, '.'), min_time, max_time + ) + return output_string + + def _get_all_ranks_time_string(self, names, reset, barrier, normalizer): + """Report times across all ranks.""" + rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, barrier) + + output_string = 'times across ranks (ms):' + no_reported_timing = True + for i, name in enumerate(names): + not_yet_found = True + for rank in range(torch.distributed.get_world_size()): + if rank_name_to_time[rank, i] > 0: + no_reported_timing = False + if not_yet_found: + not_yet_found = False + output_string += '\n {}:'.format(name) + output_string += '\n rank {:2d}: {:.2f}'.format( + rank, rank_name_to_time[rank, i] / normalizer + ) + if no_reported_timing: + return None + return output_string + + def get_all_timers_string( + self, + names: List[str] = None, + normalizer: float = 1.0, + reset: bool = True, + barrier: bool = False, + ): + """Returns the output string with logged timer values according to configured options. + + Args: + names (List[str]): Names of the timers to log. If None, all registered timers are fetched. Defaults to None. + normalizer (float, optional): Normalizes the timer values by the factor. Defaults to 1.0. + reset (bool, optional): Whether to reset timer values after logging. Defaults to True. + barrier (bool, optional): Whether to do a global barrier before time measurments. Defaults to False. + + Raises: + Exception: Raises if log option is invalid. + + Returns: + str: Formatted string with the timer values. + """ + + if names == None: # get all registered timers + names = self._timers.keys() + + assert normalizer > 0.0 + if self._log_option in ['max', 'minmax']: + max_only = False + if self._log_option == 'max': + max_only = True + output_string = self._get_global_min_max_time_string( + names, reset, barrier, normalizer / 1000.0, max_only + ) + elif self._log_option == 'all': + output_string = self._get_all_ranks_time_string( + names, reset, barrier, normalizer / 1000.0 + ) + else: + raise Exception('unknown timing log option {}'.format(self._log_option)) + return output_string + + def log( + self, + names: List[str], + rank: int = None, + normalizer: float = 1.0, + reset: bool = True, + barrier: bool = False, + ): + """logs the timers passed in names to stdout. Example usage is to log average per step value for timer 'foo', + this function can be called with normalizer factor set to logging interval. + + Args: + names (List[str]): Names of the timers to log. + rank (int, optional): logs the timers to a specific rank. If set to None, logs to the last rank. Defaults to None. + normalizer (float, optional): Normalizes the timer values by the factor. Defaults to 1.0. + reset (bool, optional): Whether to reset timer values after logging. Defaults to True. + barrier (bool, optional): Whether to do a global barrier before time measurments. Defaults to False. + """ + + output_string = self.get_all_timers_string(names, normalizer, reset, barrier) + # If no input rank is provided, log on last rank. + if rank is None: + rank = torch.distributed.get_world_size() - 1 + if rank == torch.distributed.get_rank() and output_string is not None: + print(output_string, flush=True) + + def write( + self, + names: List[str], + writer, + iteration: int, + normalizer: float = 1.0, + reset: bool = True, + barrier: bool = False, + ): + """Write timers to a tensorboard writer. Note that we only report maximum time across ranks to tensorboard. + + Args: + names (List[str]): Names of the timers to log. + writer (SummaryWriter): Tensorboard SummaryWriter object + iteration (int): Current iteration. + normalizer (float, optional): Normalizes the timer values by the factor. Defaults to 1.0. + reset (bool, optional): Whether to reset timer values after logging. Defaults to True. + barrier (bool, optional): Whether to do a global barrier before time measurments. Defaults to False. + """ + # currently when using add_scalars, + # torch.utils.add_scalars makes each timer its own run, which + # polutes the runs list, so we just add each as a scalar + assert normalizer > 0.0 + name_to_min_max_time = self._get_global_min_max_time(names, reset, barrier, normalizer) + if writer is not None: + for name in name_to_min_max_time: + _, max_time = name_to_min_max_time[name] + writer.add_scalar(name + '-time', max_time, iteration) diff --git a/megatron/core/transformer/__init__.py b/megatron/core/transformer/__init__.py new file mode 100644 index 0000000..7cc1077 --- /dev/null +++ b/megatron/core/transformer/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from .module import MegatronModule +from .spec_utils import ModuleSpec, build_module +from .transformer_config import TransformerConfig +from .transformer_layer import TransformerLayer, TransformerLayerSubmodules diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py new file mode 100644 index 0000000..5fc3cf3 --- /dev/null +++ b/megatron/core/transformer/attention.py @@ -0,0 +1,620 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from abc import ABC, abstractmethod +from dataclasses import dataclass +from importlib.metadata import version +from typing import Union + +import torch +from pkg_resources import packaging + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.parallel_state import ( + get_data_parallel_group, + get_data_parallel_rank, + get_data_parallel_world_size, + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import divide + +from .enums import AttnMaskType +from .transformer_config import TransformerConfig + +try: + import transformer_engine + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +if HAVE_TE: + from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim +else: + SplitAlongDim = None + + +@dataclass +class SelfAttentionSubmodules: + linear_qkv: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + q_layernorm: Union[ModuleSpec, type] = None + k_layernorm: Union[ModuleSpec, type] = None + + +@dataclass +class CrossAttentionSubmodules: + linear_q: Union[ModuleSpec, type] = None + linear_kv: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + + +class Attention(MegatronModule, ABC): + """Attention layer abstract class. + + This layer only contains common modules required for the "self attn" and + "cross attn" specializations. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: Union[SelfAttentionSubmodules, CrossAttentionSubmodules], + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + ): + super().__init__(config=config) + + self.config = config + self.layer_number = layer_number + self.attn_mask_type = attn_mask_type + self.attention_type = attention_type + + # For normal attention without groups, num_query_groups == num_attention_heads, + # so these two will be the same + self.query_projection_size = self.config.kv_channels * self.config.num_attention_heads + self.kv_projection_size = self.config.kv_channels * self.config.num_query_groups + + # Per attention head and per partition values. + world_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_attention_head = divide( + self.query_projection_size, self.config.num_attention_heads + ) + self.num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size) + self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size) + + self.core_attention = build_module( + submodules.core_attention, + config=self.config, + layer_number=self.layer_number, + attn_mask_type=self.attn_mask_type, + attention_type=self.attention_type, + ) + + self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' + + # Output. + self.linear_proj = build_module( + submodules.linear_proj, + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='proj', + ) + + def _checkpointed_attention_forward( + self, + query, + key, + value, + attention_mask, + rotary_pos_emb=None, + attn_mask_type=None, + packed_seq_params=None, + ): + """Forward method with selective activation checkpointing.""" + + def custom_forward(*inputs): + query = inputs[0] + key = inputs[1] + value = inputs[2] + attention_mask = inputs[3] + attn_mask_type = inputs[5] + attn_mask_type = AttnMaskType(attn_mask_type.item()) + output_ = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + return output_ + + if attn_mask_type is None: + attn_mask_type = self.attn_mask_type + attn_mask_type = torch.tensor([attn_mask_type.value], dtype=torch.int) + hidden_states = tensor_parallel.checkpoint( + custom_forward, + False, + query, + key, + value, + attention_mask, + rotary_pos_emb, + attn_mask_type, + ) + + return hidden_states + + def _allocate_memory(self, inference_max_sequence_length, batch_size, dtype): + """Allocate memory to store kv cache during inference.""" + + return torch.empty( + inference_max_sequence_length, + batch_size, + self.num_query_groups_per_partition, + self.hidden_size_per_attention_head, + dtype=dtype, + device=torch.cuda.current_device(), + ) + + def _adjust_key_value_for_inference(self, inference_params, key, value, rotary_pos_emb): + """ + Saves the generated key and value tensors to the end of the buffers in inference_params. + Returns the full size keys and values from the provided inference_params, as well as + adjusted rotary_pos_emb. + + Returns a tuple: (key, value, rotary_pos_emb) + + """ + attn_mask_type = self.attn_mask_type + if inference_params is None: + return key, value, rotary_pos_emb, attn_mask_type + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + is_first_step = False + if self.layer_number not in inference_params.key_value_memory_dict: + inf_max_seq_length = inference_params.max_sequence_length + inf_max_batch_size = inference_params.max_batch_size + inference_key_memory = self._allocate_memory( + inf_max_seq_length, inf_max_batch_size, key.dtype + ) + inference_value_memory = self._allocate_memory( + inf_max_seq_length, inf_max_batch_size, value.dtype + ) + inference_params.key_value_memory_dict[self.layer_number] = ( + inference_key_memory, + inference_value_memory, + ) + is_first_step = True + else: + # Get the pre-allocated buffers for this layer + inference_key_memory, inference_value_memory = inference_params.key_value_memory_dict[ + self.layer_number + ] + attn_mask_type = AttnMaskType.no_mask + + batch_start = inference_params.batch_size_offset + batch_end = batch_start + key.size(1) + assert batch_end <= inference_key_memory.size(1) + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + key.size(0) + assert sequence_end <= inference_key_memory.size(0) + # Copy key and values. + inference_key_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = key + inference_value_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = value + key = inference_key_memory[:sequence_end, batch_start:batch_end, ...] + value = inference_value_memory[:sequence_end, batch_start:batch_end, ...] + + # adjust the key rotary positional embedding + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + # need to cross check this condition during inference + # if not set_inference_key_value_memory: + if not is_first_step: + # In inference, we compute one token at a time. + # Select the correct positional embedding + # (only the last token in the sequence) + q_pos_emb = q_pos_emb[sequence_end - 1 : sequence_end] + else: + # In the first forward pass of inference, + # we use the entire provided prefix. + # q_pos_emb here has the rope embeddings of the entire + # prefix + to-be-generated output so + # we slice to just the prefix. + q_pos_emb = q_pos_emb[:sequence_end, :, :, :] + k_pos_emb = k_pos_emb[:sequence_end, :, :, :] + rotary_pos_emb = (q_pos_emb, k_pos_emb) + + return key, value, rotary_pos_emb, attn_mask_type + + @abstractmethod + def get_query_key_value_tensors(self, hidden_states, key_value_states): + """ + This method needs to be implemented based on whether the derived class + is "self-attn" or "cross-attn". + """ + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. + query, key, value = self.get_query_key_value_tensors(hidden_states, key_value_states) + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb( + query, + q_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_q, + ) + key = apply_rotary_pos_emb( + key, + k_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_kv, + ) + + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. + # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + # ================= + # Output. [sq, b, h] + # ================= + + output, bias = self.linear_proj(core_attn_out) + + return output, bias + + +class SelfAttention(Attention): + """Self-attention layer class + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: SelfAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="self", + ) + + self.linear_qkv = build_module( + submodules.linear_qkv, + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear or self.config.add_qkv_bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='qkv', + ) + + if submodules.q_layernorm is not None: + self.q_layernorm = build_module( + submodules.q_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.q_layernorm = None + + if submodules.k_layernorm is not None: + self.k_layernorm = build_module( + submodules.k_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.k_layernorm = None + + def run_realtime_tests(self): + """Performs a consistency check. + + This function makes sure that tensors across devices are the same during an experiment. + This is often not guaranteed to be so because of silent hardware failures (eg, memory + corruption loading a checkpoint, network traffic corruption encountered during data transmission). + + (TODO) In the future, more tensors should be checked across the training run and + checked every X iterations. This is left for future work. Equality of tensors is probably not + required; transmitting hashes is sufficient.""" + + if not self.config.qk_layernorm: + return + + # check that all tensor parallel and data parallel ranks have the same + # Q & K layernorm parameters. + rank = get_data_parallel_rank() + inputs = torch.stack( + [ + self.q_layernorm.weight.data, + self.q_layernorm.bias.data, + self.k_layernorm.weight.data, + self.k_layernorm.bias.data, + ] + ) + dp_list = [torch.empty_like(inputs) for _ in range(get_data_parallel_world_size())] + dp_list[rank] = inputs + torch.distributed.all_gather(dp_list, inputs, group=get_data_parallel_group()) + + def _compare(srcs, tgts, names, parallelism): + assert len(srcs) == len(tgts) == len(names) + for src, tgt, name in zip(srcs, tgts, names): + assert torch.all( + src == tgt + ), f"Discrepancy between {name} in {parallelism} ranks {i} and {rank}. Diff: {torch.norm(src - tgt)}" + + for i, dp in enumerate(dp_list): + q_w, q_b, k_w, k_b = torch.unbind(dp) + _compare( + [q_w, q_b, k_w, k_b], + [ + self.q_layernorm.weight.data, + self.q_layernorm.bias.data, + self.k_layernorm.weight.data, + self.k_layernorm.bias.data, + ], + ["q_w", "q_b", "k_w", "k_b"], + "DP", + ) + + rank = get_tensor_model_parallel_rank() + tp_list = [torch.empty_like(inputs) for _ in range(get_tensor_model_parallel_world_size())] + tp_list[rank] = inputs + torch.distributed.all_gather(tp_list, inputs, group=get_tensor_model_parallel_group()) + + for i, tp in enumerate(tp_list): + q_w, q_b, k_w, k_b = torch.unbind(tp) + _compare( + [q_w, q_b, k_w, k_b], + [ + self.q_layernorm.weight.data, + self.q_layernorm.bias.data, + self.k_layernorm.weight.data, + self.k_layernorm.bias.data, + ], + ["q_w", "q_b", "k_w", "k_b"], + "TP", + ) + + def get_query_key_value_tensors(self, hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.linear_qkv(hidden_states) + + # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + new_tensor_shape = mixed_qkv.size()[:-1] + ( + self.num_query_groups_per_partition, + ( + (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2) + * self.hidden_size_per_attention_head + ), + ) + mixed_qkv = mixed_qkv.view(*new_tensor_shape) + + split_arg_list = [ + ( + self.num_attention_heads_per_partition + // self.num_query_groups_per_partition + * self.hidden_size_per_attention_head + ), + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + ] + + if SplitAlongDim is not None: + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = SplitAlongDim( + mixed_qkv, + 3, + split_arg_list, + ) + else: + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = torch.split( + mixed_qkv, + split_arg_list, + dim=3, + ) + + # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] + query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) + + if self.q_layernorm is not None: + query = self.q_layernorm(query) + + if self.k_layernorm is not None: + key = self.k_layernorm(key) + + if self.config.test_mode: + self.run_realtime_tests() + + return query, key, value + + +class CrossAttention(Attention): + """Cross-attention layer class + + Cross-attention layer takes input with size [s, b, h] and context with size + [s, b, h] and returns output of the same size. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: CrossAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="cross", + ) + + if self.config.num_query_groups != self.config.num_attention_heads: + raise ValueError( + f"Group query attention is not currently supported in cross attention." + ) + assert self.query_projection_size == self.kv_projection_size + + self.linear_q = build_module( + submodules.linear_q, + self.config.hidden_size, + self.query_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_kv = build_module( + submodules.linear_kv, + self.config.hidden_size, + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear, + skip_bias_add=False, + is_expert=False, + ) + + def get_query_key_value_tensors(self, hidden_states, key_value_states): + """ + Derives `query` tensor from `hidden_states`, and `key`/`value` tensors + from `key_value_states`. + """ + # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] + mixed_kv, _ = self.linear_kv(key_value_states) + + # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] + new_tensor_shape = mixed_kv.size()[:-1] + ( + self.num_attention_heads_per_partition, + 2 * self.hidden_size_per_attention_head, + ) + mixed_kv = mixed_kv.view(*new_tensor_shape) + + # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] + (key, value) = tensor_parallel.split_tensor_along_last_dim(mixed_kv, 2) + + # Attention head [sq, b, h] --> [sq, b, hp] + query, _ = self.linear_q(hidden_states) + + # [sq, b, hp] --> [sq, b, np, hn] + new_tensor_shape = query.size()[:-1] + ( + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + query = query.view(*new_tensor_shape) + + return query, key, value diff --git a/megatron/core/transformer/custom_layers/__init__.py b/megatron/core/transformer/custom_layers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/transformer/custom_layers/transformer_engine.py b/megatron/core/transformer/custom_layers/transformer_engine.py new file mode 100644 index 0000000..2af62bd --- /dev/null +++ b/megatron/core/transformer/custom_layers/transformer_engine.py @@ -0,0 +1,908 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import dataclasses +import os +from importlib.metadata import version +from typing import Callable + +import torch +import transformer_engine as te +from pkg_resources import packaging +from torch import Tensor + +from megatron.core import ModelParallelConfig, parallel_state +from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.parallel_state import ( + get_context_parallel_global_ranks, + get_context_parallel_group, + get_tensor_model_parallel_group, +) +from megatron.core.tensor_parallel import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name +from megatron.core.tensor_parallel.utils import divide +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint + + +def get_te_version(): + def get_te_version_str(): + if hasattr(te, '__version__'): + return str(te.__version__) + else: + return version("transformer-engine") + + return packaging.version.Version(get_te_version_str()) + + +_te_version = get_te_version() + + +def _get_extra_te_kwargs(config: TransformerConfig): + extra_transformer_engine_kwargs = { + "params_dtype": config.params_dtype, + } + + if _te_version >= packaging.version.Version("0.12.0"): + if config.use_cpu_initialization: + extra_transformer_engine_kwargs["device"] = 'cpu' + else: + extra_transformer_engine_kwargs["device"] = torch.cuda.current_device() + return extra_transformer_engine_kwargs + + +def condition_init_method(config, init_method): + return init_method if config.perform_initialization else (lambda w: None) + + +class TENorm: + """ + A conditional wrapper to initialize an instance of Transformer-Engine's + `LayerNorm` or `RMSNorm` based on input + """ + + # TODO should we ditch normalization config and just use spec to choose LayerNorm vs RMSNorm? + def __new__( + cls, + config: TransformerConfig, + hidden_size: int, + eps: float = 1e-5, + ): + if config.normalization == "LayerNorm": + instance = te.pytorch.LayerNorm( + hidden_size=hidden_size, + eps=eps, + sequence_parallel=config.sequence_parallel, + zero_centered_gamma=config.layernorm_zero_centered_gamma, + **_get_extra_te_kwargs(config), + ) + elif config.normalization == "RMSNorm": + assert hasattr( + te.pytorch, "RMSNorm" + ), "Transformer-Engine >= v0.11 required to use this feature" + instance = te.pytorch.RMSNorm( + hidden_size=hidden_size, + eps=eps, + sequence_parallel=config.sequence_parallel, + zero_centered_gamma=config.layernorm_zero_centered_gamma, + **_get_extra_te_kwargs(config), + ) + else: + raise Exception('Only LayerNorm and RMSNorm are curently supported') + + return instance + + +class TELinear(te.pytorch.Linear): + """ + Wrapper for the Transformer-Engine's `Linear` layer. + + Note that if Megatron's parallel_state has not been initialized + yet, the tp_group passed to TE will be None and must be set later + via set_tensor_parallel_group(). + """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + parallel_mode: str, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + skip_bias_add: bool, + skip_weight_param_allocation: bool, + tp_comm_buffer_name: str = None, + ): + self.config = config + + # TE returns a zero length Tensor when bias=False and + # return_bias=True, but we prefer None. So in that case we + # tell TE to not return the bias, and return None + # ourselves. This way our forward always returns two values + # and we don't have to deal with the zero length Tensor. + self.te_return_bias = skip_bias_add and bias + self.is_first_microbatch = True + self.disable_parameter_transpose_cache = self.config.disable_parameter_transpose_cache + if skip_weight_param_allocation: + raise ValueError( + 'Transformer Engine linear layers do not support skip_weight_param_allocation' + ) + + extra_kwargs = _get_extra_te_kwargs(config) + + if _te_version >= packaging.version.Version("0.8.0"): + if self.config.tp_comm_overlap: + if _te_version > packaging.version.Version("1.5.0"): + # Use old overlap flags if they were supplied instead + extra_kwargs["ub_overlap_ag"] = ( + self.config.tp_comm_overlap_ag + if hasattr(self.config, "tp_comm_overlap_ag") + else self.config.tp_comm_split_ag or self.config.tp_comm_atomic_ag + ) + extra_kwargs["ub_overlap_rs"] = ( + self.config.tp_comm_overlap_rs + if hasattr(self.config, "tp_comm_overlap_rs") + else self.config.tp_comm_split_rs or self.config.tp_comm_atomic_rs + ) + else: + extra_kwargs["ub_split_ag"] = self.config.tp_comm_split_ag + extra_kwargs["ub_atomic_gemm_ag"] = self.config.tp_comm_atomic_ag + extra_kwargs["ub_split_rs"] = self.config.tp_comm_split_rs + extra_kwargs["ub_atomic_gemm_rs"] = self.config.tp_comm_atomic_rs + if _te_version > packaging.version.Version("1.0.0"): + assert ( + tp_comm_buffer_name is not None + ), "Buffer name should be set to configure communication overlap settings" + extra_kwargs["ub_name"] = tp_comm_buffer_name + + super().__init__( + in_features=input_size, + out_features=output_size, + sequence_parallel=self.config.sequence_parallel, + fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion, + tp_group=get_tensor_model_parallel_group(check_initialized=False), + tp_size=self.config.tensor_model_parallel_size, + get_rng_state_tracker=( + get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None + ), + init_method=condition_init_method(config, init_method), + bias=bias, + return_bias=self.te_return_bias, + parallel_mode=parallel_mode, + **extra_kwargs, + ) + + def forward(self, x): + _is_first_microbatch = ( + None if self.disable_parameter_transpose_cache else self.is_first_microbatch + ) + out = super().forward(x, is_first_microbatch=_is_first_microbatch) + self.is_first_microbatch = False + + # TE only returns a tuple when return_bias is True, otherwise + # it returns a single Tensor, we always want to return two + # values regardless of the arguments. + if self.te_return_bias: + return out + return out, None + + +class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear): + """ + Wrapper for the Transformer-Engine's `LayerNormLinear` layer that combines + layernorm and linear layers + """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + config: TransformerConfig, + init_method: Callable, + gather_output: bool, + bias: bool, + skip_bias_add: bool, + is_expert: bool, + skip_weight_param_allocation: bool = False, + tp_comm_buffer_name: str = None, + ): + self.config = config + + if gather_output: + raise ValueError('Transformer Engine linear layers do not support gather_output = True') + + if is_expert: + raise ValueError('Transformer Engine linear layers do not yet support MoE') + + if skip_weight_param_allocation: + raise ValueError( + 'Transformer Engine linear layers do not support skip_weight_param_allocation' + ) + + # TE returns a zero length Tensor when bias=False and + # return_bias=True, but we prefer None. So in that case we + # tell TE to not return the bias, and return None + # ourselves. This way our forward always returns two values + # and we don't have to deal with the zero length Tensor. + self.te_return_bias = skip_bias_add and bias + self.is_first_microbatch = True + self.disable_parameter_transpose_cache = self.config.disable_parameter_transpose_cache + extra_kwargs = _get_extra_te_kwargs(config) + + # Only Transformer-Engine version >= 0.11.0 supports `RMSNorm` + if _te_version >= packaging.version.Version("0.11.0"): + extra_kwargs["normalization"] = self.config.normalization + elif self.config.normalization != "LayerNorm": + raise ValueError( + f"Transformer Engine v{_te_version} does not support {self.config.normalization}." + ) + + if _te_version >= packaging.version.Version("0.8.0"): + if self.config.tp_comm_overlap: + extra_kwargs["ub_bulk_wgrad"] = self.config.tp_comm_bulk_wgrad + extra_kwargs["ub_bulk_dgrad"] = self.config.tp_comm_bulk_dgrad + if _te_version > packaging.version.Version("1.5.0"): + # Use old overlap flags if they were supplied instead + extra_kwargs["ub_overlap_ag"] = ( + self.config.tp_comm_overlap_ag + if hasattr(self.config, "tp_comm_overlap_ag") + else self.config.tp_comm_split_ag or self.config.tp_comm_atomic_ag + ) + if _te_version > packaging.version.Version("1.6.0.dev0"): + extra_kwargs["ub_overlap_rs_dgrad"] = ( + self.config.tp_comm_overlap_rs_dgrad + if hasattr(self.config, "tp_comm_overlap_rs_dgrad") + else False + ) + else: + extra_kwargs["ub_atomic_gemm_ag"] = self.config.tp_comm_atomic_ag + extra_kwargs["ub_split_ag"] = self.config.tp_comm_split_ag + if _te_version > packaging.version.Version("1.0.0"): + assert ( + tp_comm_buffer_name is not None + ), "Buffer name should be set to configure communication overlap settings" + extra_kwargs["ub_name"] = tp_comm_buffer_name + + super().__init__( + in_features=input_size, + out_features=output_size, + eps=self.config.layernorm_epsilon, + sequence_parallel=self.config.sequence_parallel, + fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion, + tp_group=get_tensor_model_parallel_group(check_initialized=False), + tp_size=self.config.tensor_model_parallel_size, + get_rng_state_tracker=( + get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None + ), + init_method=condition_init_method(config, init_method), + bias=bias, + return_bias=self.te_return_bias, + parallel_mode="column", + return_layernorm_output=False, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + **extra_kwargs, + ) + + def forward(self, x): + _is_first_microbatch = ( + None if self.disable_parameter_transpose_cache else self.is_first_microbatch + ) + out = super().forward(x, is_first_microbatch=_is_first_microbatch) + self.is_first_microbatch = False + + # TE only returns a tuple when return_bias is True, otherwise + # it returns a single Tensor, we always want to return two + # values regardless of the arguments. + if self.te_return_bias: + return out + return out, None + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Sharding along axis 0, bias sharded""" + state_dict = self.state_dict(prefix='', keep_vars=True) + return make_sharded_tensors_for_checkpoint( + state_dict, prefix, {'weight': 0, 'bias': 0}, sharded_offsets + ) + + +class TEColumnParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `ColumnParallelLinear` layer. + """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + gather_output: bool, + bias: bool, + skip_bias_add: bool, + is_expert: bool, + skip_weight_param_allocation: bool = False, + tp_comm_buffer_name: str = None, + ): + if gather_output: + raise ValueError('Transformer Engine linear layers do not support gather_output = True') + + if is_expert: + raise ValueError('Transformer Engine linear layers do not yet support MoE') + + super().__init__( + input_size=input_size, + output_size=output_size, + parallel_mode="column", + config=config, + init_method=condition_init_method(config, init_method), + bias=bias, + skip_bias_add=skip_bias_add, + skip_weight_param_allocation=skip_weight_param_allocation, + tp_comm_buffer_name=tp_comm_buffer_name, + ) + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Sharding along axis 0, bias sharded""" + state_dict = self.state_dict(prefix='', keep_vars=True) + return make_sharded_tensors_for_checkpoint( + state_dict, prefix, {'weight': 0, 'bias': 0}, sharded_offsets + ) + + +class TERowParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `RowParallelLinear` layer. + """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + input_is_parallel: bool, + skip_bias_add: bool, + is_expert: bool, + tp_comm_buffer_name: str = None, + ): + if not input_is_parallel: + raise ValueError( + "Transformer Engine linear layers do not support input_is_parallel = False" + ) + + if is_expert: + raise ValueError('Transformer Engine linear layers do not yet support MoE') + + super().__init__( + input_size=input_size, + output_size=output_size, + parallel_mode="row", + config=config, + init_method=condition_init_method(config, init_method), + bias=bias, + skip_bias_add=skip_bias_add, + skip_weight_param_allocation=False, # We don't currently use this for row parallel layers + tp_comm_buffer_name=tp_comm_buffer_name, + ) + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Sharding along axis 1, bias not sharded""" + state_dict = self.state_dict(prefix='', keep_vars=True) + return make_sharded_tensors_for_checkpoint( + state_dict, prefix, {'weight': 1}, sharded_offsets + ) + + +class TEDotProductAttention(te.pytorch.DotProductAttention): + """ + Wrapper for the Transformer-Engine's `DotProductAttention` layer that also + has "flash attention" enabled. + + Note that if Megatron's parallel_state has not been initialized yet, the + tp_group and cp_group passed to TE will be None and must be set later + via set_tensor_parallel_group() and set_context_parallel_group(). + """ + + cp_stream: torch.cuda.Stream = None + + def __init__( + self, + config: TransformerConfig, + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + attention_dropout: float = None, + ): + self.config = config + self.te_forward_mask_type = False + self.qkv_format: str = 'sbhd' + + if self.config.apply_query_key_layer_scaling != bool( + int(os.getenv('NVTE_APPLY_QK_LAYER_SCALING', '0')) + ): + raise ValueError( + f"apply_query_key_layer_scaling is {self.config.apply_query_key_layer_scaling} " + f"but environment variable NVTE_APPLY_QK_LAYER_SCALING is " + f"{os.getenv('NVTE_APPLY_QK_LAYER_SCALING')}. Transformer Engine does not support " + f"setting query key layer scaling via argument, so these two must match." + ) + + extra_kwargs = {} + if _te_version >= packaging.version.Version("0.11.0"): + extra_kwargs["num_gqa_groups"] = self.config.num_query_groups + elif self.config.num_query_groups != self.config.num_attention_heads: + raise ValueError( + f"Transformer Engine v{_te_version} does not support Grouped Query Attention, " + f"use a newer version of Transformer Engine. " + f"(num_query_groups ({self.config.num_query_groups}) != " + f"num_attention_heads ({self.config.num_attention_heads}))" + ) + + if _te_version >= packaging.version.Version("0.10.0"): + extra_kwargs["attention_type"] = attention_type + # older version don't need attention_type + + if _te_version > packaging.version.Version("0.12.0"): + self.te_forward_mask_type = True + + # Only Transformer-Engine version >= 1.0.0 supports context parallelism + if _te_version >= packaging.version.Version("1.0.0"): + if getattr(TEDotProductAttention, "cp_stream") is None: + TEDotProductAttention.cp_stream = torch.cuda.Stream() + extra_kwargs["cp_group"] = get_context_parallel_group(check_initialized=False) + extra_kwargs["cp_global_ranks"] = get_context_parallel_global_ranks( + check_initialized=False + ) + extra_kwargs["cp_stream"] = TEDotProductAttention.cp_stream + else: + assert ( + self.config.context_parallel_size == 1 + ), "Only Transformer-Engine version >= 1.0.0 supports context parallelism!" + + if self.config.deterministic_mode: + if int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO", "1")) != 0: + raise RuntimeError( + "deterministic_mode is on and we are using DotProductAttention from " + "Transformer Engine, but NVTE_ALLOW_NONDETERMINISTIC_ALGO is not 0. " + f"Currently set to: {os.getenv('NVTE_ALLOW_NONDETERMINISTIC_ALGO', 'not set')}." + ) + + if config.window_size is not None: + # Check version + assert _te_version >= packaging.version.Version( + "1.2.0" + ), f"Transformer-Engine version ({str(_te_version)}) must be >= 1.2.0 to support sliding window attention." + extra_kwargs['window_size'] = config.window_size + + super().__init__( + num_attention_heads=self.config.num_attention_heads, + kv_channels=self.config.kv_channels, + attention_dropout=( + self.config.attention_dropout if attention_dropout is None else attention_dropout + ), + attn_mask_type=attn_mask_type.name, + sequence_parallel=self.config.sequence_parallel, + tp_size=self.config.tensor_model_parallel_size, + get_rng_state_tracker=( + get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None + ), + tp_group=get_tensor_model_parallel_group(check_initialized=False), + layer_number=layer_number, + **extra_kwargs, + ) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Tensor, + attn_mask_type: AttnMaskType, + packed_seq_params: PackedSeqParams = None, + ): + packed_seq_kwargs = ( + dataclasses.asdict(packed_seq_params) if packed_seq_params is not None else {} + ) + # overwrite self.qkv_format depending on self.config.apply_rope_fusion, which can be set after init + if self.config.apply_rope_fusion and _te_version > packaging.version.Version("0.13.0"): + self.qkv_format = 'bshd' + + qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format) + + if _te_version < packaging.version.Version("1.3.0"): + # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H copies (#555) + # These two arguments did not exist prior to 1.3.0 + packed_seq_kwargs.pop("max_seqlen_q", None) + packed_seq_kwargs.pop("max_seqlen_kv", None) + + if self.config.apply_rope_fusion and qkv_format == 'bshd': + query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)] + # In PyTorch, the following two tensors are in fact the same: + # Tensor with shape (1, S, H, D) and stride (S*H*D, H*D, D, 1) + # Tensor with shape (1, S, H, D) and stride (H*D, H*D, D, 1) + # Stride for a dimension that is 1 has no meaning, so tensors created two different ways + # can have same shape but different strides. + # We unify them to the first one to pass the stride check in TE + if value.shape == key.shape and value.shape[0] == 1 and value.stride() != key.stride(): + value = value.as_strided(value.shape, key.stride()) + + if self.te_forward_mask_type: + if qkv_format == 'thd' and _te_version >= packaging.version.Version("1.7.0"): + # thd format uses flash attention with cuDNN kernel which requires is_padding=True, so the only + # acceptable mask types are `padding_causal` and `padding`. These do not necessarily indicate + # there are padded tokens in the sequence. + if attn_mask_type == AttnMaskType.causal: + attn_mask_type = AttnMaskType.padding_causal + elif attn_mask_type == AttnMaskType.no_mask: + attn_mask_type = AttnMaskType.padding + core_attn_out = super().forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type.name, + **packed_seq_kwargs, + ) + else: + core_attn_out = super().forward( + query, + key, + value, + attention_mask, + **packed_seq_kwargs, + ) + + if self.config.apply_rope_fusion and qkv_format == 'bshd': + return core_attn_out.transpose(0, 1) + else: + return core_attn_out + + +if _te_version >= packaging.version.Version("1.9.0.dev0"): + + class TEGroupedLinear(te.pytorch.GroupedLinear): + """ + Wrapper for the Transformer-Engine's `GroupedLinear` layer. + + Note that if Megatron's parallel_state has not been initialized + yet, the tp_group passed to TE will be None and must be set later + via set_tensor_parallel_group(). + """ + + def __init__( + self, + num_gemms: int, + input_size: int, + output_size: int, + *, + parallel_mode: str, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + skip_bias_add: bool, + is_expert: bool = False, + tp_comm_buffer_name: str = None, + ): + self.config = config + + # TE returns a zero length Tensor when bias=False and + # return_bias=True, but we prefer None. So in that case we + # tell TE to not return the bias, and return None + # ourselves. This way our forward always returns two values + # and we don't have to deal with the zero length Tensor. + self.te_return_bias = skip_bias_add and bias + self.is_first_microbatch = True + self.disable_parameter_transpose_cache = self.config.disable_parameter_transpose_cache + + extra_kwargs = _get_extra_te_kwargs(config) + extra_kwargs["ub_name"] = tp_comm_buffer_name + + self.expert_parallel = self.config.expert_model_parallel_size > 1 + if self.expert_parallel: + extra_kwargs["rng_tracker_name"] = get_expert_parallel_rng_tracker_name() + + # For MoE models, the comms between TP and EP group is explicitly handled by MoE token dispatcher. + # So we disable comms by making TE agnostic of model parallel. + self.explicit_expert_comm = is_expert and ( + config.tensor_model_parallel_size > 1 or self.expert_parallel + ) + tp_group = get_tensor_model_parallel_group(check_initialized=False) + if self.explicit_expert_comm and config.moe_extended_tp: + tp_size = parallel_state.get_tensor_and_expert_parallel_world_size() + else: + tp_size = parallel_state.get_tensor_model_parallel_world_size() + if self.explicit_expert_comm: + if parallel_mode == "column": + output_size = divide(output_size, tp_size) + elif parallel_mode == "row": + input_size = divide(input_size, tp_size) + parallel_mode = None + tp_size = 1 + tp_group = None + + super().__init__( + num_gemms=num_gemms, + in_features=input_size, + out_features=output_size, + sequence_parallel=self.config.sequence_parallel, + fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion, + tp_group=tp_group, + tp_size=tp_size, + get_rng_state_tracker=( + get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None + ), + init_method=condition_init_method(config, init_method), + bias=bias, + return_bias=self.te_return_bias, + parallel_mode=parallel_mode, + **extra_kwargs, + ) + + for param in self.parameters(): + setattr(param, 'allreduce', not (is_expert and self.expert_parallel)) + + def forward(self, x, m_splits): + _is_first_microbatch = ( + None if self.disable_parameter_transpose_cache else self.is_first_microbatch + ) + out = super().forward(x, m_splits, is_first_microbatch=_is_first_microbatch) + self.is_first_microbatch = False + + # TE only returns a tuple when return_bias is True, otherwise + # it returns a single Tensor, we always want to return two + # values regardless of the arguments. + if self.te_return_bias: + return out + return out, None + + def _sharded_state_dict_grouped( + self, tp_axis_map, prefix='', sharded_offsets=(), metadata=None + ): + """ + prefix should be module_name to make keys identical to sequetial ones. + """ + sharded_state_dict = {} + full_state_dict = self.state_dict(prefix='', keep_vars=True) + num_global_experts = ( + parallel_state.get_expert_model_parallel_world_size() * self.num_gemms + ) + local_expert_indices_offset = ( + parallel_state.get_expert_model_parallel_rank() * self.num_gemms + ) + ep_axis = len(sharded_offsets) + for gemm_idx in range(self.num_gemms): + state_dict = { + f'{gemm_idx}.weight': full_state_dict[f'weight{gemm_idx}'], + f'{gemm_idx}._extra_state': full_state_dict['_extra_state'], + } + if self.use_bias: + state_dict[f'{gemm_idx}.bias'] = full_state_dict[f'bias{gemm_idx}'] + sub_sd = make_sharded_tensors_for_checkpoint( + state_dict, + '', + tp_axis_map, + ( + *sharded_offsets, + (ep_axis, local_expert_indices_offset + gemm_idx, num_global_experts), + ), + ) + # Remove expert layers indexing from sharded keys + replace_prefix_for_sharding(sub_sd, f'{gemm_idx}.', prefix) + sharded_state_dict.update( + { + f'{prefix}weight{gemm_idx}': sub_sd[f'{gemm_idx}.weight'], + # TODO: TE's GroupedLinear only has one _extra_state for all experts. + # We need sharding or build/merge fn to handle _extra_state correctly. + f'{prefix}_extra_state{"" if gemm_idx == 0 else gemm_idx}': sub_sd[ + f'{gemm_idx}._extra_state' + ], + } + ) + if self.use_bias: + sharded_state_dict[f'{prefix}bias{gemm_idx}'] = sub_sd[f'{gemm_idx}.bias'] + # Adjust replica ids - replication along DP modulo EP + for k, sh_ten in sharded_state_dict.items(): + replica_id = sh_ten.replica_id + assert ( + len(replica_id) == 3 + ), f'Expected replica_id for {k} to be in (PP, TP, DP) format, got: {replica_id}' + sh_ten.replica_id = ( + *replica_id[:2], + parallel_state.get_data_modulo_expert_parallel_rank(), + ) + return sharded_state_dict + + class TEColumnParallelGroupedLinear(TEGroupedLinear): + """ + Wrapper for the Transformer-Engine's `GroupedLinear` layer but specialized + to column-parallel style. + """ + + def __init__( + self, + num_gemms: int, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + skip_bias_add: bool, + is_expert: bool, + tp_comm_buffer_name: str = None, + ): + + super().__init__( + num_gemms=num_gemms, + input_size=input_size, + output_size=output_size, + parallel_mode="column", + config=config, + init_method=condition_init_method(config, init_method), + bias=bias, + skip_bias_add=skip_bias_add, + is_expert=is_expert, + tp_comm_buffer_name=tp_comm_buffer_name, + ) + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """ + For each gemm, sharding along axis 0, bias sharded. + Assume sharded_offsets[-1] is the expert parallel offset. + """ + tp_axis_map = {} + for gemm_idx in range(self.num_gemms): + tp_axis_map.update( + { + f'{gemm_idx}.weight': 0, + f'{gemm_idx}.bias': 0, + } + ) + return super()._sharded_state_dict_grouped( + tp_axis_map, prefix, sharded_offsets, metadata + ) + + class TERowParallelGroupedLinear(TEGroupedLinear): + """ + Wrapper for the Transformer-Engine's `GroupedLinear` layer but specialized + to row-parallel style. + """ + + def __init__( + self, + num_gemms: int, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + skip_bias_add: bool, + is_expert: bool, + tp_comm_buffer_name: str = None, + ): + + super().__init__( + num_gemms=num_gemms, + input_size=input_size, + output_size=output_size, + parallel_mode="row", + config=config, + init_method=condition_init_method(config, init_method), + bias=bias, + skip_bias_add=skip_bias_add, + is_expert=is_expert, + tp_comm_buffer_name=tp_comm_buffer_name, + ) + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """ + For each gemm, sharding along axis 1, bias not sharded. + Assume sharded_offsets[-1] is the expert parallel offset. + """ + tp_axis_map = {f'{gemm_idx}.weight': 1 for gemm_idx in range(self.num_gemms)} + return super()._sharded_state_dict_grouped( + tp_axis_map, prefix, sharded_offsets, metadata + ) + +else: + + TEGroupedLinear = None + TEColumnParallelGroupedLinear = None + TERowParallelGroupedLinear = None + + +class TEDelayedScaling(te.common.recipe.DelayedScaling): + """ + Wrapper for the Transformer-Engine's `DelayedScaling` layer. + """ + + def __init__( + self, + config: ModelParallelConfig, + fp8_format: int, + override_linear_precision: tuple = (False, False, False), + ): + extra_kwargs = _get_extra_te_kwargs(config) + if _te_version >= packaging.version.Version("1.6.0.dev0"): + extra_kwargs["fp8_dpa"] = config.fp8_dot_product_attention + extra_kwargs["fp8_mha"] = config.fp8_multi_head_attention + + super().__init__( + margin=config.fp8_margin, + interval=config.fp8_interval, + fp8_format=fp8_format, + amax_compute_algo=config.fp8_amax_compute_algo, + amax_history_len=config.fp8_amax_history_len, + override_linear_precision=override_linear_precision, + **extra_kwargs, + ) + + +def te_checkpoint( + forward_func, + distribute_saved_activations, + get_rng_state_tracker, + tp_group, + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + packed_seq_params, +): + from transformer_engine.pytorch.distributed import checkpoint + + if _te_version >= packaging.version.Version("1.5.0"): + return checkpoint( + forward_func, + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + packed_seq_params, + distribute_saved_activations=distribute_saved_activations, + get_rng_state_tracker=get_rng_state_tracker, + tp_group=tp_group, + ) + else: + return checkpoint( + forward_func, + distribute_saved_activations, + get_rng_state_tracker, + tp_group, + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + packed_seq_params, + ) + + +try: + + from transformer_engine.pytorch.attention import _SplitAlongDim + + SplitAlongDim = _SplitAlongDim.apply + +except ImportError: + + SplitAlongDim = None + +try: + + from transformer_engine.pytorch.cpu_offload import get_cpu_offload_context + +except ImportError: + + get_cpu_offload_context = None diff --git a/megatron/core/transformer/dot_product_attention.py b/megatron/core/transformer/dot_product_attention.py new file mode 100644 index 0000000..967d0ce --- /dev/null +++ b/megatron/core/transformer/dot_product_attention.py @@ -0,0 +1,205 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + + +import math + +import torch +from torch import Tensor + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.fusions.fused_softmax import FusedScaleMaskSoftmax +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import attention_mask_func +from megatron.core.utils import divide + + +class DotProductAttention(MegatronModule): + """ + Region where selective activation recomputation is applied. + This region is memory intensive but less compute intensive which + makes activation checkpointing more efficient for LLMs (20B+). + See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + + We use the following notation: + h: hidden size + n: number of attention heads + p: number of tensor model parallel partitions + b: batch size + s: sequence length + """ + + def __init__( + self, + config: TransformerConfig, + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + attention_dropout: float = None, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + + assert ( + self.config.context_parallel_size == 1 + ), "Context parallelism is only supported by TEDotProductAttention!" + + assert ( + self.config.window_size is None + ), "Sliding Window Attention is only supported by TEDotProductAttention!" + + self.layer_number = max(1, layer_number) + self.attn_mask_type = attn_mask_type + self.attention_type = attention_type # unused for now + + projection_size = self.config.kv_channels * self.config.num_attention_heads + + # Per attention head and per partition values. + world_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_partition = divide(projection_size, world_size) + self.hidden_size_per_attention_head = divide(projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size) + self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size) + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.config.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + + self.scale_mask_softmax = FusedScaleMaskSoftmax( + input_in_fp16=self.config.fp16, + input_in_bf16=self.config.bf16, + attn_mask_type=self.attn_mask_type, + scaled_masked_softmax_fusion=self.config.masked_softmax_fusion, + mask_func=attention_mask_func, + softmax_in_fp32=self.config.attention_softmax_in_fp32, + scale=coeff, + ) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout( + self.config.attention_dropout if attention_dropout is None else attention_dropout + ) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Tensor, + attn_mask_type: AttnMaskType = None, + packed_seq_params: PackedSeqParams = None, + ): + assert packed_seq_params is None, ( + "Packed sequence is not supported by DotProductAttention." + "Please use TEDotProductAttention instead." + ) + + # =================================== + # Raw attention scores. [b, n/p, s, s] + # =================================== + + # expand the key and value [sk, b, ng, hn] -> [sk, b, np, hn] + # This is a noop for normal attention where ng == np. When using group query attention this + # creates a view that has the keys and values virtually repeated along their dimension to + # match the number of queries. + + # attn_mask_type is not used. + if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1: + key = key.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 + ) + value = value.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 + ) + + # [b, np, sq, sk] + output_size = ( + query.size(1), + query.size(2), + query.size(0), + key.size(0), + ) + + # [sq, b, np, hn] -> [sq, b * np, hn] + # This will be a simple view when doing normal attention, but in group query attention + # the key and value tensors are repeated to match the queries so you can't use simple strides + # to extract the queries. + query = query.reshape(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key = key.view(output_size[3], output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor( + (output_size[0] * output_size[1], output_size[2], output_size[3]), query.dtype, "mpu", + ) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query.transpose(0, 1), # [b * np, sq, hn] + key.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + attention_probs: Tensor = self.scale_mask_softmax(attention_scores, attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + + if not self.config.sequence_parallel: + with tensor_parallel.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = ( + value.size(1), + value.size(2), + query.size(0), + value.size(3), + ) + + # change view [sk, b * np, hn] + value = value.view(value.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + + # matmul: [b * np, sq, hn] + context = torch.bmm(attention_probs, value.transpose(0, 1)) + + # change view [b, np, sq, hn] + context = context.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context = context.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_shape = context.size()[:-2] + (self.hidden_size_per_partition,) + context = context.view(*new_context_shape) + + return context diff --git a/megatron/core/transformer/enums.py b/megatron/core/transformer/enums.py new file mode 100644 index 0000000..3d9bc55 --- /dev/null +++ b/megatron/core/transformer/enums.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import enum + + +# can we get rid of this? +# it's being used in pipeline schedules +class ModelType(enum.Enum): + encoder_or_decoder = 1 + encoder_and_decoder = 2 + + +# class LayerType(enum.Enum): +# encoder = 1 +# decoder = 2 + + +class AttnType(enum.Enum): + self_attn = 1 + cross_attn = 2 + + +class AttnMaskType(enum.Enum): + padding = 1 + causal = 2 + no_mask = 3 # only used for TE + padding_causal = 4 # only used for thd attention diff --git a/megatron/core/transformer/identity_op.py b/megatron/core/transformer/identity_op.py new file mode 100644 index 0000000..5d9388f --- /dev/null +++ b/megatron/core/transformer/identity_op.py @@ -0,0 +1,28 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import torch + + +class IdentityOp(torch.nn.Module): + """ + This is a placeholder for IdentityOp(x) -> x + """ + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, x, *args, **kwargs): + return x + + +class IdentityFuncOp(IdentityOp): + """ + This is a placeholder for IdentityFuncOp(...)(x) -> IdentityOp(x) -> x. + Such a func is handy for ops like `bias_dropout_fusion` which themselves + return a function at runtime based on passed arguments + """ + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, *args, **kwargs): + return super().forward diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py new file mode 100644 index 0000000..e82d6ec --- /dev/null +++ b/megatron/core/transformer/mlp.py @@ -0,0 +1,255 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.mapping import ( + ReplicaId, + ShardedStateDict, + ShardedTensorFactory, +) +from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl +from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl +from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint + + +@dataclass +class MLPSubmodules: + linear_fc1: Union[ModuleSpec, type] = None + linear_fc2: Union[ModuleSpec, type] = None + + +class MLP(MegatronModule): + """ + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + + + Returns an output and a bias to be added to the output. + If config.add_bias_linear is False, the bias returned is None. + + We use the following notation: + h: hidden size + p: number of tensor model parallel partitions + b: batch size + s: sequence length + """ + + def __init__( + self, + config: TransformerConfig, + submodules: MLPSubmodules, + is_expert: bool = False, + input_size: int = None, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + + self.input_size = input_size if input_size != None else self.config.hidden_size + + # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf + ffn_hidden_size = self.config.ffn_hidden_size + if self.config.gated_linear_unit: + ffn_hidden_size *= 2 + + self.linear_fc1 = build_module( + submodules.linear_fc1, + self.input_size, + ffn_hidden_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear, + skip_bias_add=True, + is_expert=is_expert, + tp_comm_buffer_name='fc1', + ) + + self.activation_func = self.config.activation_func + + self.linear_fc2 = build_module( + submodules.linear_fc2, + self.config.ffn_hidden_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=is_expert, + tp_comm_buffer_name='fc2', + ) + + def forward(self, hidden_states): + + # [s, b, 4 * h/p] + intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) + + if self.config.bias_activation_fusion: + if self.activation_func == F.gelu: + if self.config.gated_linear_unit: + intermediate_parallel = bias_geglu_impl(intermediate_parallel, bias_parallel) + else: + assert self.config.add_bias_linear is True + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + elif self.activation_func == F.silu and self.config.gated_linear_unit: + intermediate_parallel = bias_swiglu_impl( + intermediate_parallel, + bias_parallel, + self.config.activation_func_fp8_input_store, + ) + else: + raise ValueError("Only support fusion of gelu and swiglu") + else: + if bias_parallel is not None: + intermediate_parallel = intermediate_parallel + bias_parallel + if self.config.gated_linear_unit: + + def glu(x): + x = torch.chunk(x, 2, dim=-1) + return self.config.activation_func(x[0]) * x[1] + + intermediate_parallel = glu(intermediate_parallel) + else: + intermediate_parallel = self.activation_func(intermediate_parallel) + + # [s, b, h] + output, output_bias = self.linear_fc2(intermediate_parallel) + + return output, output_bias + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[dict] = None + ) -> ShardedStateDict: + sharded_state_dict = {} + for name, module in self._modules.items(): + sub_sd = module.sharded_state_dict(f'{prefix}{name}.', sharded_offsets, metadata) + if self.config.gated_linear_unit and name == 'linear_fc1': + assert f'{prefix}{name}.weight' in sub_sd, sub_sd.keys() + for k, v in sub_sd.items(): + if k in (f'{prefix}{name}.weight', f'{prefix}{name}.bias'): + sub_sd[k] = apply_swiglu_sharded_factory(v, sharded_offsets) + sharded_state_dict.update(sub_sd) + return sharded_state_dict + + +def apply_swiglu_sharded_factory(original_sh_ten, sharded_offsets): + # We must split the tensor into 2 parts, each sharded separately. + # This requires a ShardedTensorFactory which `chunk`s during saving + # and `cat`s during loading + tp_rank = parallel_state.get_tensor_model_parallel_rank() + tp_size = parallel_state.get_tensor_model_parallel_world_size() + swiglu_shard_axis = 0 + prepend_axis_num = len(sharded_offsets) + original_shape = original_sh_ten.local_shape + original_numel = int(np.prod(original_shape)) + + @torch.no_grad() + def sh_ten_build_fn( + key: str, t: torch.Tensor, replica_id: ReplicaId, flattened_range: Optional[slice] + ): + offset_w = (swiglu_shard_axis + prepend_axis_num, tp_rank, tp_size * 2) + offset_v = (swiglu_shard_axis + prepend_axis_num, tp_size + tp_rank, tp_size * 2) + if flattened_range is None: + tensor_w, tensor_v = torch.chunk(t, 2, dim=swiglu_shard_axis) + return [ + ShardedTensor.from_rank_offsets( + key, + tensor_w, + *sharded_offsets, + offset_w, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + ), + ShardedTensor.from_rank_offsets( + key, + tensor_v, + *sharded_offsets, + offset_v, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + ), + ] + else: + # Here we need to map a slice `t` (`flattened_range` specifies slice start and stop) + # of the *original* flattened tensor into slices `w` and `v` of chunked + # and flattened tensor. + # Example: + # If original tensor has (16, 5) shape and flattened_range is `slice(8, 64)`, + # then `t` has shape `(56,)` and we need to create 2 tensors: + # w: first 32 elements of `t` with flattened_range slice(8, 40) + # v: last 24 elements of `t` with flattened_range slice(0, 24) + # Global offsets are the same as in the non-flattened case + assert t.ndim == 1, (key, t.shape) + non_flat_local_shape = (original_shape[0] // 2, *original_shape[1:]) + chunk_numel = original_numel // 2 + result = [] + if flattened_range.start < chunk_numel: + # Non-empty `w` chunk + tensor_w = t[: chunk_numel - flattened_range.start] + flattened_range_w = slice( + flattened_range.start, min(chunk_numel, flattened_range.stop) + ) + assert len(tensor_w) == flattened_range_w.stop - flattened_range_w.start + result.append( + ShardedTensor.from_rank_offsets_flat( + key, + tensor_w, + non_flat_local_shape, + *sharded_offsets, + offset_w, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + flattened_range=flattened_range_w, + ) + ) + if flattened_range.stop > chunk_numel: + # Non-empty `v` chunk + tensor_v = t[-(flattened_range.stop - chunk_numel) :] + flattened_range_v = slice( + max(chunk_numel, flattened_range.start) - chunk_numel, + flattened_range.stop - chunk_numel, + ) + assert len(tensor_v) == flattened_range_v.stop - flattened_range_v.start, ( + len(tensor_v), + flattened_range_v, + ) + + result.append( + ShardedTensor.from_rank_offsets_flat( + key, + tensor_v, + non_flat_local_shape, + *sharded_offsets, + offset_v, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + flattened_range=flattened_range_v, + ) + ) + assert sum(sh_ten.data.numel() for sh_ten in result) == t.numel(), (result, t.shape) + return result + + def sh_ten_merge_fn(sub_state_dict): + with torch.no_grad(): + return torch.cat(sub_state_dict) + + return ShardedTensorFactory( + original_sh_ten.key, + original_sh_ten.data, + sh_ten_build_fn, + sh_ten_merge_fn, + original_sh_ten.replica_id, + ) diff --git a/megatron/core/transformer/module.py b/megatron/core/transformer/module.py new file mode 100644 index 0000000..007521d --- /dev/null +++ b/megatron/core/transformer/module.py @@ -0,0 +1,190 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Megatron Module.""" +from typing import Optional, Tuple + +import torch +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import ( + make_sharded_tensors_for_checkpoint, + sharded_state_dict_default, +) + +_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) +_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor) + + +def param_is_not_shared(param): + return not hasattr(param, 'shared') or not param.shared + + +class MegatronModule(torch.nn.Module): + """Base Megatron module inhertied by all Models. + + Megatron specific extensions of torch Module with support + for pipelining + + Args: + config (TransformerConfig): Transformer config + """ + + # def __init__(self, config: TransformerConfig, share_word_embeddings=True): + def __init__(self, config: TransformerConfig): + super().__init__() + self.config = config + + def state_dict_for_save_checkpoint(self, prefix: str = '', keep_vars: bool = False): + """Override state dict for saving checkpoints Use this function to override the + state dict for saving checkpoints. + + Args: + prefix (str, optional): _description_. Defaults to ''. + keep_vars (bool, optional): _description_. Defaults to False. + + Returns: + _type_: _description_ + """ + + return self.state_dict(prefix=prefix, keep_vars=keep_vars) + + def sharded_state_dict( + self, + prefix: str = '', + sharded_offsets: Tuple[Tuple[int, int, int]] = (), + metadata: Optional[dict] = None, + ) -> ShardedStateDict: + """Default implementation for sharded state dict for distributed checkpointing. + + General definition of sharded_state_dict simply calls `sharded_state_dict_default` + (which call sharded_state_dict method if possible or a default implementation otherwise) + recursively on all submodules. + + Args: + prefix (str): prefix for the state dict keys + sharded_offsets (Tuple[Tuple[int, int, int]], optional): sharding already + applied (e.g. PP related) by sup-modules. Passed along to ShardedTensor + metadata (dict, optional): metadata passed recursively to sharded_state_dict methods + + Returns: + dict: dictionary of state dict keys mapped to ShardedTensors + """ + sharded_state_dict = {} + # Save parameters + self._save_to_state_dict(sharded_state_dict, '', keep_vars=True) + sharded_state_dict = make_sharded_tensors_for_checkpoint( + sharded_state_dict, prefix, sharded_offsets=sharded_offsets + ) + # Recurse into submodules + for name, module in self.named_children(): + sharded_state_dict.update( + sharded_state_dict_default(module, f'{prefix}{name}.', sharded_offsets, metadata) + ) + return sharded_state_dict + + def set_is_first_microbatch(self): + """Sets the is_first_microbatch flag if it exists. When this flag is set, TE modules will update their fp8 parameter cache. + + """ + for m in self.modules(): + if hasattr(m, "is_first_microbatch"): + m.is_first_microbatch = True + + +def conversion_helper(val, conversion): + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_float16(val, float16_convertor): + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, _FLOAT_TYPES): + val = float16_convertor(val) + return val + + return conversion_helper(val, half_conversion) + + +def float16_to_fp32(val): + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)): + val = val.float() + return val + + return conversion_helper(val, float_conversion) + + +class Float16Module(MegatronModule): + """Float 16 Module. + + Attributes: + config (TransformerConfig): Transformer config + fp16 (bool) : Specifies if the model runs in fp16 mode + bf16 (bool) : Specifies if the model runs in bf16 mode + + Args: + config (TransformerConfig): The transformer config used to initalize the model + """ + + def __init__(self, config: TransformerConfig, module: torch.nn.Module): + super(Float16Module, self).__init__(config) + self.config = config + self.fp16 = config.fp16 + self.bf16 = config.bf16 + + if self.fp16: + self.add_module('module', module.half()) + + def float16_convertor(val): + return val.half() + + elif self.bf16: + self.add_module('module', module.bfloat16()) + + def float16_convertor(val): + return val.bfloat16() + + else: + raise Exception('Either config.fp16 or config.bf16 should be True.') + + self.float16_convertor = float16_convertor + + def set_input_tensor(self, input_tensor): + return self.module.set_input_tensor(input_tensor) + + def forward(self, *inputs, **kwargs): + if parallel_state.is_pipeline_first_stage(): + inputs = fp32_to_float16(inputs, self.float16_convertor) + outputs = self.module(*inputs, **kwargs) + if parallel_state.is_pipeline_last_stage(): + outputs = float16_to_fp32(outputs) + return outputs + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.module.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """Retrieve state_dict from the module being wrapped.""" + return self.module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars) + + def sharded_state_dict(self, prefix='', *args, **kwargs): + """Retrieve sharded_state_dict from the module being wrapped.""" + return self.module.sharded_state_dict(prefix, *args, **kwargs) + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) diff --git a/megatron/core/transformer/moe/README.md b/megatron/core/transformer/moe/README.md new file mode 100644 index 0000000..a1771c7 --- /dev/null +++ b/megatron/core/transformer/moe/README.md @@ -0,0 +1,215 @@ +# Megatron Core MoE Key Features + +### Parallelism + +- **Expert Parallel** + - A specific method of parallelism for MoE models, where experts are partitioned onto different workers and each worker processes a different batch of training samples, each worker process one or more experts for each MoE layer. +- **3D Parallel**: Data Parallel , Tensor Parallel, Pipeline Parallel, Sequence Parallel + - Note: When using MoE with expert parallelism and tensor parallelism, sequence parallelism must be used. +- **Richer parallel mappings**: EP can be combined with DP/TP/PP/SP for handling larger MoE variants. +- **Full distributed optimizer support.** + +### Router and Load Balancing + +- Router type: + - Top-K MLP router +- Load Balancing algorithms: + - Sinkhorn (S-BASE) + - Aux loss / Load balancing loss + +### Performance Optimizations + +- GroupedGEMM when num local experts > 1 + - Supported dtype: bf16 + - Performance improvements for larger MoE models +- Enable `--tp-comm-overlap` for MoE + +### Token Dispatch Mechanism + +- Dropless / No token drop. +- Token drop and padding. + +### Ease of use +- Checkpoint converter (coming soon) +- Per-layer logging + +## Upcoming features + +- Enhanced cutlass GroupedGEMM kernels + - Reduced host-device syncs. + - More supported dtype: fp32/bf16/fp16 + - Kernel heuristics tuned for H100/A100/A10/L40S + - BWD cutlass GroupedGEMM kernels supported +- Token permutation / unpermutation fusion +- Fused Sinkhorn Kernel +- Context Parallel with MoE +- FP8 training support + +# User Guide + +### MoE Related Arguments + +| Item | Description | +| --- | --- | +| num-experts | Number of Experts in MoE (None means no MoE) | +| expert-model-parallel-size | Degree of expert model parallelism. Default is 1. | +| moe-grouped-gemm | When there are multiple experts per rank, compress multiple local (potentially small) gemms in a single kernel launch to improve the utilization and performance by leveraging the Grouped GEMM feature introduced since CUTLASS 2.8 (https://github.com/fanshiqing/grouped_gemm). | +| moe-router-load-balancing-type | Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer, "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss". | +| moe-router-topk | Number of experts to route to for each token. The default is 2. | +| moe-aux-loss-coeff | Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended. Default is 0.0. | +| moe-z-loss-coeff | Scaling coefficient for the z-loss: a starting value of 1e-3 is recommended. Default is None. | +| moe-input-jitter-eps | Add noise to the input tensor by applying jitter with a specified epsilon value. Default is None. | +| moe-token-dispatcher-type | Determines the token dispatcher type. Choices are "allgather" and "alltoall". Default is "allgather". | +| moe-per-layer-logging | Enable per-layer logging for MoE, currently supports auxiliary loss and z loss. | +| moe-expert-capacity-factor | The capacity factor for each expert, None means no token will be dropped. Default is None. | +| moe-pad-expert-input-to-capacity | Pads the input for each expert to match the expert capacity length, effective only after the --moe-expert-capacity-factor is set. | + +### Usage + +To train a top-2 MoE model with an auxiliary loss, include the following arguments: + +```python +--num-experts 8 +--expert-model-parallel-size 8 +--moe-grouped-gemm +--moe-router-load-balancing-type aux_loss # options: aux_loss, sinkhorn, none. Default is aux_loss. +--moe-router-topk 2 +--moe-aux-loss-coeff 1e-2 +--use-distributed-optimizer +``` + +To avoid out-of-memory in dropless MoE training, we can set a large capacity factor, add: + +```python +--moe-expert-capacity-factor 4.0 +``` + +To enable the token drop mechanism, such as GShard and SwitchTransformer, include the following arguments: + +```python +--moe-expert-capacity-factor 1.0 +--moe-pad-expert-input-to-capacity # Optional +``` + + +## Dropless MoE training script example: +
+Click here. + +```bash +#!/bin/bash + +# Runs Mixtral 8x7B model on 32 H100/A100 GPUs +# The Dropless MoE suffers from an imbalanced token distribution at the early stage of training (the first few hundred iterations), which may lead to poor performance and out-of-memory (OOM) issues. +# To check the performance of a Dropless MoE model, we should run the model for at least 500 iterations or resume from trained checkpoints. + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=${MASTER_ADDR:-"localhost"} +MASTER_PORT=${MASTER_PORT:-"6000"} +NNODES=${NNODES:-"1"} +NODE_RANK=${RANK:-"0"} +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +CHECKPOINT_PATH=$1 +TOKENIZER_MODEL=$2 +DATA_PATH=$3 + +DISTRIBUTED_ARGS=( + --nproc_per_node $GPUS_PER_NODE + --nnodes $NNODES + --node_rank $NODE_RANK + --master_addr $MASTER_ADDR + --master_port $MASTER_PORT +) + +MODEL_ARGS=( + --disable-bias-linear + --seq-length 4096 + --max-position-embeddings 32768 + --num-layers 32 + --hidden-size 4096 + --ffn-hidden-size 14336 + --num-attention-heads 32 + --init-method-std 0.01 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --normalization RMSNorm + --position-embedding-type rope + --swiglu + --untie-embeddings-and-output-weights + --group-query-attention + --num-query-groups 8 + --no-masked-softmax-fusion + --no-position-embedding +) + +MOE_ARGS=( + --num-experts 8 + --expert-model-parallel-size 8 + --moe-router-load-balancing-type aux_loss # options: aux_loss, sinkhorn, None. Default is aux_loss. + --moe-router-topk 2 + --moe-aux-loss-coeff 1e-2 + --moe-grouped-gemm +) + +DATA_ARGS=( + --tokenizer-type Llama2Tokenizer + --tokenizer-model ${TOKENIZER_MODEL} + --data-path $DATA_PATH + --split 99990,8,2 +) + +TRAINING_ARGS=( + --micro-batch-size 1 + --global-batch-size 128 + --lr 1e-4 + --train-iters 500000 + --lr-decay-iters 320000 + --lr-decay-style cosine + --min-lr 1.0e-5 + --weight-decay 0.1 + --lr-warmup-iters 500 + --clip-grad 1.0 + --bf16 + --overlap-grad-reduce + --overlap-param-gather +) + +MODEL_PARALLEL_ARGS=( + --tensor-model-parallel-size 2 + --pipeline-model-parallel-size 1 + --sequence-parallel + --use-distributed-optimizer +) + +LOGGING_ARGS=( + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \ + --no-load-optim \ + --no-load-rng +) + +if [ -n "${WANDB_API_KEY}" ]; then + LOGGING_ARGS+=( + --wandb-project ${WANDB_PROJECT:-"Mixtral-Finetuning"} + --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"} + ) +fi + +torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \ + ${MODEL_ARGS[@]} \ + ${MOE_ARGS[@]} \ + ${DATA_ARGS[@]} \ + ${TRAINING_ARGS[@]} \ + ${MODEL_PARALLEL_ARGS[@]} \ + ${LOGGING_ARGS[@]} +``` +
diff --git a/megatron/core/transformer/moe/__init__.py b/megatron/core/transformer/moe/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py new file mode 100644 index 0000000..e11adf9 --- /dev/null +++ b/megatron/core/transformer/moe/experts.py @@ -0,0 +1,571 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from copy import deepcopy +from functools import partial +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from torch.nn.parameter import Parameter + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.mapping import ( + ReplicaId, + ShardedStateDict, + ShardedTensorFactory, +) +from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding +from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl +from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl +from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl +from megatron.core.jit import jit_fuser +from megatron.core.tensor_parallel.layers import ( + _initialize_affine_weight_cpu, + _initialize_affine_weight_gpu, +) +from megatron.core.tensor_parallel.utils import divide +from megatron.core.transformer.mlp import MLP, MLPSubmodules, apply_swiglu_sharded_factory +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.moe import grouped_gemm_util as gg +from megatron.core.transformer.spec_utils import build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import make_sharded_object_for_checkpoint + + +class GroupedMLP(MegatronModule): + """An efficient implementation of the Experts layer using CUTLASS GroupedGEMM. + + This class is designed to execute multiple experts in parallel, thereby maximizing computational efficiency. + """ + + def __init__(self, num_local_experts: int, config: TransformerConfig): + super().__init__(config=config) + self.config: TransformerConfig = config + self.num_local_experts = num_local_experts + gg.assert_grouped_gemm_is_available() + assert ( + config.add_bias_linear == False + ), "bias in the expert layer is not supported in Grouped GEMM yet, please set '--disable-bias-linear' instead." + + self.expert_parallel = config.expert_model_parallel_size > 1 + if self.config.gated_linear_unit: + if self.config.activation_func not in (F.silu, F.gelu): + raise ValueError("Activation function must be silu or gelu when using GroupedMLP.") + + @jit_fuser + def glu(x): + x = torch.chunk(x, 2, dim=-1) + return self.config.activation_func(x[0]) * x[1] + + self.activation_func = glu + else: + self.activation_func = self.config.activation_func + + # How many feature each rank holds for fc1 and fc2, respectively. + self.moe_extended_tp = config.moe_extended_tp + if config.moe_extended_tp: + tp_size = parallel_state.get_tensor_and_expert_parallel_world_size() + else: + tp_size = parallel_state.get_tensor_model_parallel_world_size() + + fc1_output_size = self.config.ffn_hidden_size * self.num_local_experts + if config.gated_linear_unit: + # Project to 4h. If using swiglu double the output width, + # see https://arxiv.org/pdf/2002.05202.pdf + fc1_output_size *= 2 + fc1_output_size_per_partition = divide(fc1_output_size, tp_size) + + fc2_input_size = self.config.ffn_hidden_size * self.num_local_experts + fc2_input_size_per_partition = divide(fc2_input_size, tp_size) + + # Note: The current kernel implementations of grouped_gemm + # does not support transposition with CUTLASS grouped GEMM + # (https://github.com/fanshiqing/grouped_gemm/blob/main/csrc/grouped_gemm.cu#L355-L358) + # and as a result we avoid allocate the transpose of weights. + # Initialize weight. + if config.use_cpu_initialization: + self.weight1 = Parameter( + torch.empty( + self.config.hidden_size, + fc1_output_size_per_partition, + dtype=config.params_dtype, + ) + ) + self.weight2 = Parameter( + torch.empty( + fc2_input_size_per_partition, + self.config.hidden_size, + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_cpu( + self.weight1, + self.config.hidden_size, + fc1_output_size, + fc1_output_size_per_partition, + partition_dim=1, + init_method=config.init_method, + params_dtype=config.params_dtype, + ) + _initialize_affine_weight_cpu( + self.weight2, + fc2_input_size, + self.config.hidden_size, + fc2_input_size_per_partition, + partition_dim=0, + init_method=config.output_layer_init_method, + params_dtype=config.params_dtype, + ) + else: + self.weight1 = Parameter( + torch.empty( + self.config.hidden_size, + fc1_output_size_per_partition, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + self.weight2 = Parameter( + torch.empty( + fc2_input_size_per_partition, + self.config.hidden_size, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu( + self.weight1, + config.init_method, + partition_dim=1, + expert_parallel=self.expert_parallel, + ) + _initialize_affine_weight_gpu( + self.weight2, + config.output_layer_init_method, + partition_dim=0, + expert_parallel=self.expert_parallel, + ) + setattr(self.weight1, 'allreduce', not self.expert_parallel) + setattr(self.weight2, 'allreduce', not self.expert_parallel) + + def remove_extra_states_check(self, incompatible_keys): + """ + Remove _extra_state from unexpected keys. + These keys are for dist ckpt compatibility with SequentialMLP. + """ + keys = deepcopy(incompatible_keys.unexpected_keys) + for key in keys: + if '_extra_state' in key: + incompatible_keys.unexpected_keys.remove(key) + + self.register_load_state_dict_post_hook(remove_extra_states_check) + + def forward(self, permuted_local_hidden_states, tokens_per_expert): + if permuted_local_hidden_states.nelement() != 0: + # Reshape the weights for the grouped GEMMs. + w1 = self.weight1.view(self.num_local_experts, self.config.hidden_size, -1) + w2 = self.weight2.view(self.num_local_experts, -1, self.config.hidden_size) + + fc1_output = gg.ops.gmm( + permuted_local_hidden_states, w1, tokens_per_expert, trans_b=False + ) + + intermediate_parallel = self.activation_func(fc1_output) + + fc2_output = gg.ops.gmm(intermediate_parallel, w2, tokens_per_expert, trans_b=False) + else: + # No token is allocated for local experts. + assert torch.count_nonzero(tokens_per_expert) == 0 + + # Make sure parameters still have gradients when no tokens are routed to this set of experts. + w1 = self.weight1.view(self.config.hidden_size, -1) + w2 = self.weight2.view(-1, self.config.hidden_size) + h = torch.matmul(permuted_local_hidden_states, w1) + h = self.activation_func(h) + h = torch.matmul(h, w2) + + fc2_output = h + + return fc2_output, None + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Maps local expert to global experts.""" + if self.moe_extended_tp: + raise NotImplementedError( + 'Currently distributed checkpointing is not supported for moe_extended_tp' + ) + + sharded_state_dict = {} + num_global_experts = ( + parallel_state.get_expert_model_parallel_world_size() * self.num_local_experts + ) + local_expert_indices_offset = ( + parallel_state.get_expert_model_parallel_rank() * self.num_local_experts + ) + tp_size = parallel_state.get_tensor_model_parallel_world_size() + tp_rank = parallel_state.get_tensor_model_parallel_rank() + + prepend_axis_num = len(sharded_offsets) + replica_id = ( + 0, + 0, + parallel_state.get_data_modulo_expert_parallel_rank(with_context_parallel=True), + ) + + @torch.no_grad() + def sh_ten_build_fn( + key: str, + t: torch.Tensor, + replica_id: ReplicaId, + flattened_range: Optional[slice], + tp_axis: int, + with_glu: bool, + ): + if tp_axis == 0: + real_shape = (self.num_local_experts, self.config.hidden_size, -1) + elif tp_axis == 1: + real_shape = (self.num_local_experts, -1, self.config.hidden_size) + assert with_glu == False + else: + raise ValueError("tp_axis should be 0 or 1.") + if flattened_range is None: + t = t.view(real_shape).transpose(-1, -2) + if with_glu: + local_tensors = torch.chunk(t, 2, -2) + sub_states = [ + ShardedTensor.from_rank_offsets( + key, + local_tensors[0].contiguous(), + *sharded_offsets, + ( + prepend_axis_num, + parallel_state.get_expert_model_parallel_rank(), + parallel_state.get_expert_model_parallel_world_size(), + ), + (prepend_axis_num + 1, tp_rank, tp_size * 2), + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + ), + ShardedTensor.from_rank_offsets( + key, + local_tensors[1].contiguous(), + *sharded_offsets, + ( + prepend_axis_num, + parallel_state.get_expert_model_parallel_rank(), + parallel_state.get_expert_model_parallel_world_size(), + ), + (prepend_axis_num + 1, tp_size + tp_rank, tp_size * 2), + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + ), + ] + else: + sub_states = ShardedTensor.from_rank_offsets( + key, + t.contiguous(), + *sharded_offsets, + ( + prepend_axis_num, + parallel_state.get_expert_model_parallel_rank(), + parallel_state.get_expert_model_parallel_world_size(), + ), + (prepend_axis_num + 1 + tp_axis, tp_rank, tp_size), + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + ) + else: + raise NotImplementedError( + 'Currently GroupedMLP does not support distributed checkpointing ' + 'with the distributed optimizer.' + ) + return sub_states + + @torch.no_grad() + def sh_ten_merge_fn(sub_state_dict, tp_axis: int, with_glu: bool): + if tp_axis == 0: + weight_shape = (self.config.hidden_size, -1) + elif tp_axis == 1: + weight_shape = (-1, self.config.hidden_size) + assert with_glu == False + else: + raise ValueError("tp_axis should be 0 or 1.") + if with_glu: + sub_state_dict = torch.cat(sub_state_dict, -2) + return sub_state_dict.transpose(-1, -2).reshape(weight_shape) + + state_dict = self.state_dict(prefix='', keep_vars=True) + # To align with SequentialMLP, the weight tensors are transposed, + # and the tp_axis is also for the transposed tensors + for name, tensor in state_dict.items(): + if name == 'weight1': + tp_axis = 0 + with_glu = self.config.gated_linear_unit + wkey = f'{prefix}experts.linear_fc1.weight' + else: + tp_axis = 1 + with_glu = False + wkey = f'{prefix}experts.linear_fc2.weight' + sharded_state_dict[f'{prefix}{name}'] = ShardedTensorFactory( + wkey, + tensor, + partial(sh_ten_build_fn, tp_axis=tp_axis, with_glu=with_glu), + partial(sh_ten_merge_fn, tp_axis=tp_axis, with_glu=with_glu), + replica_id, + ) + + replica_id = ( + 0, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_data_modulo_expert_parallel_rank(with_context_parallel=True), + ) + # Add fake _extra_state to be compatible with SequentialMLP + for expert_local_idx in range(self.num_local_experts): + expert_global_idx = local_expert_indices_offset + expert_local_idx + expert_sharded_offsets = ( + *sharded_offsets, + (len(sharded_offsets), expert_global_idx, num_global_experts), + ) + for mod in ['linear_fc1', 'linear_fc2']: + sharded_state_dict[f'{prefix}expert{expert_global_idx}.{mod}._extra_state'] = ( + make_sharded_object_for_checkpoint( + None, + f'{prefix}experts.{mod}._extra_state', + expert_sharded_offsets, + replica_id, + ) + ) + + return sharded_state_dict + + +class TEGroupedMLP(MegatronModule): + """An efficient implementation of the Experts layer using TE's GroupedLinear. + + This class is designed to execute multiple experts in parallel, thereby maximizing computational efficiency. + """ + + def __init__(self, num_local_experts, config: TransformerConfig, submodules: MLPSubmodules): + super().__init__(config=config) + self.moe_extended_tp = config.moe_extended_tp + self.num_local_experts = num_local_experts + self.input_size = self.config.hidden_size + + # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf + ffn_hidden_size = self.config.ffn_hidden_size + if self.config.gated_linear_unit: + ffn_hidden_size *= 2 + + self.linear_fc1 = build_module( + submodules.linear_fc1, + self.num_local_experts, + self.input_size, + ffn_hidden_size, + config=self.config, + init_method=self.config.init_method, + bias=self.config.add_bias_linear, + skip_bias_add=True, + is_expert=True, + tp_comm_buffer_name='fc1', + ) + + self.activation_func = self.config.activation_func + + self.linear_fc2 = build_module( + submodules.linear_fc2, + self.num_local_experts, + self.config.ffn_hidden_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + skip_bias_add=True, + is_expert=True, + tp_comm_buffer_name='fc2', + ) + + def remove_extra_states_check(self, incompatible_keys): + """ + Remove extra _extra_state from unexpected keys. + These keys are for dist ckpt compatibility with SequentialMLP. + """ + keys = deepcopy(incompatible_keys.unexpected_keys) + for key in keys: + if '_extra_state' in key: + incompatible_keys.unexpected_keys.remove(key) + + self.register_load_state_dict_post_hook(remove_extra_states_check) + + def forward( + self, permuted_local_hidden_states: torch.Tensor, tokens_per_expert: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Forward of TEGroupedMLP + + Args: + permuted_local_hidden_states (torch.Tensor): The permuted input hidden states of the + local experts. + tokens_per_expert (torch.Tensor): The number of tokens per expert. + + Return: + output (torch.Tensor): The output of the local experts. + """ + tokens_per_expert = tokens_per_expert.tolist() + intermediate_parallel, bias_parallel = self.linear_fc1( + permuted_local_hidden_states, tokens_per_expert + ) + + if self.config.bias_activation_fusion: + if self.activation_func == F.gelu: + if self.config.gated_linear_unit: + intermediate_parallel = bias_geglu_impl(intermediate_parallel, bias_parallel) + else: + assert self.config.add_bias_linear is True + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + elif self.activation_func == F.silu and self.config.gated_linear_unit: + intermediate_parallel = bias_swiglu_impl( + intermediate_parallel, + bias_parallel, + self.config.activation_func_fp8_input_store, + ) + else: + raise ValueError("Only support fusion of gelu and swiglu") + else: + if bias_parallel is not None: + intermediate_parallel = intermediate_parallel + bias_parallel + if self.config.gated_linear_unit: + + def glu(x): + x = torch.chunk(x, 2, dim=-1) + return self.config.activation_func(x[0]) * x[1] + + intermediate_parallel = glu(intermediate_parallel) + else: + intermediate_parallel = self.activation_func(intermediate_parallel) + + output, output_bias = self.linear_fc2(intermediate_parallel, tokens_per_expert) + + return output, output_bias + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[dict] = None + ) -> ShardedStateDict: + """ + Maps local expert to global experts. + The sharded state dict is interchangable with SequentialMLP's. + """ + if self.moe_extended_tp: + raise NotImplementedError( + 'Currently distributed checkpointing is not supported for moe_extended_tp' + ) + sharded_state_dict = {} + for name, module in self._modules.items(): + sub_sd = module.sharded_state_dict(f'{name}.', sharded_offsets, metadata) + if name == 'linear_fc1' and self.config.gated_linear_unit: + num_global_experts = ( + parallel_state.get_expert_model_parallel_world_size() * self.num_local_experts + ) + local_expert_indices_offset = ( + parallel_state.get_expert_model_parallel_rank() * self.num_local_experts + ) + ep_axis = len(sharded_offsets) + for i in range(self.num_local_experts): + new_sharded_offsets = ( + *sharded_offsets, + (ep_axis, local_expert_indices_offset + i, num_global_experts), + ) + for k in (f'{name}.weight{i}', f'{name}.bias{i}'): + if k in sub_sd: + sub_sd[k] = apply_swiglu_sharded_factory(sub_sd[k], new_sharded_offsets) + # Add prefix here to match sequential's keys + replace_prefix_for_sharding(sub_sd, f'{name}.', f'{prefix}experts.{name}.') + sharded_state_dict.update({f"{prefix}{k}": v for k, v in sub_sd.items()}) + return sharded_state_dict + + +class SequentialMLP(MegatronModule): + """An implementation of the Experts layer using a sequence of MLP layers. + + This class executes each expert sequentially. + """ + + def __init__(self, num_local_experts, config: TransformerConfig, submodules: MLPSubmodules): + super().__init__(config=config) + self.add_bias = config.add_bias_linear + self.moe_extended_tp = config.moe_extended_tp + self.num_local_experts = num_local_experts + self.local_experts = torch.nn.ModuleList() + for _ in range(self.num_local_experts): + expert = MLP(self.config, submodules, is_expert=True) + self.local_experts.append(expert) + + def forward(self, permuted_local_hidden_states, tokens_per_expert): + + output_local = torch.zeros_like(permuted_local_hidden_states) + output_bias_local = None + if self.add_bias: + output_bias_local = torch.zeros_like(permuted_local_hidden_states) + + cumsum_num_tokens = torch.cumsum(tokens_per_expert, dim=0) + # Insert zero at the begining for offset index's convenience + zero_tensor = torch.zeros(1, dtype=torch.long, device=cumsum_num_tokens.device) + cumsum_num_tokens = torch.cat((zero_tensor, cumsum_num_tokens)) + for expert_num, expert in enumerate(self.local_experts): + start = cumsum_num_tokens[expert_num] + end = cumsum_num_tokens[expert_num + 1] + hidden = permuted_local_hidden_states[start:end] + output, output_bias = expert(hidden) + + output_local[start:end] = output + if self.add_bias: + output_bias = output_bias.expand_as(output) + output_bias_local[start:end, :] = output_bias + + return output_local, output_bias_local + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Maps local expert to global experts.""" + if self.moe_extended_tp: + raise NotImplementedError( + 'Currently distributed checkpointing is not supported for moe_extended_tp' + ) + + sharded_state_dict = {} + num_global_experts = ( + parallel_state.get_expert_model_parallel_world_size() * self.num_local_experts + ) + local_expert_indices_offset = ( + parallel_state.get_expert_model_parallel_rank() * self.num_local_experts + ) + + expert_sharded_prefix = f'{prefix}experts.' + for expert_local_idx, expert in enumerate(self.local_experts): + expert_global_idx = local_expert_indices_offset + expert_local_idx + expert_state_dict_prefix = f'{prefix}local_experts.{expert_local_idx}.' + expert_sharded_offsets = ( + *sharded_offsets, + (len(sharded_offsets), expert_global_idx, num_global_experts), + ) + + expert_state_dict = expert.sharded_state_dict( + expert_state_dict_prefix, expert_sharded_offsets, metadata + ) + # Remove expert layers indexing from sharded keys + replace_prefix_for_sharding( + expert_state_dict, expert_state_dict_prefix, expert_sharded_prefix + ) + # Adjust replica ids - replication along DP modulo EP + for k, sh_ten in expert_state_dict.items(): + replica_id = sh_ten.replica_id + assert ( + len(replica_id) == 3 + ), f'Expected replica_id for {k} to be in (PP, TP, DP) format, got: {replica_id}' + sh_ten.replica_id = ( + *replica_id[:2], + parallel_state.get_data_modulo_expert_parallel_rank(with_context_parallel=True), + ) + + sharded_state_dict.update(expert_state_dict) + return sharded_state_dict diff --git a/megatron/core/transformer/moe/grouped_gemm_util.py b/megatron/core/transformer/moe/grouped_gemm_util.py new file mode 100644 index 0000000..e7ef79d --- /dev/null +++ b/megatron/core/transformer/moe/grouped_gemm_util.py @@ -0,0 +1,20 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +try: + import grouped_gemm +except ImportError: + grouped_gemm = None + + +def grouped_gemm_is_available(): + return grouped_gemm is not None + + +def assert_grouped_gemm_is_available(): + assert grouped_gemm_is_available(), ( + "Grouped GEMM is not available. Please run " + "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.0`." + ) + + +ops = grouped_gemm.ops if grouped_gemm_is_available() else None diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py new file mode 100644 index 0000000..1ea61ba --- /dev/null +++ b/megatron/core/transformer/moe/moe_layer.py @@ -0,0 +1,121 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from abc import ABC, abstractmethod + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.transformer.mlp import MLPSubmodules +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.moe.experts import GroupedMLP, SequentialMLP, TEGroupedMLP +from megatron.core.transformer.moe.router import TopKRouter +from megatron.core.transformer.moe.token_dispatcher import ( + MoEAllGatherTokenDispatcher, + MoEAlltoAllTokenDispatcher, +) +from megatron.core.transformer.transformer_config import TransformerConfig + + +class BaseMoELayer(MegatronModule, ABC): + """Base class for a mixture of experts layer. + + Args: + config (TransformerConfig): Configuration object for the transformer model. + """ + + def __init__(self, config: TransformerConfig, layer_number: int = None): + super(BaseMoELayer, self).__init__(config) + self.config = config + self.expert_parallel_size = parallel_state.get_expert_model_parallel_world_size() + assert self.expert_parallel_size > 0, "Expected non-negative expert parallel size" + + if self.config.moe_extended_tp: + self.num_local_experts = self.config.num_moe_experts + local_expert_indices_offset = 0 + else: + assert self.config.num_moe_experts % self.expert_parallel_size == 0 + self.num_local_experts = self.config.num_moe_experts // self.expert_parallel_size + local_expert_indices_offset = ( + parallel_state.get_expert_model_parallel_rank() * self.num_local_experts + ) + + self.local_expert_indices = [ + local_expert_indices_offset + i for i in range(self.num_local_experts) + ] + assert all(map(lambda x: x < self.config.num_moe_experts, self.local_expert_indices)) + self.router = None + self.experts = None + self.token_dispatcher = None + self.layer_number = layer_number + + @abstractmethod + def forward(self, hidden_states): + pass + + def set_layer_number(self, layer_number: int): + self.layer_number = layer_number + self.router.set_layer_number(layer_number) + + +class MoELayer(BaseMoELayer): + """Mixture of experts Layer **currently only supports no token dropping**. + + Args: + BaseMoELayer (MegatronModule): Base class for MoE layers + """ + + def __init__( + self, config: TransformerConfig, submodules: MLPSubmodules = None, layer_number: int = None + ): + self.submodules = submodules + super(MoELayer, self).__init__(config=config, layer_number=layer_number) + self.router = TopKRouter(config=self.config) + if self.config.moe_grouped_gemm: + if isinstance(self.submodules, MLPSubmodules): + self.experts = TEGroupedMLP(self.num_local_experts, self.config, self.submodules) + else: + self.experts = GroupedMLP(self.num_local_experts, self.config) + else: + assert isinstance(self.submodules, MLPSubmodules) + self.experts = SequentialMLP(self.num_local_experts, self.config, self.submodules) + if config.moe_token_dispatcher_type == "allgather": + self.token_dispatcher = MoEAllGatherTokenDispatcher( + self.num_local_experts, self.local_expert_indices, config=self.config + ) + elif config.moe_token_dispatcher_type == "alltoall": + self.token_dispatcher = MoEAlltoAllTokenDispatcher( + self.num_local_experts, self.local_expert_indices, config=self.config + ) + else: + raise ValueError( + f"Unsupported token dispatcher type: {config.moe_token_dispatcher_type}" + ) + self.moe_layer_recompute = config.moe_layer_recompute + + def forward(self, hidden_states: torch.Tensor): + if ( + self.training + and self.config.tensor_model_parallel_size > 1 + and not self.config.sequence_parallel + ): + raise ValueError( + "During training, performance may degrade if MoE and tensor parallelism" + "are enabled without also enabling sequence parallelism." + ) + + # process MoE + def custom_forward(hidden_states): + probs, indices = self.router(hidden_states) + (dispatched_input, tokens_per_expert) = self.token_dispatcher.token_permutation( + hidden_states, probs, indices + ) + expert_output, mlp_bias = self.experts(dispatched_input, tokens_per_expert) + output, mlp_bias = self.token_dispatcher.token_unpermutation(expert_output, mlp_bias) + return output, mlp_bias + + if self.moe_layer_recompute: + output, mlp_bias = tensor_parallel.checkpoint(custom_forward, False, hidden_states) + else: + output, mlp_bias = custom_forward(hidden_states) + + return output, mlp_bias diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py new file mode 100644 index 0000000..c0c10a2 --- /dev/null +++ b/megatron/core/transformer/moe/moe_utils.py @@ -0,0 +1,505 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import math + +import torch + +from megatron.core import parallel_state + + +def switch_load_balancing_loss_func( + probs: torch.Tensor, + tokens_per_expert: torch.Tensor, + topk: int, + moe_aux_loss_coeff: float, + sequence_partition_group=None, +): + """Calculate the auxiliary loss for load balancing. + Refer to the Switch Transformer paper (https://arxiv.org/abs/2101.03961) for details. + + Args: + probs (torch.Tensor): Softmax probabilities output by the router for each token. [num_tokens, num_experts] + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. [num_experts] + topk (int): The number of experts selected for each token. + moe_aux_loss_coeff (float): The coefficient for the auxiliary loss. + sequence_partition_group (optional): The parallel group over which the sequence is partitioned. If None, no partitioning is applied. Defaults to None. + + Returns: + torch.Tensor: The auxiliary loss for load balancing. + """ + num_sub_sequence = 1 + + # If the sequence is partitioned by certain parallelism strategies like Sequence Parallelism or Context Parallelism, compute the gradient of the auxiliary loss with respect to the full sequence. + if sequence_partition_group is not None: + # We can keep `aggregated_probs_per_expert` local since we don't need the gradient for `tokens_per_expert`, saving one allreduce operation for `aggregated_probs_per_expert`. + num_sub_sequence = torch.distributed.get_world_size(sequence_partition_group) + torch.distributed.all_reduce(tokens_per_expert, group=sequence_partition_group) + + num_tokens = probs.shape[0] * num_sub_sequence + num_experts = probs.shape[1] + + # The formula of aux_loss: aux_loss = sum((probs_per_expert/num_tokens) * (tokens_per_expert/(num_tokens*topk))) * num_experts * moe_aux_loss_coeff. + # This can be simplified to fuse the division and multiplication operations. + aggregated_probs_per_expert = probs.sum(dim=0) + aux_loss = torch.sum(aggregated_probs_per_expert * tokens_per_expert) * ( + num_experts * moe_aux_loss_coeff / (num_tokens * num_tokens * topk) + ) + return aux_loss + + +def z_loss_func(logits, z_loss_coeff): + """Encourages the router's logits to remain small to enhance stability. + Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details. + + Args: + logits (torch.Tensor): The logits of the router. + + Returns: + torch.Tensor: The logits after applying the z-loss. + """ + + z_loss = torch.mean(torch.square(torch.logsumexp(logits, dim=-1))) * z_loss_coeff + return z_loss + + +def sinkhorn(cost: torch.Tensor, tol: float = 0.0001): + """Sinkhorn based MoE routing function""" + cost = torch.exp(cost) + d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype) + d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype) + + eps = 0.00000001 + error = 1e9 + d1_old = d1 + while error > tol: + d0 = (1 / d0.size(0)) * 1 / (torch.sum(d1 * cost, 1) + eps) + d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps) + error = torch.mean(torch.abs(d1_old - d1)) + d1_old = d1 + return d1 * cost * d0.unsqueeze(1) + + +def get_capacity(num_tokens: int, num_experts: int, capacity_factor: float, min_capacity=None): + """ + Calculate the capacity of each expert. + + Args: + num_tokens (int): num of the input tokens. + num_experts (int): num of the experts. + capacity_factor (float): Capacity factor. + min_capacity (int, optional): Minimum capacity. Defaults to None. + + Returns: + Tensor: Capacity of each expert. + """ + capacity = math.ceil((num_tokens / num_experts) * capacity_factor) + if min_capacity is not None and capacity < min_capacity: + capacity = min_capacity + return capacity + + +class MoEAuxLossAutoScaler(torch.autograd.Function): + """An AutoScaler that compute and scales the grad for auxiliary loss.""" + + main_loss_backward_scale: torch.Tensor = torch.tensor(1.0) + + @staticmethod + def forward(ctx, output: torch.Tensor, aux_loss: torch.Tensor): + """Preserve the aux_loss by storing it in the context to avoid garbage collection. + + Args: + output (torch.Tensor): The output tensor. + aux_loss (torch.Tensor): The auxiliary loss tensor. + + Returns: + torch.Tensor: The output tensor. + """ + ctx.save_for_backward(aux_loss) + return output + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + """Compute and scale the gradient for auxiliary loss.. + + Args: + grad_output (torch.Tensor): The gradient of the output. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The gradient of the output, scaled auxiliary loss gradient. + """ + (aux_loss,) = ctx.saved_tensors + aux_loss_backward_scale = MoEAuxLossAutoScaler.main_loss_backward_scale + scaled_aux_loss_grad = torch.ones_like(aux_loss) * aux_loss_backward_scale + return grad_output, scaled_aux_loss_grad + + @staticmethod + def set_loss_scale(scale: torch.Tensor): + """set the scale of the aux loss. + + Args: + scale (torch.Tensor): The scale value to set. Please ensure that the scale passed in matches the scale of the main_loss. + """ + MoEAuxLossAutoScaler.main_loss_backward_scale = scale + + +def permute(tokens, indices, num_out_tokens: int = None, padded_mode: bool = False): + """Permute the tokens based on the indices. Token with the same index will be grouped together. + The input indices shape is [tokens, top_k], it indicates which experts were selected by each token separately. + Args: + tokens (torch.Tensor): The input token tensor. + indices (torch.Tensor): The token to expert indices tensor, should have a shape of [num_tokens] or [num_tokens, topk]. + num_out_tokens (int, optional): The effective output token count, when enabling the capacity factor, should equal the number of tokens not dropped. By default, set to None, meaning no tokens are dropped. + padded_mode (bool, optional): If True, indicating the indices are padded to [num_expert, capacity] to denote selected tokens per expert. Defaults to False. + + Returns: + torch.Tensor: The permuted tensor. + torch.Tensor: The sorted_indices corresponding permuted tensor. + """ + if padded_mode: + return permute_with_padded_tokens(tokens, indices) + + if indices.dim() == 1: + topk = 1 + else: + topk = indices.size(1) + flatten_indices = indices.view(-1) + sorted_indices = torch.argsort(flatten_indices, stable=True) + if num_out_tokens is not None: + sorted_indices = sorted_indices[:num_out_tokens] + permuted_tokens = tokens.index_select(0, sorted_indices // topk) + return permuted_tokens, sorted_indices + + +def unpermute( + permuted_tokens: torch.Tensor, + sorted_indices: torch.Tensor, + probs: torch.Tensor = None, + padded_mode: bool = False, + restore_shape: torch.Size = None, +): + """Unpermute a tensor of permuted tokens based on sorted indices, and optionally merge the tokens with their corresponding probabilities. + + Args: + permuted_tokens (torch.Tensor): The tensor of permuted tokens to be unpermuted. + sorted_indices (torch.Tensor): The tensor of sorted indices used to unpermute the tokens. + probs (torch.Tensor, optional): The tensor of probabilities corresponding to the permuted tokens. If provided, the unpermuted tokens will be merged with their respective probabilities. + padded_mode (bool, optional): If True, indicating the indices are padded to [num_expert, capacity] to denote selected tokens per expert. Defaults to False. + restore_shape (torch.Size, optional): The input shape before permutation, only used in padding mode. Defaults to None. + + Returns: + torch.Tensor: The unpermuted tokens, optionally merged with probabilities. + """ + if padded_mode: + return unpermute_with_padded_tokens( + permuted_tokens, sorted_indices, probs, restore_shape=restore_shape + ) + + assert sorted_indices.numel() == permuted_tokens.size(0) + if probs is not None: + # Unpermute and merge the tokens with their probabilities + num_unpermuted_tokens = probs.numel() + topk = probs.size(1) + else: + # Unpermute the tokens without merge + num_unpermuted_tokens = permuted_tokens.size(0) + topk = 1 + + unpermuted_tokens = torch.zeros( + [num_unpermuted_tokens, permuted_tokens.shape[-1]], + dtype=permuted_tokens.dtype, + device=permuted_tokens.device, + ) + unpermuted_tokens.index_copy_(0, sorted_indices, permuted_tokens) + unpermuted_tokens = unpermuted_tokens.reshape(-1, topk, permuted_tokens.size(-1)) + if probs is not None: + unpermuted_tokens = unpermuted_tokens * probs.unsqueeze(-1) + unpermuted_tokens = unpermuted_tokens.sum(dim=1) + + return unpermuted_tokens + + +def permute_with_padded_tokens(tokens, indices): + """Permute the tokens based on the indices, only used in padding mode. + The input indices shape is [num_expert, capacity], it indicates which tokens were selected by each expert separately. + Args: + tokens (torch.Tensor): The input token tensor. + indices (torch.Tensor): A tensor with shape [num_expert, capacity], indicating the selected tokens for each expert. + + Returns: + torch.Tensor: The permuted tensor. + torch.Tensor: The sorted_indices corresponding permuted tensor. + """ + permuted_tokens = tokens.index_select(dim=0, index=indices.view(-1)) + + return permuted_tokens, indices + + +def unpermute_with_padded_tokens( + permuted_tokens: torch.Tensor, + indices: torch.Tensor, + probs: torch.Tensor, + restore_shape: torch.Size, +) -> torch.Tensor: + """ + Unpermutes a padded permuted tokens based on sorted indices and merges the tokens with their corresponding probabilities. + + This function takes a tensor of permuted tokens and reorders them according to the provided indices. It also combines the tokens with their associated probabilities. + + Parameters: + permuted_tokens (torch.Tensor): A 2D tensor containing permuted tokens. + indices (torch.Tensor): A tensor with shape [num_expert, capacity], indicating the selected tokens for each expert. + probs (torch.Tensor): A tensor with the same shape as indices, containing probabilities corresponding to each token. + restore_shape (torch.Size): The target shape for the unpermuted tokens tensor. + + Returns: + torch.Tensor: A tensor of unpermuted tokens, merged with their probabilities. + + """ + # Ensure permuted_tokens is 2D + assert permuted_tokens.dim() == 2, f"Got {permuted_tokens.dim()}D." + + # Reshape and expand probabilities and indices to match permuted_tokens + probs = probs.view(-1).unsqueeze(-1) + indices = indices.view(-1, 1).expand(-1, permuted_tokens.shape[1]) + assert ( + permuted_tokens.shape == indices.shape + ), "Shape mismatch between permuted_tokens and indices." + + # Combine tokens with their probabilities + combined_output = probs * permuted_tokens + + # Prepare a tensor of zeros with the desired output shape + empty_tokens = torch.zeros( + restore_shape, + dtype=combined_output.dtype, + device=combined_output.device, + ) + + # Scatter the combined tokens back to their original positions + unpermuted_tokens = torch.scatter_add(empty_tokens, 0, indices, combined_output) + + return unpermuted_tokens + + +def topk_softmax_with_capacity( + logits: torch.Tensor, + topk: int, + capacity_factor: float = None, + pad_to_capacity: bool = False, + drop_policy: str = "probs", + use_pre_softmax: bool = False, +): + """Apply capacity and padding to the top-k selection. + Args: + logits (torch.Tensor): Logits tensor. + topk (int): The number of experts to select for each token. + capacity_factor (int): The capacity factor of each expert. Will drop tokens if the number of tokens exceeds the capacity. + pad_to_capacity (bool): Whether to need padding in token drop mode. + drop_policy (str): The policy to drop tokens. Can be either "prob" or "position". If "prob", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped. + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Probs, indices and tokens_per_expert tensor. + + (1) If there's no token padding, the shape of probs and indices is [tokens, top_k], indicating the selected experts for each token. + (2) If there's token padding, the shape of probs and indices is [num_expert, capacity], indicating the tokens selected for each expert. + """ + assert logits.dim() == 2, f"Expected 2D logits [num_tokens, num_experts], got {logits.dim()}." + num_tokens = logits.shape[0] + num_experts = logits.shape[1] + if use_pre_softmax: + # Pre softmax + scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits) + probs, top_indices = torch.topk(scores, k=topk, dim=1) + else: + # Post softmax + if topk == 1: + # Requires applying softmax before selecting the top-k when k is 1, since softmax on a [num_tokens, 1] would yield a zero gradient. + raise ValueError("Please use --moe-router-pre-softmax when topk is 1.") + scores, top_indices = torch.topk(logits, k=topk, dim=1) + probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(logits) + + if capacity_factor is None: + # TopK without capacity + tokens_per_expert = torch.bincount(top_indices.view(-1), minlength=num_experts) + return probs, top_indices, tokens_per_expert + else: + # TopK with capacity + expert_capacity = get_capacity( + num_tokens=num_tokens * topk, + num_experts=num_experts, + capacity_factor=capacity_factor, + ) + # TopK selection, Maskout unused experts + topk_masked_gates = torch.zeros_like(logits).scatter(1, top_indices, probs) + topk_mask = torch.zeros_like(logits).scatter(1, top_indices, 1) + + # Maskout exceeded tokens + if drop_policy == "probs": + capacity_probs, capacity_indices = torch.topk( + topk_masked_gates, k=expert_capacity, dim=0, sorted=False + ) + capacity_mask = torch.zeros_like(logits).scatter(0, capacity_indices, 1) + elif drop_policy == "position": + _, capacity_indices = torch.topk(topk_mask, k=expert_capacity, dim=0, sorted=False) + capacity_mask = torch.zeros_like(logits).scatter(0, capacity_indices, 1) + capacity_probs = torch.gather(topk_masked_gates, 0, capacity_indices) + else: + raise ValueError(f"Invalid drop_policy: {drop_policy}") + + if pad_to_capacity: + final_probs, final_indices = ( + capacity_probs.T.contiguous(), + capacity_indices.T.contiguous(), + ) + tokens_per_expert_before_capacity = topk_mask.sum(dim=0) + else: + # Get exceed mask and maskout exceeded probs and indices + final_mask = torch.logical_and(topk_mask, capacity_mask) + drop_mask = torch.logical_not(final_mask) + exceed_mask = torch.gather(drop_mask, 1, top_indices) + final_probs = probs * torch.logical_not(exceed_mask) + final_indices = top_indices.clone().masked_fill_( + exceed_mask, torch.iinfo(torch.long).max + ) + tokens_per_expert_before_capacity = topk_mask.sum(dim=0) + return final_probs, final_indices, tokens_per_expert_before_capacity + + +def save_to_aux_losses_tracker( + name: str, + loss: torch.Tensor, + layer_number: int, + num_layers: int, + reduce_group: torch.distributed.ProcessGroup = None, + avg_group: torch.distributed.ProcessGroup = None, +): + """Save the auxiliary loss for logging. + Args: + name (str): The name of the loss. + loss (torch.Tensor): The loss tensor. + layer_number (int): Layer index of the loss. + num_layers (int): The number of total layers. + reduce_group (torch.distributed.ProcessGroup): The group for reducing the loss. + mean_group (torch.distributed.ProcessGroup): The group for averaging the loss. + """ + # Skip aux loss logging if layer_number is None. + if layer_number is None: + return + + tracker = parallel_state.get_moe_layer_wise_logging_tracker() + if name not in tracker: + tracker[name] = {} + tracker[name]["values"] = torch.zeros(num_layers, device=loss.device) + tracker[name]["values"][layer_number - 1] += loss.detach() # Aggregate the loss for the layer. + tracker[name]["reduce_group"] = reduce_group + tracker[name]["avg_group"] = avg_group + + +def clear_aux_losses_tracker(): + """Clear the auxiliary losses.""" + tracker = parallel_state.get_moe_layer_wise_logging_tracker() + for name in tracker: + tracker[name]["values"].zero_() + tracker[name]["reduce_group"] = None + tracker[name]["avg_group"] = None + + +def reduce_aux_losses_tracker_across_ranks(): + """Collect and reduce the auxiliary losses across ranks.""" + tracker = parallel_state.get_moe_layer_wise_logging_tracker() + for name in tracker: + values = tracker[name]["values"] + # Collect aux losses across PP. + torch.distributed.all_reduce( + values, group=parallel_state.get_pipeline_model_parallel_group() + ) + # Reduce aux losses across ranks. + if tracker[name].get('reduce_group') is not None: + torch.distributed.all_reduce(values, group=tracker[name].get('reduce_group')) + if tracker[name].get('avg_group') is not None: + torch.distributed.all_reduce( + values, + group=tracker[name]['avg_group'], + op=torch.distributed.ReduceOp.AVG, + ) + + +def track_moe_metrics( + loss_scale, iteration, writer, wandb_writer=None, total_loss_dict=None, per_layer_logging=False +): + # Aux loss logging + reduce_aux_losses_tracker_across_ranks() + tracker = parallel_state.get_moe_layer_wise_logging_tracker() + if writer is not None: + aux_losses = {k: v['values'].float() * loss_scale for k, v in tracker.items()} + for name, loss_list in aux_losses.items(): + if total_loss_dict is not None: + if name not in total_loss_dict: + total_loss_dict[name] = loss_list.mean() + else: + total_loss_dict[name] += loss_list.mean() + + # currently when using add_scalars, + # torch.utils.add_scalars makes each timer its own run, which + # polutes the runs list, so we just add each as a scalar + writer.add_scalar(name, loss_list.mean(), iteration) + if per_layer_logging: + for i, loss in enumerate(loss_list.tolist()): + writer.add_scalar(f"moe/{name}_layer_{i}", loss, iteration) + + # W&B logging lacks support for logging multiple scalars simultaneously. + # As a workaround, we log each scalar individually first, then we can create + # a custom panel to manually group them to a single plot. + if wandb_writer: + wandb_writer.log({f"{name}": loss_list.mean()}, iteration) + if per_layer_logging: + wandb_writer.log( + { + f"moe/{name}_layer_{i}": loss + for i, loss in enumerate(loss_list.tolist()) + }, + iteration, + ) + + clear_aux_losses_tracker() + + +class moe_gather(torch.autograd.Function): + @staticmethod + def forward(ctx, input_, map_): + ctx.input_size = input_.size() + ctx.map = map_ + return torch.gather(input_, 0, map_) + + @staticmethod + def backward(ctx, grad_output): + input_size = ctx.input_size + map_ = ctx.map + + output = torch.zeros( + input_size, dtype=grad_output.dtype, device=torch.cuda.current_device() + ) + output.scatter_add_(0, map_, grad_output) + return output, None, None + + +class moe_scatter(torch.autograd.Function): + @staticmethod + def forward(ctx, input_, map_, output_size=None): + ctx.map = map_ + + if output_size is not None: + output = torch.zeros( + output_size, dtype=input_.dtype, device=torch.cuda.current_device() + ) + else: + output = torch.zeros_like(input_) + + output.scatter_add_(0, map_, input_) + return output + + @staticmethod + def backward(ctx, grad_output): + map_ = ctx.map + grad_input = torch.gather(grad_output, 0, map_) + return grad_input, None, None, None diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py new file mode 100644 index 0000000..eee1aa2 --- /dev/null +++ b/megatron/core/transformer/moe/router.py @@ -0,0 +1,313 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from abc import ABC, abstractmethod + +import torch + +from megatron.core import parallel_state +from megatron.core.tensor_parallel import ( + gather_from_sequence_parallel_region, + get_cuda_rng_tracker, + get_data_parallel_rng_tracker_name, +) +from megatron.core.tensor_parallel.random import ( + get_cuda_rng_tracker, + get_data_parallel_rng_tracker_name, +) +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.moe.moe_utils import ( + MoEAuxLossAutoScaler, + save_to_aux_losses_tracker, + sinkhorn, + switch_load_balancing_loss_func, + topk_softmax_with_capacity, + z_loss_func, +) +from megatron.core.transformer.transformer_config import TransformerConfig + + +class Router(ABC, MegatronModule): + """Base Router class""" + + def __init__(self, config: TransformerConfig) -> None: + """ + Initialize the Router module. + + Args: + config (TransformerConfig): Configuration object for the Transformer model. + """ + super().__init__(config) + self.config = config + self.num_experts = self.config.num_moe_experts + self.moe_aux_loss_func = None + self.layer_number = None + + # Initialize the gate weights. + self.weight = torch.nn.Parameter( + torch.empty((self.config.num_moe_experts, self.config.hidden_size)) + ) + if config.perform_initialization: + if get_cuda_rng_tracker().is_initialized(): + with get_cuda_rng_tracker().fork(get_data_parallel_rng_tracker_name()): + config.init_method(self.weight) + else: + config.init_method(self.weight) + setattr(self.weight, 'sequence_parallel', config.sequence_parallel) + + def gating(self, input: torch.Tensor): + """Forward pass of the router gate. + + Args: + input (torch.Tensor): Input tensor. + + Returns: + torch.Tensor: Logits tensor. + """ + logits = torch.nn.functional.linear(input, self.weight) + return logits + + @abstractmethod + def routing(self, logits: torch.Tensor): + """Routing function. + + Args: + logits (torch.Tensor): Logits tensor. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Tuple of tensors representing max probs and the indices. + """ + raise NotImplementedError("Routing function not implemented.") + + @abstractmethod + def forward(self, input: torch.Tensor): + """ + Forward pass of the router. + + Args: + input (torch.Tensor): Input tensor. + """ + raise NotImplementedError("Forward function not implemented.") + + def set_layer_number(self, layer_number: int): + """Set the layer number for the router.""" + self.layer_number = layer_number + + +class TopKRouter(Router): + """Route each token to the top-k experts.""" + + def __init__( + self, + config: TransformerConfig, + ) -> None: + """Initialize the zero token dropping router. + + Args: + config (TransformerConfig): The configuration for the transformer model. + """ + super().__init__(config=config) + self.topk = self.config.moe_router_topk + self.routing_type = self.config.moe_router_load_balancing_type + self.input_jitter = None + + def sinkhorn_load_balancing(self, logits: torch.Tensor): + """Apply sinkhorn routing to the logits tensor. + + Args: + logits (torch.Tensor): The logits tensor. + + Returns: + torch.Tensor: The logits tensor after applying sinkhorn routing. + """ + + def _sinkhorn_activation(logits): + if self.topk == 1: + logits = torch.sigmoid(logits) + else: # k > 1 + logits = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits) + return logits + + assert self.config.moe_aux_loss_coeff == 0, "Sinkhorn routing does not support aux loss." + if self.training: + with torch.no_grad(): + norm_logits = sinkhorn( + logits.to(dtype=torch.float32) + ) # explicit fp32 conversion for stability + _, indices = torch.topk(norm_logits, k=self.topk, dim=1) + logits = _sinkhorn_activation(logits) + scores = torch.gather(logits, 1, indices) + else: + logits = _sinkhorn_activation(logits) + scores, indices = torch.topk(logits, k=self.topk, dim=1) + return scores, indices + + def aux_loss_load_balancing(self, logits: torch.Tensor): + """Apply loss-based load balancing to the logits tensor. + + Args: + logits (torch.Tensor): the logits tensor after gating, shape: [num_tokens, num_experts]. + + Returns: + probs (torch.Tensor): the probabilities tensor after load balancing. + indices (torch.Tensor): the indices tensor after top-k selection. + """ + probs, indices, tokens_per_expert = topk_softmax_with_capacity( + logits, + self.topk, + capacity_factor=self.config.moe_expert_capacity_factor, + pad_to_capacity=self.config.moe_pad_expert_input_to_capacity, + drop_policy=self.config.moe_token_drop_policy, + use_pre_softmax=self.config.moe_router_pre_softmax, + ) + + if self.training: + # Apply load balancing loss + scores = torch.softmax(logits, dim=-1, dtype=torch.float32) + probs = self.apply_load_balancing_loss(scores, tokens_per_expert, activation=probs) + return probs, indices + + def apply_load_balancing_loss( + self, + probs: torch.Tensor, + num_local_tokens_per_expert: torch.Tensor, + activation: torch.Tensor, + ): + """Applies auxiliary loss to the MoE layer. + + Args: + probs (torch.Tensor): The probs output by the router for each token. [num_tokens, num_experts] + num_local_tokens_per_expert (torch.Tensor): The number of tokens per expert. [num_experts] + activation (torch.Tensor): The activation tensor to attach the gradient function to. + + Returns: + torch.Tensor: The activation tensor with the attached gradient function. + """ + moe_aux_loss_coeff = self.config.moe_aux_loss_coeff + sequence_partition_group = None + if self.config.moe_token_dispatcher_type == "allgather": + sequence_partition_group = parallel_state.get_tensor_and_context_parallel_group() + elif self.config.moe_token_dispatcher_type == "alltoall": + sequence_partition_group = parallel_state.get_context_parallel_group() + moe_aux_loss_coeff /= parallel_state.get_tensor_model_parallel_world_size() + + aux_loss = switch_load_balancing_loss_func( + probs, + num_local_tokens_per_expert, + self.topk, + moe_aux_loss_coeff, + sequence_partition_group=sequence_partition_group, + ) + save_to_aux_losses_tracker( + "load_balancing_loss", + aux_loss / moe_aux_loss_coeff, + self.layer_number, + self.config.num_layers, + reduce_group=sequence_partition_group, + ) + activation = MoEAuxLossAutoScaler.apply(activation, aux_loss) + return activation + + def apply_z_loss(self, logits): + """Encourages the router's logits to remain small to enhance stability. + Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details. + + Args: + logits (torch.Tensor): The logits of the router. + + Returns: + torch.Tensor: The logits after applying the z-loss. + """ + if self.config.moe_z_loss_coeff is not None and self.training: + moe_z_loss_coeff = ( + self.config.moe_z_loss_coeff + / parallel_state.get_tensor_and_context_parallel_world_size() + ) + z_loss = z_loss_func(logits, moe_z_loss_coeff) + logits = MoEAuxLossAutoScaler.apply(logits, z_loss) + save_to_aux_losses_tracker( + "z_loss", + z_loss / moe_z_loss_coeff, + self.layer_number, + self.config.num_layers, + ) + return logits + + def apply_input_jitter(self, input: torch.Tensor): + """Add noise to the input tensor. + Refer to https://arxiv.org/abs/2101.03961. + + Args: + input (Tensor): Input tensor. + + Returns: + Tensor: Jittered input. + """ + if self.config.moe_input_jitter_eps is not None: + eps = self.config.moe_input_jitter_eps + if self.input_jitter is None: + self.input_jitter = torch.distributions.uniform.Uniform( + torch.tensor(1.0 - eps, device=input.device), + torch.tensor(1.0 + eps, device=input.device), + ).rsample + return input * self.input_jitter(input.shape) + else: + return input + + def routing(self, logits: torch.Tensor): + """Top-k routing function + + Args: + logits (torch.Tensor): Logits tensor after gating. + + Returns: + probs (torch.Tensor): the probabilities tensor after load balancing. + indices (torch.Tensor): the indices tensor after top-k selection. + """ + logits = logits.view(-1, self.config.num_moe_experts) + + # Apply Z-Loss + logits = self.apply_z_loss(logits) + + if ( + parallel_state.get_tensor_model_parallel_world_size() > 1 + and self.config.moe_token_dispatcher_type == "alltoall" + ): + # Gather the logits from the TP region + logits = gather_from_sequence_parallel_region(logits) + + if self.routing_type == "sinkhorn": + scores, indices = self.sinkhorn_load_balancing(logits) + elif self.routing_type == "aux_loss": + scores, indices = self.aux_loss_load_balancing(logits) + elif self.routing_type == "none": + # A naive top-k routing without load balancing + scores, indices, _ = topk_softmax_with_capacity( + logits, + self.topk, + capacity_factor=self.config.moe_expert_capacity_factor, + pad_to_capacity=self.config.moe_pad_expert_input_to_capacity, + drop_policy=self.config.moe_token_drop_policy, + use_pre_softmax=self.config.moe_router_pre_softmax, + ) + else: + raise ValueError(f"Unsupported MoE routing type: {self.routing_type}") + + return scores, indices + + def forward(self, input: torch.Tensor): + """ + Forward pass of the router. + + Args: + input (torch.Tensor): Input tensor. + """ + self.hidden = input.shape[-1] + + # Apply input jitter + input = self.apply_input_jitter(input) + logits = self.gating(input) + logits = logits.view(-1, self.config.num_moe_experts) + + scores, indices = self.routing(logits) + + return scores, indices diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py new file mode 100644 index 0000000..377403a --- /dev/null +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -0,0 +1,592 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from abc import abstractmethod +from typing import List, Optional, Tuple + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.tensor_parallel.mappings import _gather_along_first_dim_expert_parallel +from megatron.core.transformer.moe.moe_utils import moe_gather, moe_scatter, permute, unpermute +from megatron.core.transformer.transformer_config import TransformerConfig + + +class MoETokenDispatcher: + """ + MoE Token Dispatcher + """ + + def __init__(self, config: TransformerConfig) -> None: + """ + Initialize the MoE Token Dispatcher. + """ + self.config = config + + @abstractmethod + def token_permutation( + self, + tokens: torch.Tensor, + indices: torch.Tensor, + ): + """Dispatch tokens to experts. + + Args: + tokens (torch.Tensor): Input tokens. + indices (torch.Tensor): indices tensor. + + Returns: + torch.Tensor: Tokens tensor. + """ + raise NotImplementedError("Dispatch function not implemented.") + + @abstractmethod + def token_unpermutation( + self, + expert_output: torch.Tensor, + probs: torch.Tensor, + indices: torch.Tensor, + ): + """Restores the expert output to its original ordering. + + Args: + expert_output (torch.Tensor): The output tensor from the expert models. + probs (torch.Tensor): Each token's score with each expert. + indices (torch.Tensor): The indices used to reorder the expert output. + + Returns: + (torch.Tensor, torch.Tensor): Unpermuted activation and optional bias. + """ + raise NotImplementedError("Restore function not implemented.") + + +class MoEAllGatherTokenDispatcher(MoETokenDispatcher): + """ + AllGather Based Token dispatcher. + """ + + def __init__( + self, + num_local_experts: int, + local_expert_indices: List[int], + config: TransformerConfig, + ) -> None: + """ + Initialize the zero token dropping router. + """ + super().__init__(config=config) + self.num_local_experts = num_local_experts + assert self.num_local_experts > 0, "Expected at least one expert" + self.local_expert_indices = local_expert_indices + assert len(self.local_expert_indices) > 0, "Expected at least one local expert index" + self.router_topk = config.moe_router_topk + self.add_bias = config.add_bias_linear + + # self.local_probs: probs of global token assignment to local experts. + self.local_probs = None + + # self.indices: The indices of `local_indices` (which holds the un-sorted expert indices of tokens that local expert can process) that give its sorted order along dim 0. + self.indices = None + + # self.global_local_map: 2D tensor. A mask of mapping between global and local tokens where each element is True if it's between the local_expert_indices. Only useful when cross device token permutation is enabled and **AllGahter** is performed. + self.global_local_map = None + + def token_permutation( + self, hidden_states: torch.Tensor, max_prob: torch.Tensor, max_ind: torch.Tensor + ): + """Dispatch tokens to local experts. It's composed of two stages: + (1) Permute the tokens across the expert parallel devices. After this stage, + each device receives all of the tokens assigned to its local set of experts + in its local HBM. + (2) Permute the tokens locally so that they are grouped by their expert + assignment. After the stage (1), the tokens are grouped by which device + they came from. We re-order them locally for subsequent efficient computation. + + Args: + hidden_states: input tokens of shape [SeqLen/TP, MBS, HiddenSize] + max_prob: probs of local token assignment to global experts. + max_ind: token assignment to local experts. + + Returns: + permuted_local_hidden_states: Permutation of tokens to local experts group. + tokens_per_expert: the number of tokens each local expert to process. + """ + self.hidden_shape = hidden_states.shape + # [S/TP, B, H] -> [S*B/TP, H] + hidden_states = hidden_states.view(-1, self.hidden_shape[-1]) + + # Permute the tokens across the expert parallel devices. + if (self.config.tensor_model_parallel_size > 1) or ( + self.config.expert_model_parallel_size > 1 + ): + with torch.no_grad(): + global_indices = tensor_parallel.gather_from_sequence_parallel_region_to_moe( + max_ind + ) + # Create a mask of mapping between global and local tokens where each + # element is True if it's between the local_expert_indices + global_local_mask = (global_indices >= self.local_expert_indices[0]) & ( + global_indices <= self.local_expert_indices[-1] + ) + local_indices = global_indices.masked_select(global_local_mask) + + if self.router_topk > 1: # k > 1 + global_probs = tensor_parallel.gather_from_sequence_parallel_region_to_moe(max_prob) + self.local_probs = global_probs.masked_select(global_local_mask) + else: + self.local_probs = max_prob + + # [S*B/TP, H] -> [S*B, H] + global_hidden_states = tensor_parallel.gather_from_sequence_parallel_region_to_moe( + hidden_states, use_global_buffer=True + ) + # Reshape global_local_mask to be compatible with Tensor.gather + global_local_map = global_local_mask.nonzero()[:, 0] + self.global_local_map = global_local_map.view(-1, 1).expand(-1, hidden_states.shape[-1]) + local_hidden_states = moe_gather.apply(global_hidden_states, self.global_local_map) + else: + if self.router_topk > 1: + global_local_mask = torch.ones_like(max_ind).bool() + local_indices = max_ind.masked_select(global_local_mask) + self.local_probs = max_prob.masked_select(global_local_mask) + global_local_map = global_local_mask.nonzero()[:, 0] + self.global_local_map = global_local_map.view(-1, 1).expand( + -1, hidden_states.shape[-1] + ) + local_hidden_states = torch.gather(hidden_states, 0, self.global_local_map) + else: + local_indices = max_ind + self.local_probs = max_prob + local_hidden_states = hidden_states + self.global_local_map = None + + with torch.no_grad(): + # The indices of local_indices that give its sorted order along dim 0. + self.indices = torch.argsort(local_indices, dim=0) + tokens_per_expert = torch.bincount( + local_indices.view(-1), + minlength=self.config.num_moe_experts, + ) + if self.num_local_experts < self.config.num_moe_experts: + tokens_per_expert = tokens_per_expert[ + self.local_expert_indices[0] : self.local_expert_indices[-1] + 1 + ] + tokens_per_expert = tokens_per_expert.cpu().to(torch.long) + + # Stage2: permute the tokens locally so that they are grouped by their expert assignment + # Reshape indices to be compatible with Tensor.gather + self.indices = self.indices.view(-1, 1).expand(-1, hidden_states.shape[-1]) + if self.num_local_experts > 1: + permuted_local_hidden_states = moe_gather.apply(local_hidden_states, self.indices) + else: + permuted_local_hidden_states = local_hidden_states + return ( + permuted_local_hidden_states, + tokens_per_expert, + ) + + def token_unpermutation( + self, + hidden_states: torch.Tensor, + bias: torch.Tensor = None, + ): + """ + Reverse process of `dispatch()` which permutes the ouput of local + experts locallay and across expert parallel rank into the original order to + produce the final output. + + Args: + hidden_states: 2D tensor of shape [sum_tokens_of_all_local_experts, HiddenSize], + ouput of local experts. + bias (optional): The bias tensor. + + Returns: + output_total: un-permuted updated hidden states output from all local experts + with shape of [SeqLen/TP, MBS, HiddenSize] + """ + # Stage1: unpermute the tokens and bias locally respectively. + scores = self.local_probs.to(dtype=hidden_states.dtype) + if self.num_local_experts > 1: + assert self.indices.shape == hidden_states.shape + unpermuted_local_hidden = moe_scatter.apply(hidden_states, self.indices) + else: + unpermuted_local_hidden = hidden_states + + # Scale the expert output prior to reduction and subsequent to local unpermutation if k > 1. + if self.router_topk > 1: + unpermuted_local_hidden = unpermuted_local_hidden * scores.view(-1, 1) + + unpermuted_local_bias = None + if self.add_bias: + assert bias is not None + unpermuted_local_bias = torch.zeros_like(hidden_states) + assert self.indices.shape == bias.shape + unpermuted_local_bias = unpermuted_local_bias.scatter(0, self.indices, bias) + if self.router_topk > 1: + unpermuted_local_bias = unpermuted_local_bias * scores.view(-1, 1) + + output_total = unpermuted_local_hidden + output_bias_total = unpermuted_local_bias + + # Unpermute the tokens across expert parallel devices. + if (self.config.tensor_model_parallel_size > 1) or ( + self.config.expert_model_parallel_size > 1 + ): + assert ( + self.global_local_map is not None + ), "global_local_map is necessary for `AllGather`." + ep_group_size = parallel_state.get_tensor_and_expert_parallel_world_size() + # hidden_shape: [SeqLen/TP, MBS, HiddenSize], glboal_num_tokens = SeqLen/TP*MBS*(TP*EP) + global_num_tokens = self.hidden_shape[0] * self.hidden_shape[1] * ep_group_size + global_hidden_shape = [global_num_tokens, hidden_states.shape[-1]] + assert self.global_local_map.shape == unpermuted_local_hidden.shape + unpermuted_global_hidden = moe_scatter.apply( + unpermuted_local_hidden, self.global_local_map, global_hidden_shape + ) + output_total = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( + unpermuted_global_hidden + ) + if self.add_bias: + # Unpermute the bias across expert parallel devices. + unpermuted_global_bias = torch.zeros_like(unpermuted_global_hidden) + unpermuted_global_bias = unpermuted_global_bias.scatter_add( + 0, self.global_local_map, unpermuted_local_bias + ) + output_bias_total = ( + tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( + unpermuted_global_bias + ) + ) + # bias is duplicated across tensor parallelism ranks; + # reduce scatter reduces bias across tensor parallel_ranks + output_bias_total = ( + output_bias_total / parallel_state.get_tensor_model_parallel_world_size() + ) + else: + if self.router_topk > 1: + global_num_tokens = self.hidden_shape[0] * self.hidden_shape[1] + global_hidden_shape = [global_num_tokens, hidden_states.shape[-1]] + unpermuted_global_hidden = torch.zeros( + global_hidden_shape, + dtype=hidden_states.dtype, + device=torch.cuda.current_device(), + ) + output_total = unpermuted_global_hidden.scatter_add( + 0, self.global_local_map, unpermuted_local_hidden + ) + if self.add_bias: + unpermuted_global_bias = torch.zeros_like(unpermuted_global_hidden) + output_bias_total = unpermuted_global_bias.scatter_add( + 0, self.global_local_map, unpermuted_local_bias + ) + + if self.router_topk == 1: + output_total = output_total * scores + output_total = output_total.view(self.hidden_shape) + if self.add_bias: + assert output_bias_total is not None + if self.router_topk == 1: + output_bias_total = output_bias_total * scores + output_bias_total = output_bias_total.view(self.hidden_shape) + else: + output_bias_total = None + + return output_total, output_bias_total + + +class MoEAlltoAllTokenDispatcher(MoETokenDispatcher): + """ + AlltoAll Based Token dispatcher. + """ + + def __init__( + self, + num_local_experts: int, + local_expert_indices: List[int], + config: TransformerConfig, + ) -> None: + """ + Initialize the AlltoAll token dispatcher. + + Args: + num_local_experts (int): Number of local experts on the current device. + local_expert_indices (List[int]): Indices of local experts on the current device. + config (TransformerConfig): Configuration for the transformer model. + """ + super().__init__(config=config) + self.hidden_shape = None + self.num_input_tokens = None + self.num_local_experts = num_local_experts + self.num_experts = config.num_moe_experts + assert self.num_local_experts > 0, "Expected at least one expert" + if self.num_local_experts > 1: + self.expert_ids_per_ep_rank = torch.tensor( + [i % self.num_local_experts for i in range(self.num_experts)], + dtype=torch.int32, + device=torch.cuda.current_device(), + ) + self.local_expert_indices = local_expert_indices + assert ( + len(self.local_expert_indices) == self.num_local_experts + ), "Invalid local expert indices" + for i in range(len(self.local_expert_indices) - 1): + assert ( + self.local_expert_indices[i] == self.local_expert_indices[i + 1] - 1 + ), "local_expert_indices must be continous" + self.router_topk = config.moe_router_topk + self.add_bias = config.add_bias_linear + self.ep_size = config.expert_model_parallel_size + self.probs = None + self.input_splits = None + self.output_splits = None + self.num_global_tokens_per_local_expert = None + + # Token drop and padding. + # We need to keep track of the token num if we drop tokens without padding them. + self.num_out_tokens = None + # Drop and pad the input to capacity. + self.drop_and_pad = self.config.moe_pad_expert_input_to_capacity + if self.drop_and_pad: + assert self.config.moe_expert_capacity_factor is not None + self.capacity = None + + # A cuda stream synchronization is needed in self.token_permutation() in some cases, + # because there are several non-blocking DtoH data transfers called in self.preprocess(). + # The synchronization happens at different points based on MoE settings as late as possible. + # Valid sync points are "before_permutation_1", "before_ep_alltoall", "before_finish", and "no_sync". + self.cuda_sync_point = "no_sync" + + def preprocess(self, indices: torch.Tensor) -> torch.Tensor: + """ + Preprocess token indices for AlltoAll communication and token permutation. This method computes the number of tokens assigned to each expert based on the input indices. + It also initializes the necessary data structures for AlltoAll communication, such as input + and output splits, and the mapping between global tokens and local experts. + + Args: + indices (torch.Tensor): Tensor of indices mapping tokens to experts. + + Returns: + torch.Tensor: Tensor containing the number of tokens assigned to local expert. + """ + num_local_tokens_per_expert = torch.bincount(indices.view(-1), minlength=self.num_experts) + # num_local_tokens_per_expert: [num_experts] + + ep_size = self.config.expert_model_parallel_size + if self.drop_and_pad: + # probs: [num_experts, capacity] + self.capacity = self.probs.size(1) + num_tokens_per_local_expert = torch.full( + (self.num_local_experts,), self.capacity * self.ep_size, dtype=torch.long + ) + return num_tokens_per_local_expert + elif self.config.moe_expert_capacity_factor is not None: + # Token drop but no pad. A synchronization is needed before the first + # permutation to get the `num_out_tokens` CPU value. + self.num_out_tokens = num_local_tokens_per_expert.sum().to( + torch.device("cpu"), non_blocking=True + ) + self.cuda_sync_point = "before_permutation_1" + elif ep_size > 1: + # Token dropless and enable ep. A synchronization is needed before expert parallel + # AlltoAll communication to get the `input_splits` and `output_splits` CPU values. + self.cuda_sync_point = "before_ep_alltoall" + else: + # Token dropless and no ep. A synchronization is needed before the token_permutation() + # function returns to get the `tokens_per_expert` CPU value. + self.cuda_sync_point = "before_finish" + + if ep_size > 1: + # =================================================== + # Calculate input_splits, output_splits for alltoall-v. + # =================================================== + self.input_splits = ( + num_local_tokens_per_expert.reshape(ep_size, self.num_local_experts) + .sum(axis=1) + .to(torch.device("cpu"), non_blocking=True) + .numpy() + ) + num_global_tokens_per_expert = _gather_along_first_dim_expert_parallel( + num_local_tokens_per_expert + ).reshape(ep_size, self.num_experts) + self.num_global_tokens_per_local_expert = num_global_tokens_per_expert[ + :, self.local_expert_indices[0] : self.local_expert_indices[-1] + 1 + ] + self.output_splits = ( + self.num_global_tokens_per_local_expert.sum(axis=-1) + .to(torch.device("cpu"), non_blocking=True) + .numpy() + ) + num_tokens_per_local_expert = self.num_global_tokens_per_local_expert.sum(axis=0).to( + torch.device("cpu"), non_blocking=True + ) + # =================================================== + # num_global_tokens_per_expert: [ep_size, num_experts] + # num_global_tokens_per_local_expert: [ep_size, num_local_experts] + # num_tokens_per_local_expert: [num_local_experts] + # =================================================== + else: + self.num_global_tokens_per_local_expert = num_local_tokens_per_expert.reshape( + -1, self.num_experts + ) + num_tokens_per_local_expert = num_local_tokens_per_expert.to( + torch.device("cpu"), non_blocking=True + ) + + if self.num_local_experts > 1: + # No further synchronization is needed because torch.repeat_interleave() calls stream + # synchronization internally when the `output_size` parameter is not provided. + self.cuda_sync_point = "no_sync" + self.global_input_tokens_local_experts_indices = torch.repeat_interleave( + self.expert_ids_per_ep_rank, self.num_global_tokens_per_local_expert.ravel() + ) + + return num_tokens_per_local_expert + + def token_permutation( + self, + hidden_states: torch.Tensor, + probs: torch.Tensor, + indices: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Dispatch tokens to local experts using AlltoAll communication. + + Args: + hidden_states (torch.Tensor): Input token embeddings. + probs (torch.Tensor): Probs of tokens assigned to experts. + indices (torch.Tensor): Indices of tokens assigned to experts. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: + - Permuted token embeddings for local experts. + - Number of tokens per expert. + """ + # Preprocess: Get the metadata for communication, permutation and computation operations. + self.hidden_shape = hidden_states.shape + self.probs = probs + assert probs.dim() == 2, "Expected 2D tensor for probs" + assert indices.dim() == 2, "Expected 2D tensor for indices" + hidden_states = hidden_states.view(-1, self.hidden_shape[-1]) + tokens_per_expert = self.preprocess(indices) + + # Perform tensor parallel AlltoAll communication + # hidden_states: [S*B/TP, H] -> [S*B, H/TP] + if parallel_state.get_tensor_model_parallel_world_size() > 1: + hidden_states = tensor_parallel.all_to_all_sp2hp(hidden_states) + + # Permutation 1: input to AlltoAll input + self.hiddden_shape_before_permute = hidden_states.shape + if self.cuda_sync_point == "before_permutation_1": + torch.cuda.current_stream().synchronize() + permutated_local_input_tokens, self.reversed_local_input_permutation_mapping = permute( + hidden_states, + indices, + num_out_tokens=self.num_out_tokens, + padded_mode=self.drop_and_pad, + ) + + # Perform expert parallel AlltoAll communication + if self.cuda_sync_point == "before_ep_alltoall": + torch.cuda.current_stream().synchronize() + global_input_tokens = tensor_parallel.all_to_all( + parallel_state.get_expert_model_parallel_group(), + permutated_local_input_tokens, + self.output_splits, + self.input_splits, + ) + + # Permutation 2: Sort alltoall output by local experts when num_local_experts > 1. + if self.num_local_experts > 1: + if not self.drop_and_pad: + global_input_tokens, self.reversed_global_input_permutation_mapping = permute( + global_input_tokens, self.global_input_tokens_local_experts_indices + ) + else: + global_input_tokens = global_input_tokens.reshape( + self.ep_size, self.num_local_experts, self.capacity, -1 + ) + global_input_tokens = ( + global_input_tokens.transpose(0, 1) + .reshape(self.num_local_experts * self.ep_size * self.capacity, -1) + .contiguous() + ) + + # Perform tensor parallel AllGather on the hidden dimension to obtain the input tokens. + # global_input_tokens: [SEQL, H/TP] -> [SEQL, H] + if parallel_state.get_tensor_model_parallel_world_size() > 1: + global_input_tokens = tensor_parallel.all_gather_last_dim_from_tensor_parallel_region( + global_input_tokens + ) + if self.cuda_sync_point == "before_finish": + torch.cuda.current_stream().synchronize() + + return global_input_tokens, tokens_per_expert + + def token_unpermutation( + self, + hidden_states: torch.Tensor, + bias: torch.Tensor = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Reverse the token permutation to restore the original order. + + Args: + hidden_states (torch.Tensor): Output from local experts. + bias (torch.Tensor, optional): Bias tensor (not supported). + + Returns: + Tuple[torch.Tensor, Optional[torch.Tensor]]: + - Unpermuted token embeddings in the original order. + - None (bias is not supported). + """ + assert bias is None, "Bias is not supported in MoEAlltoAllTokenDispatcher" + + # Perform tensor parallel Reduce-Scatter + # hidden_states: [SEQL, H] -> [SEQL, H/TP] + if parallel_state.get_tensor_model_parallel_world_size() > 1: + hidden_states = tensor_parallel.reduce_scatter_last_dim_to_tensor_parallel_region( + hidden_states + ) + + # Unpermutation 2: expert output to AlltoAll input + if self.num_local_experts > 1: + if not self.drop_and_pad: + hidden_states = unpermute( + hidden_states, + self.reversed_global_input_permutation_mapping, + ) + else: + hidden_states = hidden_states.reshape( + self.num_local_experts, self.ep_size, self.capacity, -1 + ) + hidden_states = ( + hidden_states.transpose(0, 1) + .reshape(self.ep_size * self.num_local_experts * self.capacity, -1) + .contiguous() + ) + + # Perform expert parallel AlltoAll communication + # hidden_states: [SEQL, H] -> [SEQL, H/TP] + permutated_local_input_tokens = tensor_parallel.all_to_all( + parallel_state.get_expert_model_parallel_group(), + hidden_states, + self.input_splits, + self.output_splits, + ) + + # Unpermutation 1: AlltoAll output to output + output = unpermute( + permutated_local_input_tokens, + self.reversed_local_input_permutation_mapping, + probs=self.probs, + padded_mode=self.drop_and_pad, + restore_shape=self.hiddden_shape_before_permute, + ) + + # Perform tensor parallel AlltoAll communication + # output: [S*B, H/TP] -> [S*B/TP, H] + if parallel_state.get_tensor_model_parallel_world_size() > 1: + output = tensor_parallel.all_to_all_hp2sp(output) + + # Reshape the output tensor + output = output.view(self.hidden_shape) + return output, None diff --git a/megatron/core/transformer/spec_utils.py b/megatron/core/transformer/spec_utils.py new file mode 100644 index 0000000..473933e --- /dev/null +++ b/megatron/core/transformer/spec_utils.py @@ -0,0 +1,109 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import types +from dataclasses import dataclass, field +from typing import Tuple, Union + +import torch + + +@dataclass +class ModuleSpec: + """This is a Module Specification dataclass. + + Specification defines the location of the module (to import dynamically) + or the imported module itself. It also defines the params that need to be + passed to initialize the module. + + Args: + module (Union[Tuple, type]): A tuple describing the location of the + module class e.g. `(module.location, ModuleClass)` or the imported + module class itself e.g. `ModuleClass` (which is already imported + using `from module.location import ModuleClass`). + params (dict): A dictionary of params that need to be passed while init. + + """ + + module: Union[Tuple, type] + params: dict = field(default_factory=lambda: {}) + submodules: type = None + + +def import_module(module_path: Tuple[str]): + """Import a named object from a module in the context of this function. + + TODO: make this importer module more robust, at least make sure there + are no side effects of using this as is + """ + base_path, name = module_path + try: + module = __import__(base_path, globals(), locals(), [name]) + except ImportError as e: + print(f"couldn't import module due to {e}") + return None + return vars(module)[name] + + +def get_module(spec_or_module: Union[ModuleSpec, type], **additional_kwargs): + # If a module clas is already provided return it as is + if isinstance(spec_or_module, (type, types.FunctionType)): + return spec_or_module + + # If the module is provided instead of module path, then return it as is + if isinstance(spec_or_module.module, (type, types.FunctionType)): + return spec_or_module.module + + # Otherwise, return the dynamically imported module from the module path + return import_module(spec_or_module.module) + + +def build_module(spec_or_module: Union[ModuleSpec, type], *args, **kwargs): + # If the passed `spec_or_module` is + # a `Function`, then return it as it is + # NOTE: to support an already initialized module add the following condition + # `or isinstance(spec_or_module, torch.nn.Module)` to the following if check + if isinstance(spec_or_module, types.FunctionType): + return spec_or_module + + # If the passed `spec_or_module` is actually a spec (instance of + # `ModuleSpec`) and it specifies a `Function` using its `module` + # field, return the `Function` as it is + if isinstance(spec_or_module, ModuleSpec) and isinstance( + spec_or_module.module, types.FunctionType + ): + return spec_or_module.module + + # Check if a module class is provided as a spec or if the module path + # itself is a class + if isinstance(spec_or_module, type): + module = spec_or_module + elif hasattr(spec_or_module, "module") and isinstance(spec_or_module.module, type): + module = spec_or_module.module + else: + # Otherwise, dynamically import the module from the module path + module = import_module(spec_or_module.module) + + # If the imported module is actually a `Function` return it as it is + if isinstance(module, types.FunctionType): + return module + + # Finally return the initialized module with params from the spec as well + # as those passed as **kwargs from the code + + # Add the `submodules` argument to the module init call if it exists in the + # spec. + if hasattr(spec_or_module, "submodules") and spec_or_module.submodules is not None: + kwargs["submodules"] = spec_or_module.submodules + + try: + return module( + *args, **spec_or_module.params if hasattr(spec_or_module, "params") else {}, **kwargs + ) + except Exception as e: + # improve the error message since we hide the module name in the line above + import sys + + tb = sys.exc_info()[2] + raise type(e)(f"{str(e)} when instantiating {module.__name__}").with_traceback( + sys.exc_info()[2] + ) diff --git a/megatron/core/transformer/torch_layer_norm.py b/megatron/core/transformer/torch_layer_norm.py new file mode 100644 index 0000000..57202b2 --- /dev/null +++ b/megatron/core/transformer/torch_layer_norm.py @@ -0,0 +1,43 @@ +import warnings + +import torch + +from megatron.core.transformer import TransformerConfig + + +class WrappedTorchLayerNorm(torch.nn.LayerNorm): + + def __init__( + self, + config: TransformerConfig, + hidden_size: int, + eps: float = 1e-5, + persist_layer_norm: bool = False, ## TODO: unused arguments. See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/issues/223 + zero_centered_gamma: bool = False, + normalization: str = "LayerNorm", # included to match TE interface + ): + self.config = config + assert ( + not self.config.layernorm_zero_centered_gamma + ), f"zero_centered_gamma not supported by torch LayerNorm" + + assert ( + self.config.normalization == "LayerNorm" + ), f'({self.config.normalization}) is not supported in by torch Layernorm' + + assert ( + not self.config.persist_layer_norm + ), f"persist_layer_norm not supported by torch LayerNorm" + + assert ( + not self.config.sequence_parallel + ), f"sequence parallel not supported by torch LayerNorm" + + assert ( + not self.config.memory_efficient_layer_norm + ), f"memory_efficient_layer_norm not supported by torch LayerNorm" + + super().__init__( + normalized_shape=hidden_size, ## applied to last len(normalized_shape.size) dimensions + eps=eps, + ) diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py new file mode 100644 index 0000000..b832221 --- /dev/null +++ b/megatron/core/transformer/transformer_block.py @@ -0,0 +1,492 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import re +import warnings +from contextlib import nullcontext +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +from torch import Tensor + +from megatron.core import InferenceParams, parallel_state, tensor_parallel +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding +from megatron.core.fusions.fused_layer_norm import FusedLayerNorm +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import BaseTransformerLayer, TransformerLayer +from megatron.core.transformer.utils import sharded_state_dict_default +from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor + +try: + from megatron.core.transformer.custom_layers.transformer_engine import ( + TEDelayedScaling, + TENorm, + get_cpu_offload_context, + te_checkpoint, + ) + + HAVE_TE = True + LayerNormImpl = TENorm +except ImportError: + HAVE_TE = False + get_cpu_offload_context = None + try: + import apex + + LayerNormImpl = FusedLayerNorm + except ModuleNotFoundError: + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + LayerNormImpl = WrappedTorchLayerNorm + + +def get_num_layers_to_build(config: TransformerConfig) -> int: + + num_layers_per_pipeline_rank = ( + config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + # Interleaved pipeline parallelism: + # Number of layers in each model chunk is the number of layers in the stage, + # divided by the number of model chunks in a stage. + # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0] [2] [4] [6] + # Stage 1: [1] [3] [5] [7] + # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0, 1] [4, 5] + # Stage 1: [2, 3] [6, 7] + + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size + + num_layers_to_build = num_layers_per_virtual_rank + + else: + # Non-interleaved pipeline parallelism: + # Each stage gets a contiguous set of layers. + + num_layers_to_build = num_layers_per_pipeline_rank + + return num_layers_to_build + + +@dataclass +class TransformerBlockSubmodules: + layer_specs: List[ModuleSpec] = None + layer_norm: Optional[Union[ModuleSpec, torch.nn.Module]] = None + + +def _get_block_submodules( + config: TransformerConfig, + spec: Union[TransformerBlockSubmodules, ModuleSpec], +) -> TransformerBlockSubmodules: + + # Transformer block submodules. + if isinstance(spec, TransformerBlockSubmodules): + return spec + + # ModuleSpec here is generally assumed to be for a transformer layer that + # is implemented in `transformer_layer.py` or if it subclasses + # `BaseTransformerLayer` from the `transformer_layer.py` file. + elif isinstance(spec, ModuleSpec): + if issubclass(spec.module, TransformerBlock): + return spec.submodules + elif issubclass(spec.module, BaseTransformerLayer): + num_layers = get_num_layers_to_build(config) + return TransformerBlockSubmodules( + layer_specs=[spec] * num_layers, + layer_norm=LayerNormImpl, + ) + else: + raise Exception(f"specialize for {spec.module.__name__}.") + else: + raise Exception(f"specialize for {type(spec).__name__}.") + + +class TransformerBlock(MegatronModule): + """Transformer class.""" + + def __init__( + self, + config: TransformerConfig, + spec: Union[TransformerBlockSubmodules, ModuleSpec], + post_layer_norm: bool = True, + pre_process: bool = True, + post_process: bool = True, + ): + super().__init__(config=config) + + self.submodules = _get_block_submodules(config, spec) + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + # Dictionary to store CUDA graphs. Number of items in the dictionary = len(self.layers). + # Item `i` in the dictionary is a list of `N` CUDA graphs for layer 'i' where N is the + # number of microbatches. Multiple CUDA graphs per layer is required to support + # pipelining which requires running FWD graph of multiple microbatches before BWD graph. + self.cuda_graphs = {} + self.current_microbatch = -1 + + # required for pipeline parallel schedules + self.input_tensor = None + + self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' + + if get_cpu_offload_context is not None: + ( + self.offload_context, + self.group_prefetch_offload_commit_async, + ) = get_cpu_offload_context( + self.config.cpu_offloading, + self.config.cpu_offloading_num_layers, + self.config.cpu_offloading_activations, + self.config.cpu_offloading_weights, + ) + self.config._cpu_offloading_context = ( + self.offload_context if self.config.cpu_offloading else None + ) + else: + assert ( + self.config.cpu_offloading == False + ), "CPU Offloading is enabled when TE is not present" + + self.offload_context, self.group_prefetch_offload_commit_async = nullcontext(), None + self.config._cpu_offloading_context = None + + self._build_layers() + self.num_layers_per_pipeline_rank = len(self.layers) + + def _build_layers(self): + # Transformer layers. + # @jcasper can we improve how we deal with layer_number? + # currently it's only used in CoreAttention? + # if self.apply_query_key_layer_scaling: + # coeff = self.layer_number + # self.norm_factor *= coeff + def build_layer(layer_spec, layer_number): + return build_module( + layer_spec, + config=self.config, + layer_number=layer_number, + ) + + # offset is implicit in TransformerLayer + self.layers = torch.nn.ModuleList( + [ + build_layer(layer_spec, i + 1) + for i, layer_spec in enumerate(self.submodules.layer_specs) + ] + ) + + # # TODO: add back standalone_embedding_stage + # if self.num_layers == 0: + # # When a standalone embedding stage is used (e.g., + # # args.standalone_embedding_stage == True), virtual pipeline ranks + # # on pipeline rank 0 will have zero transformer layers assigned to + # # them. This results in the model's input and output tensors to be + # # the same, which will cause failure for certain output tensor + # # optimizations (e.g., pipeline output deallocation). To remedy + # # this, we assign a 'no-op' layer on these ranks, which will + # # disconnect the input tensor from the output tensor. + # self.num_layers = 1 + # self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) + # else: + # self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)]) + + # In pipeline parallelism, we want to add this LN only to the last stage of the pipeline + # self.post_process and self.post_layer_norm guide this behavior + if self.submodules.layer_norm and self.post_process and self.post_layer_norm: + self.final_layernorm = build_module( + self.submodules.layer_norm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + else: + self.final_layernorm = None # Either this or nn.Identity + + def _get_layer(self, layer_number: int): + return self.layers[layer_number] + + def _checkpointed_forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + context: Tensor, + context_mask: Tensor, + rotary_pos_emb: Tensor, + packed_seq_params: PackedSeqParams, + ): + """Forward method with activation checkpointing.""" + + def custom(start: int, end: int): + def custom_forward( + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + ): + for index in range(start, end): + layer = self._get_layer(index) + hidden_states, context = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + rotary_pos_emb=rotary_pos_emb, + inference_params=None, + packed_seq_params=packed_seq_params, + ) + return hidden_states, context + + return custom_forward + + def checkpoint_handler(forward_func): + if self.config.fp8: + return te_checkpoint( + forward_func, + self.config.distribute_saved_activations, + tensor_parallel.random.get_cuda_rng_tracker, + parallel_state.get_tensor_model_parallel_group(), + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + ) + else: + return tensor_parallel.checkpoint( + forward_func, + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + ) + + if self.config.recompute_method == 'uniform': + # Uniformly divide the total number of Transformer layers and checkpoint + # the input activation of each divided chunk. + # A method to further reduce memory usage reducing checkpoints. + l = 0 + while l < self.num_layers_per_pipeline_rank: + hidden_states, context = checkpoint_handler( + custom(l, l + self.config.recompute_num_layers) + ) + + l += self.config.recompute_num_layers + + elif self.config.recompute_method == 'block': + # Checkpoint the input activation of only a set number of individual + # Transformer layers and skip the rest. + # A method fully use the device memory removing redundant re-computation. + recompute_skip_num_layers = 0 + for l in range(self.num_layers_per_pipeline_rank): + # Skip recomputation when input grad computation is not needed. + # Need to have at least one input tensor with gradient computation + # for re-enterant autograd engine. + if self.config.fp8 and not hidden_states.requires_grad: + recompute_skip_num_layers += 1 + if ( + l >= recompute_skip_num_layers + and l < self.config.recompute_num_layers + recompute_skip_num_layers + ): + hidden_states, context = checkpoint_handler(custom(l, l + 1)) + else: + hidden_states, context = custom(l, l + 1)( + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + ) + else: + raise ValueError("Invalid activation recompute method.") + + return hidden_states + + def set_input_tensor(self, input_tensor: Tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + context: Tensor = None, + context_mask: Tensor = None, + rotary_pos_emb: Tensor = None, + inference_params: InferenceParams = None, + packed_seq_params: PackedSeqParams = None, + ): + # hidden_states (float): [s, b, h] + # attention_mask (bool): [1, 1, s, s] + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = make_viewless_tensor( + inp=hidden_states, + requires_grad=True, + keep_graph=True, + ) + + if self.config.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + if self.config.fp8: + import transformer_engine # To keep out TE dependency when not training in fp8 + + if self.config.fp8 == "e4m3": + fp8_format = transformer_engine.common.recipe.Format.E4M3 + elif self.config.fp8 == "hybrid": + fp8_format = transformer_engine.common.recipe.Format.HYBRID + else: + raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.") + + fp8_recipe = TEDelayedScaling( + config=self.config, + fp8_format=fp8_format, + override_linear_precision=(False, False, not self.config.fp8_wgrad), + ) + fp8_group = None + if parallel_state.model_parallel_is_initialized(): + fp8_group = parallel_state.get_amax_reduction_group(with_context_parallel=True) + fp8_context = transformer_engine.pytorch.fp8_autocast( + enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group + ) + else: + fp8_context = nullcontext() + + with rng_context and fp8_context: + # Forward pass. + if self.config.recompute_granularity == 'full' and self.training: + hidden_states = self._checkpointed_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + rotary_pos_emb=rotary_pos_emb, + packed_seq_params=packed_seq_params, + ) + else: + for l_no, layer in enumerate(self.layers): + with self.offload_context: + if (len(self.cuda_graphs) == 0) or (not self.training): + hidden_states, context = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + rotary_pos_emb=rotary_pos_emb, + inference_params=inference_params, + packed_seq_params=packed_seq_params, + ) + # CUDA graph doesn't output context and is expected to be None + assert ( + (context is None) + or (not self.config.enable_cuda_graph) + or (not self.training) + ) + else: + # CUDA graph replay for layer `l_no` and microbatch `self.current_microbatch` + # CUDA graph requires positional arguments with the exception of is_first_microbatch. + # Also CUDA graph accepts only Tensor inputs and outputs. Hence, the arg list and + # returned list is limited to `hidden_states`. + assert (len(self.cuda_graphs) > l_no) and ( + self.current_microbatch < len(self.cuda_graphs[l_no]) + ) + hidden_states = self.cuda_graphs[l_no][self.current_microbatch]( + hidden_states, + is_first_microbatch=(self.current_microbatch == 0), + ) + + if ( + torch.is_grad_enabled() + and self.config.cpu_offloading + and self.group_prefetch_offload_commit_async is not None + ): + hidden_states = self.group_prefetch_offload_commit_async(hidden_states) + + # Final layer norm. + if self.final_layernorm is not None: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: dict = None + ) -> ShardedStateDict: + assert not sharded_offsets, "Unexpected sharded offsets" + non_homogeneous_layers = metadata is not None and metadata.get( + 'non_homogeneous_layers', False + ) + sharded_state_dict = {} + + layer_prefix = f'{prefix}layers.' + num_layers = self.config.num_layers + for layer in self.layers: + offset = layer._get_layer_offset() + + global_layer_offset = layer.layer_number - 1 # self.layer_number starts at 1 + state_dict_prefix = f'{layer_prefix}{global_layer_offset - offset}.' # module list index in TransformerBlock + if non_homogeneous_layers: + sharded_prefix = f'{layer_prefix}{global_layer_offset}.' + sharded_pp_offset = [] + else: + sharded_prefix = layer_prefix + sharded_pp_offset = [ + (0, global_layer_offset, num_layers) + ] # PP sharding offset for ShardedTensors + layer_sharded_state_dict = layer.sharded_state_dict( + state_dict_prefix, sharded_pp_offset, metadata + ) + replace_prefix_for_sharding(layer_sharded_state_dict, state_dict_prefix, sharded_prefix) + + sharded_state_dict.update(layer_sharded_state_dict) + + # Add modules other than self.layers + for name, module in self.named_children(): + if not module is self.layers: + sharded_state_dict.update( + sharded_state_dict_default( + module, f'{prefix}{name}.', sharded_offsets, metadata + ) + ) + + return sharded_state_dict diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py new file mode 100644 index 0000000..f2c5f7c --- /dev/null +++ b/megatron/core/transformer/transformer_config.py @@ -0,0 +1,440 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import types +from dataclasses import dataclass +from typing import Callable, Optional, Tuple + +import torch +import torch.nn.functional as F + +from ..model_parallel_config import ModelParallelConfig +from ..utils import init_method_normal, scaled_init_method_normal + + +@dataclass +class TransformerConfig(ModelParallelConfig): + """Configuration object for megatron-core transformers. + + The initialization function has an argument for each parameter, including those in ModelParallelConfig. + """ + + #################### + # model architecture + #################### + num_layers: int = 0 + """Number of transformer layers in a transformer block.""" + + hidden_size: int = 0 + """Transformer hidden size.""" + + num_attention_heads: int = 0 + """Number of transformer attention heads.""" + + num_query_groups: int = None + """Number of query groups for group query attention. If None, normal attention is used.""" + + ffn_hidden_size: int = None + """Transformer Feed-Forward Network hidden size. This is set to 4*hidden_size if not provided.""" + + kv_channels: int = None + """Projection weights dimension in multi-head attention. This is set to hidden_size // + num_attention_heads if not provided.""" + + hidden_dropout: float = 0.1 + """Dropout probability for transformer hidden state.""" + + attention_dropout: float = 0.1 + """Post attention dropout probability.""" + + fp32_residual_connection: bool = False + """If true, move residual connections to fp32.""" + + # @jcasper should we keep this option? + apply_residual_connection_post_layernorm: bool = False + """If True, uses the original BERT residule connection ordering.""" + + layernorm_epsilon: float = 1e-5 + """Epsilon value for any LayerNorm operations.""" + + layernorm_zero_centered_gamma: bool = False + """If set to True, the LayerNorm is adjusted to center the gamma values around 0. This improves + numerical stability.""" + + add_bias_linear: bool = True + """Include a bias term in all linear layers (QKV projections, after core attention, and two in + MLP layer).""" + + add_qkv_bias: bool = False + """Add a bias term only for QKV projections.""" + + gated_linear_unit: bool = False + """Use a gated linear unit for the first linear layer in the MLP.""" + + activation_func: Callable = F.gelu + """Activation function to use for the non-linearity in the MLP.""" + + activation_func_fp8_input_store: bool = False + """Store the input of MLP activation function in FP8 for backprop to save memory. + The stored input is casted back to the original precision before backprop compuatation.""" + + num_moe_experts: int = None + """Number of experts to use for MoE layer. When set, it replaces MLP with MoE layer. Set to None + for no MoE.""" + + rotary_interleaved: bool = False + """True is rotate pairs of even and odd dimensions (RoFormer style), False is rotate pairs of + first half and second half (LLaMa style). Default to False.""" + + window_size: Optional[Tuple[int, int]] = None + """If not None, then will use sliding window attention. The size of the window is specified by + the numbers inside the tuple; -1 is special value meaning "infinite window size".""" + + normalization: bool = "LayerNorm" + """Which norm to use for normalization layers, valid options are `LayerNorm` and `RMSNorm`.""" + + qk_layernorm: bool = False + """Whether to apply LayerNorm to the query and key embeddings.""" + + test_mode: bool = False + """Whether to run real-time tests.""" + + calculate_per_token_loss: bool = False + """Whether cross entropy loss is calculated over the actual number of non-padded tokens in the + global batch, versus the default behavior of assuming all tokens are non-padded.""" + + #################### + # initialization + #################### + init_method: Callable = None + """Method to initialize weights. Note that bias is always set to zero. Should be a function that + takes a single Tensor and initializes it. If None, will be set to + megatron.core.utils.init_method_normal(init_method_std) which is torch nn init normal with + mean=0.0 and std=init_method_std.""" + + output_layer_init_method: Callable = None + """Method to initialize weights of the output layer of both attention and MLP blocks. If None, + will be set to megatron.core.utils.scaled_init_method_normal(init_method_std) which is torch nn + init normal with mean=0.0 and std=init_method_std / math.sqrt(2.0 * num_layers).""" + + init_method_std: float = 0.02 + """Standard deviation of the zero mean normal for the default initialization method, not used if + init_method and output_layer_init_method are provided.""" + + #################### + # mixed-precision + #################### + apply_query_key_layer_scaling: bool = False + """If true, scale Q * K^T by 1 / layer-number. This improve numeric stability when training with + fp16.""" + + attention_softmax_in_fp32: bool = True + """If True, run attention masking and softmax in fp32. This should be True if + apply_query_key_layer_scaling is True.""" + + #################### + # fusion + #################### + bias_activation_fusion: bool = False + """If True, fuses bias addition and the activation function when possible.""" + + masked_softmax_fusion: bool = False + """If True, uses softmax fusion.""" + + persist_layer_norm: bool = False + """If True, uses the persistent fused layer norm kernel. This kernel only supports a fixed set + of hidden sizes.""" + + memory_efficient_layer_norm: bool = False + """If True, and using local layers (not from TransformerEngine), tells Apex to use the memory + efficient fused LayerNorm kernel. Ignored if not using LayerNorm.""" + + bias_dropout_fusion: bool = False # TODO: this should be bias_dropout_add_fusion? + """If True, uses bias dropout fusion.""" + + apply_rope_fusion: bool = False + """If True, use fused RoPE kernel.""" + + #################### + # activation recomputation + #################### + recompute_granularity: str = None + recompute_granularity: str = None + """Determines which type of activation recompute to use. Megatron-core supports 'selective' + activation checkpointing where only the memory intensive part of attention is checkpointed. + These memory intensive activations are also less compute intensive which makes activation + checkpointing more efficient for LLMs (20B+). See Reducing Activation Recomputation in Large + Transformer Models (https://arxiv.org/abs/2205.05198) for more details. 'full' will checkpoint + the entire transformer layer. If None, no recompute is performed and all activations are saved. + If set, must be 'selective' or 'full'. 'selective' always uses all layers. + """ + + recompute_method: str = None + """Determines which transformer layers will be recomputed. uniform will uniformly divide the + total number of transformer layers in a transformer block and recompute the input activation of + each divided chunk at the specified granularity. block will recompute the input activations for + only a set number of transformer layers per pipeline stage. The rest of the layers in the + pipeline stage will not have any activations recomputed. If None, and recompute is enabled, all + layers will do recomputation. If set, must be 'uniform' or 'block'.""" + + recompute_num_layers: int = None + """When recompute_method is uniform, recompute_num_layers is the number of transformer layers in + each uniformly divided recompute unit. When recompute_method is block, recompute_num_layers is + the number of transformer layers to recompute within each pipeline stage. Must be None for + 'selective' activation checkpointing.""" + + distribute_saved_activations: bool = None + """If True, distribute recomputed activations across the model parallel group.""" + + #################### + # fp8 related + #################### + fp8: str = None + """If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined + choices (1) 'e4m3' uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8 + activation and weight tensors and e5m2 for all FP8 output activation gradient tensors.""" + + fp8_margin: int = 0 + """Margin for the scaling factor computation.""" + + fp8_interval: int = 1 + """Controls how often the scaling factor is recomputed.""" + + fp8_amax_history_len: int = 1 + """The length of the amax history window used for scaling factor computation.""" + + fp8_amax_compute_algo: str = "most_recent" + """Algorithm used for choosing the `amax` value for the scaling factor computation. There are 2 + predefined choices: `max` chooses the largest `amax` in the history window, while `most_recent` + always chooses the most recently seen value. + + """ + + fp8_wgrad: bool = True + """When set to False, override FP8 config options and do the wgrad computation in higher precision.""" + + fp8_dot_product_attention: bool = False + """When set to True, use the FP8 implementation of Dot Product Attention.""" + + fp8_multi_head_attention: bool = False + """When set to True, use the FP8 implementation of Multi Head Attention.""" + + #################### + # MoE related + #################### + moe_router_load_balancing_type: str = "aux_loss" + """Determines the load balancing strategy for the router. "aux_loss" corresponds to the load + balancing loss used in GShard and SwitchTransformer, "sinkhorn" corresponds to the balancing + algorithm used in S-BASE, and "none" implies no load balancing.""" + + moe_router_topk: int = 2 + """Number of experts to route to for each token.""" + + moe_router_pre_softmax: bool = False + """Enable pre-softmax routing for MoE, which means the top-k selection is before the softmax. By default, top-k is done after the softmax.""" + + moe_grouped_gemm: bool = False + """When there are multiple experts per rank, compress multiple local (potentially small) gemms + in a single kernel launch to improve the utilization and performance by leveraging the Grouped + GEMM feature introduced since CUTLASS 2.8 (https://github.com/fanshiqing/grouped_gemm). + + """ + + moe_aux_loss_coeff: float = 0 # 1e-2 would be a good start value for load balance loss. + """Scaling coefficient for the aux loss. A starting value of 1e-2 is recommended.""" + + moe_z_loss_coeff: float = None # 1e-3 would be a good start value for z-loss + """Scaling coefficient for the z-loss. A starting value of 1e-3 is recommended.""" + + moe_input_jitter_eps: float = None + """Add noise to the input tensor by applying jitter with a specified epsilon value.""" + + moe_token_dropping: bool = False # TODO: Support token dropping. + """This feature involves selectively dropping and padding tokens for each expert to achieve a + specified capacity, similar to GShard, Switch-Transformer, and DeepSpeed-MoE. Note that this is + currently unsupported so should remain False.""" + + moe_token_dispatcher_type: str = "allgather" + """The type of token dispatcher to use. The default is 'allgather'. Options are 'allgather' and 'alltoall'.""" + moe_per_layer_logging: bool = False + """Enable per-layer logging for MoE, currently supports auxiliary loss and z loss.""" + + moe_expert_capacity_factor: float = None + """moe_expert_capacity_factor (float): The capacity factor for each expert, None means no token will be dropped. The default is None.""" + + moe_pad_expert_input_to_capacity: bool = False + """moe_pad_expert_input_to_capacity (bool): If True, pads the input for each expert to match the expert capacity length, effective only after the moe_expert_capacity_factor is set. The default setting is False.""" + + moe_token_drop_policy: str = 'probs' + """The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped. + """ + moe_layer_recompute: bool = False + """Memory optimization: checkpointing moe_layer to save actiavtion memory.""" + + #################### + # miscellaneous + #################### + clone_scatter_output_in_embedding: bool = True + """When set to True, clone the output of scatter_to_sequence_parallel_region in embedding layer + to facilitate garbage collection of input.""" + + disable_parameter_transpose_cache: bool = False + """When set to true, the parameter transposes are not cached for subsequent iterations.""" + + enable_cuda_graph: bool = False + """When set to true, TransformerLayer blocks are wrapped with CUDA graph.""" + + def __post_init__(self): + """Python dataclass method that is used to modify attributes after initialization. + See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. + """ + super().__post_init__() + if self.fp16 and self.bf16: + raise ValueError( + f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.' + ) + + if self.num_attention_heads % self.tensor_model_parallel_size != 0: + raise ValueError( + f"num_attention_heads ({self.num_attention_heads}) must be a multiple of " + f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." + ) + + if self.ffn_hidden_size is None: + self.ffn_hidden_size = 4 * self.hidden_size + + if self.kv_channels is None: + self.kv_channels = self.hidden_size // self.num_attention_heads + + if self.num_query_groups is None: + self.num_query_groups = self.num_attention_heads + + if self.num_query_groups % self.tensor_model_parallel_size != 0: + raise ValueError( + f"num_query_groups ({self.num_query_groups}) must be a multiple of " + f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." + ) + + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + + if self.expert_model_parallel_size > 1 and self.num_moe_experts is None: + raise ValueError(f'num_moe_experts must be non None to use expert-parallel.') + + if self.num_moe_experts is not None and self.num_moe_experts <= 0: + raise ValueError(f'num_moe_experts must be non-negative.') + + if self.moe_expert_capacity_factor is not None: + if self.moe_token_dispatcher_type != "alltoall": + raise ValueError( + f'moe_expert_capacity_factor only works with alltoall token dispatcher' + ) + if self.moe_expert_capacity_factor < 0: + self.moe_expert_capacity_factor = None + if self.moe_router_load_balancing_type not in ["aux_loss", "none"]: + raise ValueError( + f'moe_expert_capacity_factor only works with aux_loss or none load balancing' + ) + + if self.moe_pad_expert_input_to_capacity: + if self.moe_expert_capacity_factor is None: + raise ValueError( + f'moe_expert_capacity_factor must be set to use moe_pad_expert_input_to_capacity' + ) + + if self.cpu_offloading and ( + self.cpu_offloading_num_layers < 0 or self.cpu_offloading_num_layers >= self.num_layers + ): + raise ValueError( + f'CPU offloading can be done only for layers less than {self.num_layers}' + ) + + if self.cpu_offloading and self.pipeline_model_parallel_size > 1: + raise ValueError( + f'Currently there is no support for Pipeline parallelism with CPU offloading' + ) + + if self.cpu_offloading and self.recompute_granularity is not None: + raise ValueError( + f'CPU offloading does not work when activation recomputation is enabled' + ) + + if self.recompute_granularity is not None: + if not self.recompute_granularity in ['full', 'selective']: + raise ValueError( + f'When using recompute_granuarlity: {self.recompute_granularity} must be "full" or "selective".' + ) + + if self.recompute_method is not None: + if not self.recompute_method in ['block', 'uniform']: + raise ValueError( + f'recompute_method: {self.recompute_method} must be "block" or "uniform".' + ) + elif self.recompute_granularity != 'selective': + raise ValueError( + f'Using recompute_granularity: {self.recompute_granularity} so recompute_method must be "block" or "uniform"' + ) + + if self.recompute_granularity != 'selective' and self.recompute_num_layers is None: + raise ValueError( + f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be between ' + f'1 and num_layers_per_pipeline_rank: {self.num_layers // self.pipeline_model_parallel_size}' + ) + elif ( + self.recompute_granularity == 'selective' and self.recompute_num_layers is not None + ): + raise ValueError( + f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be None.' + ) + + if self.distribute_saved_activations and self.sequence_parallel: + raise ValueError( + f'distribute_saved_activations: {self.distribute_saved_activations} must be false when sequence parallel is enabled: {self.sequence_parallel}' + ) + + if self.virtual_pipeline_model_parallel_size is not None: + if not self.num_layers % self.virtual_pipeline_model_parallel_size == 0: + raise ValueError( + f'num_layers: {self.num_layers} must be divisible by virtual_model_parallel_size {self.virtual_pipeline_model_parallel_size}' + ) + + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + + if self.bias_activation_fusion: + if self.activation_func not in [F.gelu, F.silu]: + raise ValueError( + "When bias_activation_fusion is True, activation function should be either gelu or swiglu" + ) + if ( + self.activation_func == F.gelu + and not self.gated_linear_unit + and not self.add_bias_linear + ): + raise ValueError( + "When bias_activation_fusion is True, gated_linear_unit is False, " + "and activation function is gelu, add_bias_linear must also be True." + ) + if self.activation_func_fp8_input_store: + if self.activation_func != F.silu or not self.gated_linear_unit: + raise ValueError("Storing activation input in FP8 is supported only for SwiGLU.") + if self.apply_rope_fusion and self.rotary_interleaved: + raise ValueError(f'rotary_interleaved does not work with apply_rope_fusion.') + + if self.init_method is None: + self.init_method = init_method_normal(self.init_method_std) + + if self.output_layer_init_method is None: + self.output_layer_init_method = scaled_init_method_normal( + self.init_method_std, self.num_layers + ) + + if self.moe_extended_tp: + if self.moe_token_dispatcher_type != 'allgather': + raise ValueError( + "Moe extended TP parallelism only applies to allgather based token dispatcher." + ) + extended_tp_size = self.tensor_model_parallel_size * self.expert_model_parallel_size + if self.ffn_hidden_size % extended_tp_size != 0: + raise ValueError( + f'ffn_hidden_size: {self.ffn_hidden_size} must be divisible by extended_tp_size {extended_tp_size}' + ) diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py new file mode 100644 index 0000000..631179e --- /dev/null +++ b/megatron/core/transformer/transformer_layer.py @@ -0,0 +1,255 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from abc import ABC +from dataclasses import dataclass, field +from typing import Dict, Optional, Union + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.dist_checkpointing.utils import apply_prefix_mapping +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_viewless_tensor + + +@dataclass +class TransformerLayerSubmodules: + input_layernorm: Union[ModuleSpec, type] = IdentityOp + self_attention: Union[ModuleSpec, type] = IdentityOp + self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp + + pre_cross_attn_layernorm: Union[ModuleSpec, type] = IdentityOp + cross_attention: Union[ModuleSpec, type] = IdentityOp + cross_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp + + pre_mlp_layernorm: Union[ModuleSpec, type] = IdentityOp + mlp: Union[ModuleSpec, type] = IdentityOp + mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp + + # Mapping for sharded tensor keys to be applied in `sharded_state_dict` method + sharded_state_dict_keys_map: Dict[str, str] = field(default_factory=dict) + + +class BaseTransformerLayer(ABC): + """ A common parent class for `TransformerLayer` like implementations. + + A dummy class that is subclassed by similar `TransformerLayer`s e.g. the + `TransformerLayer` in this file and possibly other `TransformerLayer` + implementations that aim to use `TransformerBlock` as the base module. + The main purpose is to check if any layer (or module) provided in the spec + is a subclass of this class to allow fanning-out of that spec for all the + layers in the `TransformerBlock`. See `_get_block_submodules` method + implementation in `transformer_block.py` file for more details. + """ + + def __init__(self): + pass + + +class TransformerLayer(MegatronModule, BaseTransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + hidden_dropout: float = None, + ): + super().__init__(config=config) + self.submodules_config = submodules + + self.layer_number = layer_number + self._get_layer_offset() + self.hidden_dropout = config.hidden_dropout if hidden_dropout is None else hidden_dropout + + ## [Module 1: Input Layernorm] Optional Layernorm on the input data + # TODO: add pytorch only layernorm + self.input_layernorm = build_module( + submodules.input_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + + ## [Module 2: SelfAttention] + self.self_attention = build_module( + submodules.self_attention, config=self.config, layer_number=layer_number, + ) + + ## [Module 3: BiasDropoutFusion] + self.self_attn_bda = build_module(submodules.self_attn_bda) + + ## [Module 4: Post SelfAttention] Optional Layernorm after self-attn + self.pre_cross_attn_layernorm = build_module( + submodules.pre_cross_attn_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + + ## [Module 5: CrossAttention] + self.cross_attention = build_module( + submodules.cross_attention, config=self.config, layer_number=layer_number, + ) + + ## [Module 6: BiasDropoutFusion] + self.cross_attn_bda = build_module(submodules.cross_attn_bda, config=self.config,) + + ## [Module 7: Pre MLP] Optional Layernorm before MLP + self.pre_mlp_layernorm = build_module( + submodules.pre_mlp_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + + ## [Module 8: MLP block] + # TODO how to set the gpt_layer_spec.py when we have moe_frequency > 1, + # where MLP and MoE layer both appear alternately? + self.mlp = build_module(submodules.mlp, config=self.config) + if hasattr(self.mlp, 'set_layer_number'): + self.mlp.set_layer_number(self.layer_number) + + ## [Module 9: BiasDropoutFusion] + self.mlp_bda = build_module(submodules.mlp_bda) + + # @jcasper how should we handle nvfuser? + # Set bias+dropout+add fusion grad_enable execution handler. + # TORCH_MAJOR = int(torch.__version__.split('.')[0]) + # TORCH_MINOR = int(torch.__version__.split('.')[1]) + # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) + # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad + self.bias_dropout_add_exec_handler = torch.enable_grad + + def _get_layer_offset(self): + + pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() + + num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + total_num_layers = self.config.num_layers + num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size + total_virtual_chunks = total_num_layers // vp_size + offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank) + + else: + # Each stage gets a contiguous set of layers. + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + offset = pipeline_rank * num_layers_per_pipeline_rank + else: + offset = 0 + + return offset + + def forward( + self, + hidden_states, + attention_mask, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + ): + # hidden_states: [s, b, h] + + # Residual connection. + residual = hidden_states + + # Optional Input Layer norm + input_layernorm_output = self.input_layernorm(hidden_states) + + # Self attention. + attention_output_with_bias = self.self_attention( + input_layernorm_output, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + packed_seq_params=packed_seq_params, + ) + + # TODO: could we move `bias_dropout_add_exec_handler` itself + # inside the module provided in the `bias_dropout_add_spec` module? + with self.bias_dropout_add_exec_handler(): + hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)( + attention_output_with_bias, residual, self.hidden_dropout + ) + + # Residual connection. + residual = hidden_states + + # Optional Layer norm after self-attention + pre_cross_attn_layernorm_output = self.pre_cross_attn_layernorm(hidden_states) + + # Cross attention. + attention_output_with_bias = self.cross_attention( + pre_cross_attn_layernorm_output, + attention_mask=context_mask, + key_value_states=context, + inference_params=inference_params, + ) + + if isinstance(attention_output_with_bias, dict) and "context" in attention_output_with_bias: + context = attention_output_with_bias["context"] + + # TODO: could we move `bias_dropout_add_exec_handler` itself + # inside the module provided in the `bias_dropout_add_spec` module? + with self.bias_dropout_add_exec_handler(): + hidden_states = self.cross_attn_bda(self.training, self.config.bias_dropout_fusion)( + attention_output_with_bias, residual, self.hidden_dropout + ) + + # Residual connection. + residual = hidden_states + + # Optional Layer norm post the cross-attention. + pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states) + + # MLP. + mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output) + + # TODO: could we move `bias_dropout_add_exec_handler` itself + # inside the module provided in the `bias_dropout_add_spec` module? + with self.bias_dropout_add_exec_handler(): + hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)( + mlp_output_with_bias, residual, self.hidden_dropout + ) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = make_viewless_tensor( + inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True + ) + + return output, context + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[dict] = None + ) -> ShardedStateDict: + sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata) + prefixed_map = { + f'{prefix}{k}': f'{prefix}{v}' + for k, v in self.submodules_config.sharded_state_dict_keys_map.items() + } + if prefixed_map: + apply_prefix_mapping(sharded_state_dict, prefixed_map) + return sharded_state_dict diff --git a/megatron/core/transformer/utils.py b/megatron/core/transformer/utils.py new file mode 100644 index 0000000..025f7c2 --- /dev/null +++ b/megatron/core/transformer/utils.py @@ -0,0 +1,188 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for transformer layers.""" +from functools import lru_cache +from operator import itemgetter +from typing import Any, Dict, Iterable, Iterator, Optional, Tuple, Union + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedStateDict, StateDict +from megatron.core.jit import jit_fuser +from megatron.core.utils import ( + make_sharded_tensor_for_checkpoint, + make_tp_sharded_tensor_for_checkpoint, +) + + +def get_linear_layer(rows, columns, init_method, perform_initialization=True): + """Simple linear layer with weight initialization.""" + layer = torch.nn.Linear(rows, columns) + if perform_initialization: # Take from modelparallel config + init_method(layer.weight) + with torch.no_grad(): + layer.bias.zero_() + return layer + + +@lru_cache(maxsize=32) +def get_default_causal_mask(sq: int) -> torch.Tensor: + """Return the causal upper triangular mask for softmax input.""" + return torch.triu(torch.ones(sq, sq, device="cuda"), diagonal=1).bool() + + +def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + + +@jit_fuser +def gelu_impl(x): + """OpenAI's gelu implementation.""" + return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) + + +def openai_gelu(x): + return gelu_impl(x) + + +# This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter +@jit_fuser +def erf_gelu(x): + return ( + x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype)) + ) + + +def make_sharded_tensors_for_checkpoint( + state_dict: StateDict, + prefix: str, + tensor_parallel_layers_axis_map: Optional[Dict[str, int]] = None, + sharded_offsets: Iterable[Tuple[int, int, int]] = (), + extra_state_suffix: str = '_extra_state', +): + """Wraps tensors from transformer layers with ShardedTensor or ShardedObject. + + For a given `state_dict`, wraps: + - all _extra_states with ShardedObject + - all tensors specified in tensor_parallel_layers_axis_map with TP and DP sharded ShardedTensor + - other values with DP sharded ShardedTensor + + Args: + state_dict (StateDict): state_dict to convert + prefix (str): prefix appended to keys in final state dict + tensor_parallel_layers_axis_map (Dict[str, int], optional): dict mapping layer + names to the axis for TP sharding + sharded_offsets (Iterable[Tuple[int, int, int]], optional): sharding already + applied (e.g. PP related), passed along to ShardedTensor + extra_state_suffix (str, default = '_extra_state'): layers with this + suffix will be wrapped with ShardedObject instead of ShardedTensor. + + """ + + if tensor_parallel_layers_axis_map is None: + tensor_parallel_layers_axis_map = {} + + sharded_state_dict = {} + for layer_name in state_dict.keys(): + tensor = state_dict[layer_name] + layer_key = f'{prefix}{layer_name}' + + if layer_name.endswith(extra_state_suffix): + sharded_state_dict[layer_key] = make_sharded_object_for_checkpoint( + tensor, layer_key, sharded_offsets + ) + + elif layer_name in tensor_parallel_layers_axis_map: + tp_axis = tensor_parallel_layers_axis_map[layer_name] + sharded_state_dict[layer_key] = make_tp_sharded_tensor_for_checkpoint( + tensor, layer_key, tp_axis, prepend_offsets=sharded_offsets, + ) + + else: + sharded_state_dict[layer_key] = make_sharded_tensor_for_checkpoint( + tensor, layer_key, prepend_offsets=sharded_offsets, + ) + + return sharded_state_dict + + +def make_sharded_object_for_checkpoint( + obj: Any, + key: str, + sharded_offsets: Iterable[Tuple[int, int, int]] = (), + replica_id: Union[None, int, Tuple[int, ...]] = None, + **kwargs, +): + """ Helper for instantiating a non-sharded ShardedObject (replicated across TP and DP group). + + Args: + obj (object): any object to be sharded + key (str): unique identifier of the object + sharded_offsets (Iterable[Tuple[int, int, int]]): offsets normally + prepended to ShardedTensors, will be used as global offsets for + ShardedObject + replica_id (Union[None, int, Tuple[int, ...]]): replica id + """ + if replica_id is None: + replica_id = ( + 0, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_data_parallel_rank(with_context_parallel=True), + ) + + return ShardedObject(key, obj, *_get_extra_state_offsets(sharded_offsets), replica_id, **kwargs) + + +def _get_extra_state_offsets( + sharded_offsets: Iterable[Tuple[int, int, int]] +) -> Tuple[Tuple[int, ...], Tuple[int, ...]]: + """ Turns ShardedTensor offsets into offsets suitable for ShardedObject. """ + if sharded_offsets: + sharded_offsets = sorted(sharded_offsets, key=itemgetter(0)) # sort by axis + axis, extra_state_offset, extra_state_shape = zip(*sharded_offsets) + assert list(axis) == list( + range(len(axis)) + ), f'Expected contiguous axis for offsets: {sharded_offsets}' + else: + extra_state_shape = (1,) + extra_state_offset = (0,) + return extra_state_shape, extra_state_offset + + +def sharded_state_dict_default( + module: torch.nn.Module, + prefix: str = '', + sharded_offsets: Tuple[Tuple[int, int, int]] = (), + metadata: Optional[dict] = None, +) -> ShardedStateDict: + """Provides implementation for sharded_state_dict method for non-MegatronModules. + + Tries to call `module.sharded_state_dict` when possible, + otherwise uses regular state dict and assumes tensors are replicated across TP and DP. + + `keep_vars=True` is passed to module.state_dict so that optimizer states + can be sharded later on. + + Args: + module (torch.nn.Module): module which sharded state dict we want to obtain + prefix (str): prefix for the state dict keys + sharded_offsets (Tuple[Tuple[int, int, int]], optional): sharding already + applied (e.g. PP related) by sup-modules. Passed along to ShardedTensor + metadata (dict, optional): metadata passed to module sharded_state_dict method + + Returns: + dict: dictionary of state dict keys mapped to ShardedTensors + """ + + if hasattr(module, 'sharded_state_dict'): + module_sharded_sd = module.sharded_state_dict( + prefix=prefix, sharded_offsets=sharded_offsets, metadata=metadata + ) + else: + module_sd = module.state_dict(prefix='', keep_vars=True) + module_sharded_sd = make_sharded_tensors_for_checkpoint( + module_sd, prefix, {}, sharded_offsets, + ) + return module_sharded_sd diff --git a/megatron/core/utils.py b/megatron/core/utils.py new file mode 100644 index 0000000..818bb34 --- /dev/null +++ b/megatron/core/utils.py @@ -0,0 +1,1242 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Utility functions used throughout Megatron core""" +import array +import hashlib +import logging +import math +import operator +import queue +import socket +import sys +import threading +import time +import traceback +from dataclasses import dataclass +from datetime import datetime +from functools import reduce +from types import TracebackType +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedTensor + +logger = logging.getLogger(__name__) + + +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator) + + +def divide(numerator, denominator): + """Ensure that numerator is divisible by the denominator and return + the division value.""" + ensure_divisibility(numerator, denominator) + return numerator // denominator + + +def get_attr_wrapped_model(model, attr, allow_none=True, return_model_obj=False): + """Get an attribute from a wrapped model. + If return_model_obj is true, return the object that has the 'attr' attribute; + otherwise, return the attribute directly.""" + if isinstance(model, list): + raise RuntimeError("_get_attr_wrapped_model given a list of models") + + if allow_none: + + def condition(model, attr): + return not hasattr(model, attr) + + else: + + def condition(model, attr): + return getattr(model, attr, None) is None + + while condition(model, attr): + if not hasattr(model, "module"): + raise RuntimeError(f"_get_attr_wrapped_model couldn't find attribute {attr}") + + model = model.module + + if return_model_obj: + return model + return getattr(model, attr) + + +def get_model_type(model): + return get_attr_wrapped_model(model, 'model_type') + + +def get_model_config(model): + return get_attr_wrapped_model(model, 'config', allow_none=False) + + +class GlobalMemoryBuffer: + """Global buffer to avoid dynamic memory allocations. + Caller should ensure that buffers of the same name + are not used concurrently.""" + + def __init__(self): + self.buffer = {} + + def get_tensor(self, tensor_shape, dtype, name): + required_len = reduce(operator.mul, tensor_shape, 1) + if ( + self.buffer.get((name, dtype), None) is None + or self.buffer[(name, dtype)].numel() < required_len + ): + self.buffer[(name, dtype)] = torch.empty( + required_len, dtype=dtype, device=torch.cuda.current_device(), requires_grad=False + ) + + return self.buffer[(name, dtype)][0:required_len].view(*tensor_shape) + + +def _kernel_make_viewless_tensor(inp, requires_grad): + '''Make a viewless tensor. + + View tensors have the undesirable side-affect of retaining a reference + to the originally-viewed tensor, even after manually setting the '.data' + field. This method creates a new tensor that links to the old tensor's + data, without linking the viewed tensor, referenced via the '._base' + field. + ''' + out = torch.empty( + (1,), + dtype=inp.dtype, + device=inp.device, + requires_grad=requires_grad, + ) + out.data = inp.data + return out + + +class MakeViewlessTensor(torch.autograd.Function): + ''' + Autograd function to make a viewless tensor. + + This function should be used in cases where the computation graph needs + to be propagated, but we only want a viewless tensor (e.g., + ParallelTransformer's hidden_states). Call this function by passing + 'keep_graph = True' to 'make_viewless_tensor()'. + ''' + + @staticmethod + def forward(ctx, inp, requires_grad): + return _kernel_make_viewless_tensor(inp, requires_grad) + + @staticmethod + def backward(ctx, grad_output): + return grad_output, None + + +def make_viewless_tensor(inp, requires_grad, keep_graph): + ''' + Entry-point for creating viewless tensors. + + This method should be used, rather than calling 'MakeViewlessTensor' + or '_kernel_make_viewless_tensor' directly. This method acts as a + switch for determining if an autograd function or a regular method + should be used to create the tensor. + ''' + + # return tensor as-is, if not a 'view' + if inp._base is None: + return inp + + # create viewless tensor + if keep_graph: + return MakeViewlessTensor.apply(inp, requires_grad) + else: + return _kernel_make_viewless_tensor(inp, requires_grad) + + +def assert_viewless_tensor(tensor, extra_msg=None): + '''Assert that a tensor is not a view (i.e., its '._base' field is + not set).''' + if isinstance(tensor, list): + [assert_viewless_tensor(t) for t in tensor] + return tensor + if not isinstance(tensor, torch.Tensor): + return tensor + assert tensor._base is None, ( + "Ensure tensor._base is None before setting tensor.data or storing " + "tensor to memory buffer. Otherwise, a memory leak will occur (and " + "likely accumulate over iterations). %s" + ) % extra_msg + return tensor + + +def safely_set_viewless_tensor_data(tensor, new_data_tensor): + '''Safely set tensor's '.data' field. + + Check first that the tensor is viewless (i.e., '._base' not set). If not, + raise an exception. + ''' + assert_viewless_tensor( + tensor, + extra_msg="FYI, tensor._base has shape %s, and new_data_tensor has shape %s." + % ("--" if tensor._base is None else tensor._base.shape, new_data_tensor.shape), + ) + tensor.data = new_data_tensor + + +def init_method_normal(sigma): + """Init method based on N(0, sigma).""" + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, **kwargs: Any): + """If torch distributed is initialized, log only on rank + + Args: + logger (logging.Logger): The logger to write the logs + + args (Tuple[Any]): All logging.Logger.log positional arguments + + rank (int, optional): The rank to write on. Defaults to 0. + + kwargs (Dict[str, Any]): All logging.Logger.log keyword arguments + """ + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == rank: + logger.log(*args, **kwargs) + else: + logger.log(*args, **kwargs) + + +def log_on_each_pipeline_stage(logger: logging.Logger, *args: Any, **kwargs: Any): + """Log on first rank in each pipeline stage + + Args: + logger (logging.Logger): The logger to write the logs + + args (Tuple[Any]): All logging.Logger.log positional arguments + + kwargs (Dict[str, Any]): All logging.Logger.log keyword arguments + """ + assert torch.distributed.is_initialized() + + if ( + parallel_state.get_data_parallel_rank(with_context_parallel=True) == 0 + and parallel_state.get_tensor_model_parallel_rank() == 0 + ): + logger.log(*args, **kwargs) + + +def check_param_hashes_across_dp_replicas(model: List[torch.nn.Module]) -> bool: + """Computes hashes of all parameters in model, all-gathers hashes across DP replicas, + and then checks for equality between the locally-computed hashes and the hashes + from DP replica 0. + + NOTE: This function computes SHA-1 hashes on the CPU and thus needs to move all param + tensors from GPU to CPU first; as a result, this function is not intended to be called + very frequently in the main training loop. + + Args: + model (List[torch.nn.Module]): List of model chunks whose parameter hashes need to + be checked. + + Returns: + True if all param hashes match with corresponding hash on DP replica 0, False + otherwise. + """ + + # Compute per-parameter hashes on this rank. + params = [] + local_param_hashes = [] + for model_chunk_id, model_chunk in enumerate(model): + for param_name, param in model_chunk.named_parameters(): + param_hash = torch.frombuffer( + array.array( + 'B', hashlib.sha1(param.data.to("cpu").float().numpy(force=True)).digest() + ), + dtype=torch.uint8, + ) + params.append((model_chunk_id, param_name, param)) + local_param_hashes.append(param_hash) + local_param_hashes = torch.stack(local_param_hashes) + + # Collect per-parameter hashes across all ranks in DP group. + all_param_hashes = [ + torch.zeros_like(local_param_hashes) + for _ in range(parallel_state.get_data_parallel_world_size()) + ] + torch.distributed.all_gather( + all_param_hashes, local_param_hashes, group=parallel_state.get_data_parallel_group_gloo() + ) + + # Make sure local per-parameter hash matches DP rank 0. + param_hashes_match = torch.equal(local_param_hashes, all_param_hashes[0]) + if not param_hashes_match: + for i, (model_chunk_id, param_name, param) in enumerate(params): + if not torch.equal(local_param_hashes[i], all_param_hashes[0][i]): + rank = torch.distributed.get_rank() + logger.info( + f"[Rank {rank}] Hash not matching for {param_name} in model chunk {model_chunk_id}" + ) + return param_hashes_match + + +def make_tp_sharded_tensor_for_checkpoint( + tensor, key, tp_axis=0, replica_id=None, prepend_offsets=(), **kwargs +): + """Helper for instantiating a ShardedTensor where the `tp_axis` dimension is sharded across TP group. + + Optionally, can provide offsets which prepend new dimensions to the tensor. + """ + + prepend_axis_num = len(prepend_offsets) + + if replica_id is None: + replica_id = (0, 0, parallel_state.get_data_parallel_rank(with_context_parallel=True)) + + return ShardedTensor.from_rank_offsets( + key, + tensor, + *prepend_offsets, + ( + tp_axis + prepend_axis_num, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ), + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + **kwargs, + ) + + +def make_sharded_tensor_for_checkpoint(tensor, key, prepend_offsets=(), replica_id=None, **kwargs): + """Helper for instantiating a non-sharded ShardedTensor (replicated across TP and DP group). + + Optionally, can provide offsets which prepend new dimensions to the tensor. + """ + + prepend_axis_num = len(prepend_offsets) + + if replica_id is None: + replica_id = ( + 0, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_data_parallel_rank(with_context_parallel=True), + ) + + return ShardedTensor.from_rank_offsets( + key, + tensor, + *prepend_offsets, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + **kwargs, + ) + + +def prepare_input_tensors_for_wgrad_compute(grad_output, all_gathered_input): + + # Doing gather + slicing during the NeMo forward pass can make this tensor + # not be contiguous. PyTorch only checks if the tensor is contiguous, and only + # clones it if it's not contiguous: + # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761 + grad_output = grad_output.contiguous() + # Convert the tensor shapes to 2D for execution compatibility + if grad_output.dim() == 3: + grad_output = grad_output.view( + grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2] + ) + all_gathered_input = all_gathered_input.view( + all_gathered_input.shape[0] * all_gathered_input.shape[1], all_gathered_input.shape[2] + ) + + return grad_output, all_gathered_input + + +def drain_embedding_wgrad_compute(config, embedding_activation_buffer, grad_output_buffer, weight): + """Helper for performing embedding wgrad GEMM's during the pipeline drain phase, pipelines the AllGather and GEMM's. + + Should only be used when pipeline model parallelism and gradient accumulation fusion are enabled. + """ + + assert len(embedding_activation_buffer) == len( + grad_output_buffer + ), "Length of activation and gradient buffers need to be equal!" + + import fused_weight_gradient_mlp_cuda + + from megatron.core.parallel_state import ( + get_global_memory_buffer, + get_tensor_model_parallel_group, + get_tensor_model_parallel_world_size, + ) + + input = embedding_activation_buffer.pop(0) + world_size = get_tensor_model_parallel_world_size() + dim_size = list(input.size()) + dim_size[0] = dim_size[0] * world_size + + all_gathered_input = [None, None] + if config.sequence_parallel: + all_gather_buffer = get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu_0") + handle = torch.distributed._all_gather_base( + all_gather_buffer, input, group=get_tensor_model_parallel_group(), async_op=False + ) + + all_gathered_input[0] = all_gather_buffer + all_gather_buffer = None + else: + all_gathered_input[0] = input + + input = None + + def wgrad_compute(all_gathered_input, grad_output, weight): + + grad_output, all_gathered_input = prepare_input_tensors_for_wgrad_compute( + grad_output, all_gathered_input + ) + + if config.gradient_accumulation_fusion: + if weight.main_grad.dtype == torch.float32: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32( + all_gathered_input, grad_output, weight.main_grad + ) + elif weight.main_grad.dtype in (torch.float16, torch.bfloat16): + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16( + all_gathered_input, grad_output, weight.main_grad + ) + else: + raise RuntimeError("Unsupported gradient type for gradient accumulation fusion") + + # We have all_gathered_input list acting as a double buffer here, + # since we are pipelining the AllGather and GEMM,one buffer all gathers + # the input while the other buffer reads from it for the GEMM. We use i + # and (i+1) for indexing to enable this double buffering. + for i in range(len(embedding_activation_buffer)): + input = embedding_activation_buffer.pop(0) + if config.sequence_parallel: + name = "mpu_" + str((i + 1) % 2) + all_gather_buffer = get_global_memory_buffer().get_tensor(dim_size, input.dtype, name) + handle = torch.distributed._all_gather_base( + all_gather_buffer, input, group=get_tensor_model_parallel_group(), async_op=True + ) + + all_gathered_input[(i + 1) % 2] = all_gather_buffer + all_gather_buffer = None + else: + all_gathered_input[(i + 1) % 2] = input + + grad_output = grad_output_buffer.pop(0) + wgrad_compute(all_gathered_input[i % 2], grad_output, weight) + drain_idx = (i + 1) % 2 + input, all_gathered_input[i % 2], grad_output = None, None, None + + if config.sequence_parallel: + handle.wait() + + grad_output = grad_output_buffer.pop(0) + wgrad_compute(all_gathered_input[drain_idx], grad_output, weight) + input, all_gathered_input[drain_idx], grad_output = None, None, None + + +def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args): + return op(2048 * 32, noop_flag_buffer, tensor_lists, *args) + + +## computes l2 norm for a list of contiguous tensors +## works as a drop-in replacement for amp_C.multi_tensor_l2norm +def local_multi_tensor_l2_norm(chunk_size, noop_flag, tensor_lists, per_tensor, *args): + l2 = [[(torch.norm(tensor)) for tensor in tensor_list] for tensor_list in tensor_lists] + l2_reduced = torch.norm(torch.tensor(l2)) + l2_cuda = torch.tensor([float(l2_reduced)], dtype=torch.float, device='cuda') + return l2_cuda, None + + +## works as a drop-in replacement for amp_C.multi_tensor_scale +def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale): + inputs, targets = tensor_lists[0], tensor_lists[1] + if inputs == targets: + for i in range(len(targets)): + ## for parity with apex implementation + targets[i] *= scale + else: + for i in range(len(targets)): + targets[i] = inputs[i] * scale + + +class _ValueWithRank: + """This is an internal class, not for use outside this module + + Attributes: + _rank (int): rank for the value + _value (float) : the value it stores, eg elapsed time + _unit (str) : unit for the value + """ + + def __init__(self, value: float, rank: int, unit: str = "") -> None: + """Initializer + + Args: + _value (float): the initial value with which it is inited + _rank (int): the rank number + _unit (str) : the unit of the value, eg ms or flops + """ + self._rank = rank + self._value = value + self._unit = unit + + def __lt__(self, other) -> bool: + """Check if value of self is smaller than other's value + + Args: + other (_ValueWithRank): The other object to compare with + + Returns: + bool: True if lhs._value of operand is less than rhs._value, else False + """ + return self._value < other._value + + def __gt__(self, other) -> bool: + """Check if value of self is larger than other's value + + Args: + other (_ValueWithRank): The other object to compare with + + Returns: + bool: True if lhs._value of operand is greater than rhs._value, else False + """ + return self._value > other._value + + def __call__(self) -> Tuple[float, int, str]: + """Returns the value, the rank, and unit as a Tuple + + Returns: + Tuple[float, int, str]: value, rank, unit + """ + return self._value, self._rank, self._unit + + def __str__(self) -> str: + """String representation of the object + + Returns: + str: strigified object + """ + + return f"{self._value:.2f}{self._unit}/{self._rank}" + + +@dataclass +class _StragglerData: + """This is an internal dataclass, not for use outside this module + + Attributes: + min_elapsed (_ValueWithRank) min iteration time across all ranks + max_elapsed (_ValueWithRank) max iteration time across all ranks + min_btime (_ValueWithRank) min cpu time across all ranks + max_btime (_ValueWithRank) max cpu time across all ranks + min_temp (_ValueWithRank): min gpu temp across all ranks + max_temp (_ValueWithRank): max gpu temp across all ranks + min_power (_ValueWithRank) min gpu power across all ranks + max_power (_ValueWithRank) max gpu power across all ranks + min_util (_ValueWithRank): min gpu util across all ranks + max_util (_ValueWithRank): max gpu util across all ranks + min_clock (_ValueWithRank): min gpu clock across all ranks + max_clock (_ValueWithRank) max gpu clock across all ranks + aflops (List[_ValueWithRank]): sorted array of (_ValueWithRank) + """ + + # gemm time + min_elapsed = _ValueWithRank(sys.float_info.max, 0, "ms") + max_elapsed = _ValueWithRank(sys.float_info.min, 0, "ms") + # get_batch time + min_btime = _ValueWithRank(sys.float_info.max, 0, "us") + max_btime = _ValueWithRank(sys.float_info.min, 0, "us") + # temp + min_temp = _ValueWithRank(sys.float_info.max, 0, "C") + max_temp = _ValueWithRank(sys.float_info.min, 0, "C") + # power + min_power = _ValueWithRank(sys.float_info.max, 0, "W") + max_power = _ValueWithRank(sys.float_info.min, 0, "W") + # util + min_util = _ValueWithRank(sys.float_info.max, 0, "%") + max_util = _ValueWithRank(sys.float_info.min, 0, "%") + # clock + min_clock = _ValueWithRank(sys.float_info.max, 0, "MHz") + max_clock = _ValueWithRank(sys.float_info.min, 0, "MHz") + aflops: Union[List[_ValueWithRank], None] = None + + +class StragglerDetector: + """Singleton Class implementing per rank Straggler Detector + + It use cuda events to time operation of choice using the + start and stop methods which can be directly invoked using + the class instance or can be used like a python context. + After collection, a report() method is available to display + the collected metrics. It is only supported if CUDA is + available. megatron/core/README_STRAGGLER.md for more info + + Note: + The instance and class attributes mentioned below are all + private to the class and has no use outside the class + + Attributes: + _off (bool): current state of the toggle + start (FunctionType): start method + stop (FunctionType): stop method + world (int): world size + rank (int): rank for this instance + mmcnt (int): number of ranks to report + port (int): control port + amp (float): amplification factor for TFLOPs, default 3.0 + toggle (bool): whether to start/stop detector collection + bdata (bool): when true, just collect get_batch + dev (int): cuda device + evt_q (LifoQueue): cuda event queue + start_gemm_ev (list[torch.cuda.Event]): cuda start event + stop_gemm_ev (list[torch.cuda.Event]): cuda stop event + start_data_ev (list[torch.cuda.Event]): cuda start event + stop_data_ev (list[torch.cuda.Event]): cuda stop event + start_gemm_tm (list[int]): start time (wallclock) + stop_gemm_tm (list[int]): stop time (wallclock) + start_data_tm (list[int]): start time for get_batch + stop_data_tm (list[int]): stop time for get_batch + sock (socket): the controller socket + ctrlr (Thread): the controller thread + """ + + _configured = False + """Indicates if the singleton instance is configured or not + """ + + def __new__(cls: Type["StragglerDetector"]) -> "StragglerDetector": + """Constructor + Creates an instance of the class if not created + + Args: + cls (Type['StragglerDetector']): The class type + + Returns: + StragglerDetector: the class instance + """ + + if not hasattr(cls, "_instance"): + cls._instance = super(StragglerDetector, cls).__new__(cls) + return cls._instance + + def __init__(self) -> None: + """Initializer + + The inital state of the StragglerDetector instance is disabled. + The enabled state is indicated using self._off member variable + and the proerty enabled. + """ + self._off: bool = True + self.start = self.null_method + self.stop = self.null_method + self.world: int = 0 + self.rank: int = 0 + self.mmcnt: int = 1 + self.port: int = 0 + self.amp: float = 3.0 + self.toggle: bool = False + self.bdata: bool = False + self.dev: Union[torch.device, int, None] = None + self.evt_q: Union[queue.LifoQueue, None] = None + self.start_gemm_ev: List[torch.cuda.Event] = [] + self.stop_gemm_ev: List[torch.cuda.Event] = [] + self.start_data_ev: List[torch.cuda.Event] = [] + self.stop_data_ev: List[torch.cuda.Event] = [] + self.start_gemm_tm: List[int] = [] + self.stop_gemm_tm: List[int] = [] + self.start_data_tm: List[int] = [] + self.stop_data_tm: List[int] = [] + self.sock: Union[socket.socket, None] = None + self.ctrlr: Union[threading.Thread, None] = None + + def configure( + self, + world: int, + rank: int, + mmcnt: int = 1, + amp: float = 3.0, + port: int = 65535, + prefill: int = 1024, + enabled: bool = False, + ) -> None: + """This method is called to configure the Singleton instance + + It should be called once per instantiation per process. + + Note: + The constructor keeps the state of instance disabled + i.e no collection will happen even when start/stop methods are + called. Only when enabled is True (self._off is True), the + start/stop method pointers get assigned the real collection + methods, otherwise they are initialized with null_method + + Args: + world (int): World Size + rank (int): The rank of this trainer + mmcnt (int, optional): Number of ranks to print for showing Min/Max Etpt. + Defaults to 1. + amp (float, optional): Set to 3.0 if we only use timers in fwd pass. + Defaults to 3.0. + port (int, optional): Control port, useful only for rank-0. Defaults to 65535. + prefill (int, optional): Howmany Events to pre-populate. Defaults to 1024. + enabled (bool, optional): Whether or not collection is enabled on startup. + Defaults to False. + """ + if StragglerDetector._configured: + # don't throw + return + StragglerDetector._configured = True + self.bdata = False + self.start = self.null_method + self.stop = self.null_method + self._off = True + # No CUDA, No Support + if torch.cuda.is_available(): + self._off = not enabled + self.world = world + self.rank = rank + self.mmcnt = mmcnt if mmcnt > 1 else 1 + self.amp = amp + self.port = port + self.toggle = False + self.bdata = False + self.evt_q = queue.LifoQueue() + self.start_gemm_ev = [] + self.stop_gemm_ev = [] + self.start_data_ev = [] + self.stop_data_ev = [] + self.start_gemm_tm = [] + self.stop_gemm_tm = [] + self.start_data_tm = [] + self.stop_data_tm = [] + backend = torch.distributed.get_backend() + if backend == "nccl": + self.dev = torch.cuda.current_device() + else: + self.dev = torch.device("cpu") + # cache some events + for _ in range(prefill): + self.evt_q.put(torch.cuda.Event(enable_timing=True)) + if self.rank == 0: + # Start the controller + self._controller() + if not self._off: + self.start = self.start_method + self.stop = self.stop_method + + def reset(self) -> None: + """This method is called to reset the metrics state of the instance + + It is generally called from within elapsed() after extracting per rank metrics. + """ + if self._off: + return + # Pool them + if self.evt_q is not None: + _ = [self.evt_q.put(ev) for ev in self.start_gemm_ev] + _ = [self.evt_q.put(ev) for ev in self.stop_gemm_ev] + _ = [self.evt_q.put(ev) for ev in self.start_data_ev] + _ = [self.evt_q.put(ev) for ev in self.stop_data_ev] + self.start_gemm_ev = [] + self.stop_gemm_ev = [] + self.start_data_ev = [] + self.stop_data_ev = [] + # Use regular timers + self.start_gemm_tm = [] + self.stop_gemm_tm = [] + self.start_data_tm = [] + self.stop_data_tm = [] + self.bdata = False + + def start_method(self) -> None: + """This method adds the start timers. + + Both cuda event and perf_counter are added. If bdata is set to + true from __call__, this method skips inserting cuda + timer. This way it can be used to measure time spent on + CPU - generally useful for timing get_batch() + """ + # Not reentrant + if self.evt_q is not None and self.evt_q.qsize() > 1: + sev = self.evt_q.get() # no try-catch + eev = self.evt_q.get() # no try-catch + else: + sev = torch.cuda.Event(enable_timing=True) + eev = torch.cuda.Event(enable_timing=True) + # First check if this start is for data + if self.bdata: + self.start_data_ev.append(sev) + self.stop_data_ev.append(eev) + self.start_data_tm.append(0) + self.stop_data_tm.append(0) + idx = len(self.stop_data_tm) - 1 + self.start_data_tm[idx] = time.perf_counter_ns() + self.start_data_ev[idx].record() + self.bdata = False + return + self.start_gemm_ev.append(sev) + self.stop_gemm_ev.append(eev) + self.start_gemm_tm.append(0) + self.stop_gemm_tm.append(0) + idx = len(self.stop_gemm_tm) - 1 + self.start_gemm_tm[idx] = time.perf_counter_ns() + self.start_gemm_ev[idx].record() + + def stop_method(self) -> None: + """This method adds the stop timers. + + Both cuda event and perf_counter are added. If bdata is set to + true from __call__, this method skips inserting cuda + timer. Also see start_method() + """ + # Not reentrant + # First check if this stop is for data + idx = len(self.stop_data_tm) - 1 + if idx >= 0 and self.stop_data_tm[idx] == 0: + self.stop_data_tm[idx] = time.perf_counter_ns() + self.stop_data_ev[idx].record() + return + idx = len(self.stop_gemm_tm) - 1 + if idx >= 0 and self.stop_gemm_tm[idx] == 0: + self.stop_gemm_tm[idx] = time.perf_counter_ns() + self.stop_gemm_ev[idx].record() + + def elapsed(self) -> Tuple[float, float, int, int, int, int]: + """This method is called from report(), or can be called directly + + It is called to collect all the elapsed time since last reset(). + It finally calls reset() + + Returns: + Tuple[float, float, int, int, int, int]: see below for returns + delta : time spent in kernel + batch_delta : time spent in get_batch + temp : observed gpu temp + power : observed gpu power + util : observed gpu utilization + clock : observed gpu clock + """ + if self._off: + # match with return below + return 0, 0, 0, 0, 0, 0 + ls_ev = len(self.start_gemm_ev) + le_ev = len(self.stop_gemm_ev) + ls_bs = len(self.start_data_ev) + ls_be = len(self.stop_data_ev) + delta = 0.0 + batch_delta = 0.0 + temp = 0 + power = 0 + clock = 0 + if ls_ev != le_ev: + logger.warning(f"Event Start/Stop out of sync {ls_ev}/{le_ev}") + elif ls_bs != ls_be: + logger.warning(f"get_batch Start/Stop out of sync {ls_bs}/{ls_be}") + else: + temp = torch.cuda.temperature() + power = torch.cuda.power_draw() + util = torch.cuda.utilization() + clock = torch.cuda.clock_rate() + torch.cuda.synchronize() + # Process Events + for i in range(ls_ev): + e_ev = self.start_gemm_ev[i].elapsed_time(self.stop_gemm_ev[i]) + e_tm = (self.stop_gemm_tm[i] - self.start_gemm_tm[i]) / 1e6 # ns to ms + # Pick the larger of Event and perf_counter time? + delta += max(e_ev, e_tm) + # Process get_batch + for i in range(ls_bs): + b_ev = self.start_data_ev[i].elapsed_time(self.stop_data_ev[i]) + b_tm = (self.stop_data_tm[i] - self.start_data_tm[i]) / 1e6 # ns to ms + # data fetching has prefetch, hence take the max, instead of avg + batch_delta = max(batch_delta, max(b_ev, b_tm)) + self.reset() # Prepare for next round + # time in ms, batch_delta in ms, check return above + return delta, batch_delta, temp, power, util, clock + + def report(self, total_flops: float = 0.0, log_interval: int = 0) -> bool: + """Function to log the min/max metircs and the associated rank over a time period + + It finds the slowest and fastest rank among all ranks. It should be + called by all ranks, but only rank-0 prints the analysis + At the end it checks, if the straggler detector should + remain active or if it should be deactivated. + + Args: + total_flops (float, optional): The theoretical flops over the period. Defaults to 0.0. + log_interval (int, optional): The training interval over which reporting is called(ms) + Defaults to 0. + + Returns: + bool: True if reported, else False + """ + ret = False + if not self._off and total_flops > 0.0 and log_interval > 0: + elapsed, btime, temp, power, util, clock = self.elapsed() # get raw time + # btime (get_batch time is max in the iteration) + ptime = elapsed / (log_interval * 1.0) # avg per iteration elapsed time, ms + api_flops = total_flops / (log_interval * 1.0) # avg per iteration flops, ms + apir_flops = api_flops / ( + ptime * 10**9 * self.world + ) # this is avg per iteration this rank's thruput, TFLOP/s (note 10**9), + et_flops = apir_flops / self.amp # Estimated TFLOPs, not tracing backward + + o_dt = self._min_max( + ptime, + btime, + float(temp), + float(power), + float(util), + float(clock), + et_flops, + ) + if self.rank == 0 and o_dt is not None and o_dt.aflops is not None: + now = f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]" + min_flops, min_frank, _ = o_dt.aflops[0]() + max_flops, max_frank, _ = o_dt.aflops[-1]() + logger.info( + f"{now} | " + f"MnRtt/Rnk: {o_dt.min_elapsed} | " + f"MxRtt/Rnk: {o_dt.max_elapsed} | " + f"MnPwr/Rnk: {o_dt.min_power} | " + f"MxPwr/Rnk: {o_dt.max_power} | " + f"MnTmp/Rnk: {o_dt.min_temp} | " + f"MxTmp/Rnk: {o_dt.max_temp} | " + f"MnUtl/Rnk: {o_dt.min_util} | " + f"MxUtl/Rnk: {o_dt.max_util} | " + f"MnClk/Rnk: {o_dt.min_clock} | " + f"MxClk/Rnk: {o_dt.max_clock} | " + f"MnDRtt/Rnk: {o_dt.min_btime} | " + f"MxDRtt/Rnk: {o_dt.max_btime} | " + f"MnEtpt/Rnk: {min_flops:.2f}TF/{min_frank} | " + f"MxEtpt/Rnk: {max_flops:.2f}TF/{max_frank}" + ) + if self.mmcnt > 1 and self.mmcnt < self.world: + line = f"^^^^ Bottom {self.mmcnt} Ranks with lowest Etpt(TF):" + for i in range(self.mmcnt): + line += f" {o_dt.aflops[i]}," + logger.info(line) + line = f"^^^^ Top {self.mmcnt} Ranks with highest Etpt(TF):" + shift = self.world - self.mmcnt + for i in range(self.mmcnt): + line += f" {o_dt.aflops[i+shift]}," + logger.info(line) + ret = True + + # Check/Communicate if tracking is turned off or on + self._check_toggle() + return ret + + def _check_toggle(self) -> None: + """Helper method to check if a request to toggle the collection state was made + + It checks iof collection state toggle req was made via the server listening on + rank-0 since last call to report(). Called by report(). Calling this method + indirectly from report() is the only way to activate the change that is made + via rank-0 + """ + # If no change just commnunicate the current + off = self._off + if self.rank == 0 and self.toggle: + off = not self._off + self.toggle = False + st = torch.tensor(off, dtype=torch.bool, device=self.dev) + torch.distributed.broadcast(st, 0) # Blocking + # save old switch + off = self._off + self._off = bool(st.item()) + if off != self._off: + if not self._off: + self.start = self.start_method + self.stop = self.stop_method + state = "ON" + else: + self.start = self.null_method + self.stop = self.null_method + state = "OFF" + if self.rank == 0: + logger.info(f"Toggling StragglerDetector State {state}") + + def _handler(self) -> None: + """Thread function for the controller. + + It is a tcp-server that listens on a port. Uses HTTP protocol. + If connected to it using curl, it indicates a toggle of the + collection state. The actual toggling happens at the end of + calling report() when _check_toggle() is called. + """ + resp = f"HTTP/1.0 200 OK\r\nConnection: Close\r\nContent-length: " + + if self.rank == 0: + state = "OFF" if self._off else "ON" + logger.info( + f"Controller ready to recv " f"commands on port {self.port}. Current state {state}" + ) + while True and self.sock is not None: + try: + conn, _ = self.sock.accept() + _ = conn.recv(1024) + self.toggle = True + state = "ON" if self._off else "OFF" + msg = f"Will turn StragglerDetector {state} at next logging interval" + msg_len = len(msg) + final_resp = f"{resp}{msg_len}\r\n\r\n{msg}" + conn.send(final_resp.encode()) + conn.close() + logger.info(msg) + except Exception as err: + logger.error(f"Error in stragler handler.. {str(err)}") + return + + def _controller(self): + """Installs a controller listener that is used to toggle collection state. + + Called from configure(). Ignored for all ranks other than rank-0 + """ + try: + if self.rank == 0: + neth = "0.0.0.0" + netp = self.port + self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self.sock.bind((neth, netp)) + self.sock.listen(128) + self.ctrlr = threading.Thread( + target=self._handler, args=(), name="straggler", daemon=True + ) + self.ctrlr.start() + except Exception as err: + logger.warning(f"StragglerDetector cannot be controlled.. {str(err)}") + + def _min_max( + self, + ptime: float, + btime: float, + temp: float, + power: float, + util: float, + clock: float, + flops: float, + ) -> Union[_StragglerData, None]: + """Helper function to find the min/max values + + Args: + ptime (float): avg per iteration gpu time + btime (float): avg per iteration cpu time + temp (float): gpu temp at the time of reporting + power (float): gpu power at the time of reporting + util (float): gpu util at the time of reporting + clock (float): gpu clock at the time of reporting + flops (float): estimated flops for the rank + + Returns: + Union[_StragglerData, None]: It contains the min/max of few metrics and the + corresponding rank it also has sorted list of + all (flops, rank) sorted by flops (aflops) + or returns None if collecton is disabled + """ + if self._off: + return None + # initialize output data object + o_dt = _StragglerData() + + prof_data: Dict[str, Union[int, float]] = {} + data_list: List[Dict[str, Union[int, float]]] = [] + prof_data["rank"] = self.rank + prof_data["time"] = ptime + prof_data["btime"] = btime + prof_data["temp"] = temp + prof_data["power"] = power + prof_data["util"] = util + prof_data["clock"] = clock + prof_data["flops"] = flops + + if self.rank == 0: + data_list = [prof_data] * self.world + + # this is blocking by default + torch.distributed.gather_object(prof_data, object_gather_list=data_list, dst=0) + + if self.rank == 0: + min_ctime = min(data_list, key=lambda k: k["time"]) # elapsed + max_ctime = max(data_list, key=lambda k: k["time"]) # elapsed + + min_cbatch = min(data_list, key=lambda k: k["btime"]) # batch time + max_cbatch = max(data_list, key=lambda k: k["btime"]) # batch time + + min_ctemp = min(data_list, key=lambda k: k["temp"]) # temp + max_ctemp = max(data_list, key=lambda k: k["temp"]) # temp + + min_cpower = min(data_list, key=lambda k: k["power"]) # power + max_cpower = max(data_list, key=lambda k: k["power"]) # power + + min_cutil = min(data_list, key=lambda k: k["util"]) # gpu util + max_cutil = max(data_list, key=lambda k: k["util"]) # gpu util + + min_cclock = min(data_list, key=lambda k: k["clock"]) # gpu clock + max_cclock = max(data_list, key=lambda k: k["clock"]) # gpu clock + + min_val = min_ctime["time"] + min_rank = min_ctime["rank"] + max_val = max_ctime["time"] + max_rank = max_ctime["rank"] + o_dt.min_elapsed = _ValueWithRank(min_val, int(min_rank), "ms") + o_dt.max_elapsed = _ValueWithRank(max_val, int(max_rank), "ms") + + min_val = min_cbatch["btime"] + min_rank = min_cbatch["rank"] + max_val = max_cbatch["btime"] + max_rank = max_cbatch["rank"] + o_dt.min_btime = _ValueWithRank(min_val, int(min_rank), "ms") + o_dt.max_btime = _ValueWithRank(max_val, int(max_rank), "ms") + + min_val = min_ctemp["temp"] + min_rank = min_ctemp["rank"] + max_val = max_ctemp["temp"] + max_rank = max_ctemp["rank"] + o_dt.min_temp = _ValueWithRank(min_val, int(min_rank), "C") + o_dt.max_temp = _ValueWithRank(max_val, int(max_rank), "C") + + min_val = min_cpower["power"] + min_rank = min_cpower["rank"] + max_val = max_cpower["power"] + max_rank = max_cpower["rank"] + o_dt.min_power = _ValueWithRank(min_val, int(min_rank), "W") + o_dt.max_power = _ValueWithRank(max_val, int(max_rank), "W") + + min_val = min_cutil["util"] + min_rank = min_cutil["rank"] + max_val = max_cutil["util"] + max_rank = max_cutil["rank"] + o_dt.min_util = _ValueWithRank(min_val, int(min_rank), "%") + o_dt.max_util = _ValueWithRank(max_val, int(max_rank), "%") + + min_val = min_cclock["clock"] + min_rank = min_cclock["rank"] + max_val = max_cclock["clock"] + max_rank = max_cclock["rank"] + o_dt.min_clock = _ValueWithRank(min_val, int(min_rank), "MHz") + o_dt.max_clock = _ValueWithRank(max_val, int(max_rank), "MHz") + + o_dt.aflops = [ + _ValueWithRank(d.get("flops", 0.0), int(d.get("rank", -1))) + for _, d in enumerate(data_list) + ] + o_dt.aflops.sort(key=lambda val_with_rank: val_with_rank()[0]) + # wait for everyone here + torch.distributed.barrier() + + return o_dt + + @property + def enabled(self) -> bool: + """Can be called to check the enabled state of the instance + + Note: + After the request to toggle the state, the + actual state change happens at end of call + to report() + """ + return not self._off + + @property + def configured(self) -> bool: + """Can be called to check if the the instance is already configured + + Returns: + bool: returns True if configure was called and was a success, else False + """ + return StragglerDetector._configured + + @property + def my_rank(self): + """Can be called to get configured rank of this instance + + Returns: + int: Configured rank for this instance + """ + return self.rank + + @property + def world_size(self) -> int: + """Can be called to get configured world of this instance + + Returns: + int: World size configured for this instance + """ + return self.world + + def null_method(self) -> None: + """Default method to initialize start/stop method ptrs""" + pass + + def __enter__(self) -> "StragglerDetector": + """Define context/instance entry + + Returns: + StragglerDetector: the instance + """ + self.start() + return self + + def __call__(self, bdata: bool = False) -> "StragglerDetector": + """Callable for the instance. Set context state, + + Useful when the context is used for cpu timers only when bdata=True + + Args: + bdata (bool, optional): when true, only enables cpu timers. Defaults to False. + + Returns: + StragglerDetector: the instance + """ + self.bdata = bdata + return self + + def __exit__( + self, + ex_type: Optional[Type[BaseException]], + ex_val: Optional[BaseException], + ex_tb: Optional[TracebackType], + ) -> bool: + """Define context/instance exit, calls the stop method + + Args: + ex_type (Optional[Type[BaseException]]): Exception type + ex_val (Optional[BaseException]): _description_ + ex_tb (Optional[TracebackType]): _description_ + + Returns: + bool: True if the exception was handled + """ + # Should not suppress errors even if turned off + if ex_type is not None: + err = traceback.format_exception(ex_type, ex_val, ex_tb) + logger.warning(f"{str(ex_val)}\n{err}") + self.stop() + return False + + +# Singleton, global visibility +__straggler__ = StragglerDetector() +"""StragglerDetector: private module variable, not be directly accessed +""" diff --git a/megatron/inference/__init__.py b/megatron/inference/__init__.py new file mode 100644 index 0000000..f801100 --- /dev/null +++ b/megatron/inference/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/inference/arguments.py b/megatron/inference/arguments.py new file mode 100644 index 0000000..7fcd7a7 --- /dev/null +++ b/megatron/inference/arguments.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + + +def add_modelopt_args(parser): + """Add additional arguments for using TensorRT Model Optimizer (modelopt) features.""" + group = parser.add_argument_group(title="modelopt-generic") + + group.add_argument( + "--export-legacy-megatron", + action="store_true", + help="Export a legacy megatron-lm checkpoint.", + ) + group.add_argument( + "--export-te-mcore-model", + action="store_true", + help="Export a megatron-core transformer-engine checkpoint.", + ) + group.add_argument( + "--export-quant-cfg", + type=str, + default=None, + choices=["int8", "int8_sq", "fp8", "int4_awq", "w4a8_awq", "int4", "None"], + help="Specify a quantization config from the supported choices.", + ) + + return parser diff --git a/megatron/inference/checkpointing.py b/megatron/inference/checkpointing.py new file mode 100644 index 0000000..f8d3e2d --- /dev/null +++ b/megatron/inference/checkpointing.py @@ -0,0 +1,135 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import os +from pathlib import Path +from typing import Optional, Dict + +from megatron.core import dist_checkpointing +from megatron.training import get_args +from megatron.training.checkpointing import _load_base_checkpoint, load_checkpoint +from megatron.training.utils import print_rank_0, unwrap_model + +try: + from modelopt.torch.opt.plugins import ( + get_sharded_modelopt_state, + restore_modelopt_state_metadata, + ) +except ImportError as e: + raise ImportError("Required `\"nvidia-modelopt[torch]\"` is not installed!") from e + + +def load_modelopt_state(load_dir: Optional[str] = None) -> Dict: + """Loading modelopt_state without a model. + + If --use-dist-ckpt, we try to load from the sharded modelopt_state. This will not load the model + state_dict. Otherwise, if the checkpoint is not sharded, we load the base checkpoint (that + contains the model state as well) and extract the modelopt_state. + + Args: + load_dir: optionally provide a different loading path + """ + args = get_args() + + if load_dir is None: + load_dir = args.load + + if args.use_dist_ckpt: + # Read the tracker file and set the iteration. + tracker_filename = os.path.join(load_dir, 'latest_checkpointed_iteration.txt') + # If no tracker file, assuming that it is a .nemo checkpoint. + if not os.path.isfile(tracker_filename): + sharded_load_dir = Path(load_dir) / "model_weights" + else: + with open(tracker_filename, 'r') as f: + metastring = f.read().strip() + try: + iteration = int(metastring) + sharded_load_dir = Path(load_dir) / 'iter_{:07d}'.format(iteration) + except ValueError: + sharded_load_dir = Path(load_dir) / metastring + modelopt_state_dir = sharded_load_dir / "modelopt_state" + if modelopt_state_dir.exists(): + print_rank_0("Loading sharded modelopt_state ({})".format(modelopt_state_dir)) + modelopt_state = restore_modelopt_state_metadata( + dist_checkpointing.load( + get_sharded_modelopt_state(args.num_layers), modelopt_state_dir, + ) + ) + return modelopt_state + else: + print_rank_0( + "sharded modelopt_state ({}) does not exist!".format(modelopt_state_dir) + ) + return {} + else: + print_rank_0("Loading modelopt_state from base checkpoint ({})".format(load_dir)) + try: + state_dict, _, _ = _load_base_checkpoint(args.load, rank0=False) + except Exception: + print_rank_0("Failed to load base checkpoint via megatron _load_base_checkpoint!") + return {} + if state_dict is None: + return {} + return state_dict.get("modelopt_state", {}) + + +def load_modelopt_checkpoint( + model, + optimizer=None, + opt_param_scheduler=None, + strict: bool = True, + additional_sharded_prefix: str = "model.", + load_arg: str = "load", +) -> None: + """Load a sharded (untar .nemo or megatron --use-dist-ckpt) or unsharded checkpoint. + + Essentially, the function is detecting whether the checkpoint is a .nemo sharded checkpoint. + If so, we load the sharded state_dict with additional_sharded_prefix `model.`. + This additional prefix is tha artifact of the lightning module wrapper. Once the sharded + state_dict is loaded, we use a state_dict pre_hook to pop this additional prefix (`model.`) + from all state_dict keys. + + If this is not a .nemo sharded checkpoint, then this function will simply call + load_checkpoint. See megatron.checkpointing.load_checkpoint for explanation. + + Args: + additional_sharded_prefix: append additional prefix to align the sharded checkpoint keys. + When loading an .nemo sharded checkpoint, this is usually `model.`. Otherwise, this is + typically an empty string. + """ + + def _remove_prefix_state_dict_pre_hook( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs, + ): + """Pytorch state_dict pre_hook to remove prefix of the state_dict keys.""" + if additional_sharded_prefix is None: + return + key_rewrite_list = [] + for key, _ in state_dict.items(): + if key.startswith(additional_sharded_prefix): + key_rewrite_list.append(key) + for old_key in key_rewrite_list: + new_key = old_key[len(additional_sharded_prefix) :] + state_dict[new_key] = state_dict.pop(old_key) + + args = get_args() + load_dir = getattr(args, load_arg) + + sharded_load_dir = Path(load_dir) / "model_weights" + + if sharded_load_dir.exists() and optimizer is None and opt_param_scheduler is None: + unwrapped_model = unwrap_model(model) + # Set this attribute will alter the sharded_offsets of transformer_block. + unwrapped_model[0].decoder.config.non_homogeneous_layers = False + sharded_state_dict = unwrapped_model[0].sharded_state_dict(prefix=additional_sharded_prefix) + if additional_sharded_prefix: + unwrapped_model[0]._register_load_state_dict_pre_hook( + _remove_prefix_state_dict_pre_hook + ) + unwrapped_model[0].load_state_dict( + dist_checkpointing.load(sharded_state_dict, sharded_load_dir) + ) + # Set the attribute to True such that by-default we are storing the heterogenous arch. + unwrapped_model[0].decoder.config.non_homogeneous_layers = True + else: + _ = load_checkpoint(model, optimizer, opt_param_scheduler, strict=strict, load_arg=load_arg) diff --git a/megatron/inference/gpt/__init__.py b/megatron/inference/gpt/__init__.py new file mode 100644 index 0000000..f801100 --- /dev/null +++ b/megatron/inference/gpt/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/inference/gpt/model_provider.py b/megatron/inference/gpt/model_provider.py new file mode 100644 index 0000000..376bfa1 --- /dev/null +++ b/megatron/inference/gpt/model_provider.py @@ -0,0 +1,89 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""ModelOpt GPT model provider.""" + +import modelopt.torch.opt as mto + +from megatron.training import get_args, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.inference.ammo_support.gpt.model_specs import get_gpt_layer_ammo_spec +from megatron.core.inference.ammo_support.gpt.state_dict_hooks import ( + mcore_gpt_load_classic_state_dict_pre_hook, + mcore_gpt_load_te_state_dict_pre_hook, +) +from megatron.core.models.gpt import GPTModel as MCoreGPTModel +from megatron.core.parallel_state import get_tensor_model_parallel_rank +from megatron.core.transformer.spec_utils import import_module +from megatron.inference.checkpointing import load_modelopt_state +from megatron.training import get_args, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args + + +def model_provider(pre_process=True, post_process=True, parallel_output=True) -> MCoreGPTModel: + """Builds the model. + + If you set the use_legacy_models to True, it will return the legacy GPT model and if not the core GPT model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + parallel_output (bool): whether to allgather the output logits? This must be + True if `model_provider` is called in text_generation_server. + + Returns: + MCoreGPTModel: The returned model + """ + args = get_args() + + print_rank_0("building GPT model ...") + + # ModelOpt by default assumes none homogenous layers. This affect the storage format of the sharded checkpoint. + config = core_transformer_config_from_args(args) + config.non_homogeneous_layers = True + + if args.use_legacy_models: + raise ValueError( + "ModelOpt integration only support MCore models. Use --use-mcore-modules instead." + ) + + if args.spec is not None: + transformer_layer_spec = import_module(args.spec) + else: + transformer_layer_spec = get_gpt_layer_modelopt_spec( + remap_te_layernorm=args.export_te_mcore_model, qk_layernorm=False, + ) + + model_type = MCoreGPTModel + model_kwargs = { + "config": config, + "transformer_layer_spec": transformer_layer_spec, + "vocab_size": args.padded_vocab_size, + "max_sequence_length": args.max_position_embeddings, + "pre_process": pre_process, + "post_process": post_process, + "fp16_lm_cross_entropy": args.fp16_lm_cross_entropy, + "parallel_output": parallel_output, + "share_embeddings_and_output_weights": not args.untie_embeddings_and_output_weights, + "position_embedding_type": args.position_embedding_type, + "rotary_percent": args.rotary_percent, + } + + model = model_type(**model_kwargs) + + # Load modelopt_state + modelopt_state = load_modelopt_state() if args.load else {} + if modelopt_state: + model = mto.restore_from_modelopt_state(model, modelopt_state) + + # Register some load_state_dict prehooks to handle some known state_dict key mismatch. + # (legacy <-> modelopt) and (default te <-> modelopt) + if args.export_legacy_megatron: + model._register_load_state_dict_pre_hook(mcore_gpt_load_legacy_state_dict_pre_hook) + if args.export_te_mcore_model: + model._register_load_state_dict_pre_hook(mcore_gpt_load_te_state_dict_pre_hook) + + # Print models on all pp ranks. + if get_tensor_model_parallel_rank() == 0: + print(str(model)) + + return model diff --git a/megatron/inference/static/index.html b/megatron/inference/static/index.html new file mode 100644 index 0000000..8062879 --- /dev/null +++ b/megatron/inference/static/index.html @@ -0,0 +1,124 @@ + + + + + + + +Megatron + + + +
+

Prompt Megatron

+ + + + + +
+0 +/ 1000 +
+ +
+ + + + + diff --git a/megatron/inference/text_generation/__init__.py b/megatron/inference/text_generation/__init__.py new file mode 100644 index 0000000..77da7be --- /dev/null +++ b/megatron/inference/text_generation/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + + +from .api import ( + generate, + generate_and_post_process, + beam_search_and_post_process) diff --git a/megatron/inference/text_generation/api.py b/megatron/inference/text_generation/api.py new file mode 100644 index 0000000..4015ac5 --- /dev/null +++ b/megatron/inference/text_generation/api.py @@ -0,0 +1,213 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Inference API.""" + + +import torch + +from megatron.core import mpu +from .communication import broadcast_float_list +from .generation import ( + generate_tokens_probs_and_return_on_first_stage, + score_and_return_on_first_stage, + beam_search_and_return_on_first_stage) +from .tokenization import ( + tokenize_prompts, + detokenize_generations) +from .forward_step import ForwardStep + +def generate_and_post_process(model, + forward_step=ForwardStep, + prompts=None, + tokens_to_generate=0, + return_output_log_probs=False, + top_k_sampling=0, + top_p_sampling=0.0, + top_p_decay=0.0, + top_p_bound=0.0, + temperature=1.0, + add_BOS=False, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False, + prevent_newline_after_colon=False, + random_seed=-1, + return_logits=False): + """Run inference and post-process outputs, i.e., detokenize, + move to cpu and convert to list.""" + + # Main inference. + tokens, lengths, output_log_probs, logits = generate( + model, + forward_step=forward_step, + prompts=prompts, + tokens_to_generate=tokens_to_generate, + return_output_log_probs=return_output_log_probs, + top_k_sampling=top_k_sampling, + top_p_sampling=top_p_sampling, + top_p_decay=top_p_decay, + top_p_bound=top_p_bound, + temperature=temperature, + add_BOS=add_BOS, + use_eod_token_for_early_termination=use_eod_token_for_early_termination, + stop_on_double_eol=stop_on_double_eol, + stop_on_eol=stop_on_eol, + prevent_newline_after_colon=prevent_newline_after_colon, + random_seed=random_seed) + + # Only post-process on first stage. + if mpu.is_pipeline_first_stage(): + tokens, prompts_plus_generations, prompts_plus_generations_segments = \ + detokenize_generations(tokens, lengths, True) + + if return_output_log_probs: + output_log_probs = output_log_probs.cpu().numpy().tolist() + for i, (prob, seg) in enumerate(zip(output_log_probs, prompts_plus_generations_segments)): + output_log_probs[i] = prob[:len(seg)-1] + + if return_logits: + assert(tokens_to_generate == 0) + assert(mpu.get_pipeline_model_parallel_world_size() == 1) + return prompts_plus_generations, prompts_plus_generations_segments, \ + output_log_probs, tokens, logits + else: + return prompts_plus_generations, prompts_plus_generations_segments, \ + output_log_probs, tokens + + return None + +def generate(model, + forward_step=None, + prompts=None, + tokens_to_generate=0, + return_output_log_probs=False, + top_k_sampling=0, + top_p_sampling=0.0, + top_p_decay=0.0, + top_p_bound=0.0, + temperature=1.0, + add_BOS=False, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False, + prevent_newline_after_colon=False, + random_seed=-1): + """Given prompts and input parameters, run inference and return: + tokens: prompts plus the generated tokens. + lengths: length of the prompt + generations. Note that we can + discard tokens in the tokens tensor that are after the + corresponding length. + output_log_probs: log probs of the tokens. + """ + + # Make sure input params are avaialble to all ranks. + values = [tokens_to_generate, + return_output_log_probs, + top_k_sampling, top_p_sampling, top_p_decay, top_p_bound, + temperature, add_BOS, use_eod_token_for_early_termination, + stop_on_double_eol, + stop_on_eol, + prevent_newline_after_colon, + random_seed] + values_float_tensor = broadcast_float_list(len(values), float_list=values) + tokens_to_generate = int(values_float_tensor[0].item()) + return_output_log_probs = bool(values_float_tensor[1].item()) + top_k_sampling = int(values_float_tensor[2].item()) + top_p_sampling = values_float_tensor[3].item() + top_p_decay = values_float_tensor[4].item() + top_p_bound = values_float_tensor[5].item() + temperature = values_float_tensor[6].item() + add_BOS = bool(values_float_tensor[7].item()) + use_eod_token_for_early_termination = bool(values_float_tensor[8].item()) + stop_on_double_eol = bool(values_float_tensor[9].item()) + stop_on_eol = bool(values_float_tensor[10].item()) + prevent_newline_after_colon = bool(values_float_tensor[11].item()) + random_seed = int(values_float_tensor[12].item()) + + if random_seed != -1: + torch.random.manual_seed(random_seed) + + # Tokenize prompts and get the batch. + # Note that these tensors are broadcaseted to all ranks. + if torch.distributed.get_rank() == 0: + assert prompts is not None + + context_tokens_tensor, context_length_tensor = tokenize_prompts( + prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS) + + if tokens_to_generate == 0: + return score_and_return_on_first_stage( + model, context_tokens_tensor, context_length_tensor) + + # Main inference function. + # Note that the outputs are available on the first stage. + return generate_tokens_probs_and_return_on_first_stage( + model, forward_step, context_tokens_tensor, context_length_tensor, + return_output_log_probs=return_output_log_probs, + top_k=top_k_sampling, + top_p=top_p_sampling, + top_p_decay=top_p_decay, + top_p_bound=top_p_bound, + temperature=temperature, + use_eod_token_for_early_termination=use_eod_token_for_early_termination, + stop_on_double_eol=stop_on_double_eol, + stop_on_eol=stop_on_eol, + prevent_newline_after_colon=prevent_newline_after_colon) + +def beam_search_and_post_process(model, + forward_step=ForwardStep, + prompts=None, + tokens_to_generate=0, + beam_size=0, + add_BOS=False, + stop_token=50256, + num_return_gen=1, + length_penalty=1, + prevent_newline_after_colon=False): + """Run beam search and post-process outputs, i.e., detokenize, + move to cpu and convert to list.""" + + # Main inference. + tokens, scores = beam_search(model, + forward_step=forward_step, + prompts=prompts, + tokens_to_generate=tokens_to_generate, + beam_size=beam_size, + add_BOS=add_BOS, + stop_token=stop_token, + num_return_gen=num_return_gen, + length_penalty=length_penalty, + prevent_newline_after_colon=prevent_newline_after_colon) + # Only post-process on first stage. + if mpu.is_pipeline_first_stage(): + lengths = tokens.size(1)*torch.ones(beam_size, dtype=torch.int64, device=torch.cuda.current_device()) + tokens, prompts_plus_generations, prompts_plus_generations_segments = detokenize_generations(tokens, lengths, True) + scores = scores.cpu().numpy().tolist() + return prompts_plus_generations, prompts_plus_generations_segments, scores + + return None + +def beam_search(model, forward_step, prompts=None, tokens_to_generate=0, beam_size=0, add_BOS=False, stop_token=50256, num_return_gen=1, length_penalty=1, prevent_newline_after_colon=False): + # Make sure input params are avaialble to all ranks. + values = [tokens_to_generate, + beam_size, + add_BOS, + stop_token, + num_return_gen, + length_penalty, + prevent_newline_after_colon] + values_float_tensor = broadcast_float_list(len(values), float_list=values) + tokens_to_generate = int(values_float_tensor[0].item()) + beam_size = int(values_float_tensor[1].item()) + add_BOS = bool(values_float_tensor[2].item()) + stop_token = int(values_float_tensor[3].item()) + num_return_gen = int(values_float_tensor[4].item()) + length_penalty = values_float_tensor[5].item() + prevent_newline_after_colon = values_float_tensor[6].item() + + context_tokens_tensor, context_length_tensor = tokenize_prompts( + prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS) + + return beam_search_and_return_on_first_stage(model, forward_step, context_tokens_tensor, context_length_tensor, + beam_size, stop_token=stop_token, num_return_gen=num_return_gen, length_penalty=length_penalty, + prevent_newline_after_colon=prevent_newline_after_colon) diff --git a/megatron/inference/text_generation/beam_utils.py b/megatron/inference/text_generation/beam_utils.py new file mode 100644 index 0000000..911a641 --- /dev/null +++ b/megatron/inference/text_generation/beam_utils.py @@ -0,0 +1,64 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +## from huggingface beam search +class BeamHypotheses(object): + def __init__(self, num_beams, length_penalty=1.0, early_stopping=False): + """ + Initialize n-best list of hypotheses. + """ + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp, sum_logprobs, length): + """ + Add a new hypothesis to the list. + """ + score = sum_logprobs / length ** self.length_penalty + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp)) + if len(self) > self.num_beams: + sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) + del self.beams[sorted_scores[0][1]] + self.worst_score = sorted_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs, cur_len): + """ + If there are enough hypotheses and that none of the hypotheses being generated + can become better than the worst one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + cur_score = best_sum_logprobs / cur_len ** self.length_penalty + ret = self.worst_score >= cur_score + return ret + diff --git a/megatron/inference/text_generation/communication.py b/megatron/inference/text_generation/communication.py new file mode 100644 index 0000000..dee3207 --- /dev/null +++ b/megatron/inference/text_generation/communication.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Communications utilities.""" + + +import torch + +from megatron.core import mpu + + + +# TODO: use functions from megatron/p2p +def recv_from_prev_pipeline_rank_(recv_buffer=None): + """Receive from previous pipeline stage and update the + input buffer inplace.""" + if not mpu.is_pipeline_first_stage(): + assert recv_buffer is not None + recv_prev_op = torch.distributed.P2POp( + torch.distributed.irecv, recv_buffer, + mpu.get_pipeline_model_parallel_prev_rank()) + reqs = torch.distributed.batch_isend_irecv([recv_prev_op]) + for req in reqs: + req.wait() + # To protect against race condition when using batch_isend_irecv(). + torch.cuda.synchronize() + + + +# TODO: use functions from megatron/p2p +def send_to_next_pipeline_rank(tensor=None): + """Send output to the next pipeline stage.""" + if not mpu.is_pipeline_last_stage(): + assert tensor is not None + send_next_op = torch.distributed.P2POp( + torch.distributed.isend, tensor, + mpu.get_pipeline_model_parallel_next_rank()) + reqs = torch.distributed.batch_isend_irecv([send_next_op]) + for req in reqs: + req.wait() + # To protect against race condition when using batch_isend_irecv(). + torch.cuda.synchronize() + + + +def _is_cuda(tensor): + """Check if a tensor is not none and is cuda.""" + assert tensor is not None + assert tensor.is_cuda + + + +def _is_cuda_contiguous(tensor): + """Check if a tensor is not none, is cuda, and is contiguous.""" + _is_cuda(tensor) + assert tensor.is_contiguous() + + + +def broadcast_from_last_pipeline_stage(size, dtype, tensor=None): + """Broadcast a tensor from last pipeline stage to all ranks.""" + + is_last_stage = mpu.is_pipeline_last_stage() + # If first stage and last state are the same, then there is no + # pipeline parallelism and no need to communicate. + if mpu.is_pipeline_first_stage() and is_last_stage: + return tensor + + if is_last_stage: + _is_cuda_contiguous(tensor) + else: + tensor = torch.empty(size, + dtype=dtype, + device=torch.cuda.current_device()) + # Get the group and corresponding source rank. + src = mpu.get_pipeline_model_parallel_last_rank() + group = mpu.get_pipeline_model_parallel_group() + torch.distributed.broadcast(tensor, src, group) + + return tensor + + + +def broadcast_from_last_to_first_pipeline_stage(size, dtype, tensor=None): + """Broadcast tensor values from last stage into the first stage.""" + + is_last_stage = mpu.is_pipeline_last_stage() + is_first_stage = mpu.is_pipeline_first_stage() + # If first stage and last state are the same, then there is no + # pipeline parallelism and no need to communicate. + if is_first_stage and is_last_stage: + return tensor + # Only first and last stage pipeline stages need to be involved. + if is_last_stage or is_first_stage: + if is_last_stage: + _is_cuda_contiguous(tensor) + else: + tensor = torch.empty(size, + dtype=dtype, + device=torch.cuda.current_device()) + src = mpu.get_pipeline_model_parallel_last_rank() + group = mpu.get_embedding_group() + # Broadcast from last stage into the first stage. + torch.distributed.broadcast(tensor, src, group) + else: + tensor = None + + return tensor + + + +def copy_from_last_to_first_pipeline_stage(size, dtype, tensor=None): + """Copy tensor values from last stage into the first stage. + Note that the input tensor is updated in place.""" + + is_last_stage = mpu.is_pipeline_last_stage() + is_first_stage = mpu.is_pipeline_first_stage() + # If first stage and last state are the same, then there is no + # pipeline parallelism and no need to communicate. + if is_first_stage and is_last_stage: + return + # Only first and last stage pipeline stages need to be involved. + if is_last_stage or is_first_stage: + _is_cuda(tensor) + is_contiguous = tensor.is_contiguous() + src = mpu.get_pipeline_model_parallel_last_rank() + group = mpu.get_embedding_group() + if is_contiguous: + tensor_ = tensor + else: + if is_last_stage: + tensor_ = tensor.contiguous() + else: + tensor_ = torch.empty(size, + dtype=dtype, + device=torch.cuda.current_device()) + # Broadcast from last stage into the first stage. + torch.distributed.broadcast(tensor_, src, group) + # Update the first stage tensor + if is_first_stage and not is_contiguous: + tensor[...] = tensor_ + + + +def broadcast_tensor(size, dtype, tensor=None, rank=0): + """ Given size and type of a tensor on all ranks and the tensor value + only on a specific rank, broadcast from that rank to all other ranks. + """ + + if torch.distributed.get_rank() == rank: + _is_cuda_contiguous(tensor) + else: + tensor = torch.empty(size, + dtype=dtype, + device=torch.cuda.current_device()) + + torch.distributed.broadcast(tensor, rank) + + return tensor + + + +def broadcast_list(size, dtype, list_values=None, rank=0): + """Broadcast a list of values with a given type.""" + + tensor = None + if torch.distributed.get_rank() == rank: + tensor = torch.tensor(list_values, dtype=dtype, + device=torch.cuda.current_device()) + + return broadcast_tensor(size, dtype, tensor=tensor, rank=rank) + + + +def broadcast_int_list(size, int_list=None, rank=0): + """Broadcast a list of interger values.""" + + return broadcast_list(size, torch.int64, list_values=int_list, rank=rank) + + + +def broadcast_float_list(size, float_list=None, rank=0): + """Broadcast a list of float values.""" + + return broadcast_list(size, torch.float32, list_values=float_list, + rank=rank) diff --git a/megatron/inference/text_generation/forward_step.py b/megatron/inference/text_generation/forward_step.py new file mode 100644 index 0000000..4d4878d --- /dev/null +++ b/megatron/inference/text_generation/forward_step.py @@ -0,0 +1,164 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Forward step utilities.""" + +from collections.abc import Iterable + +import torch + +from megatron.training import get_args +from megatron.core import mpu, InferenceParams +from .communication import ( + send_to_next_pipeline_rank, + recv_from_prev_pipeline_rank_) + + +class ForwardStep: + """Forward step function with all the communications. + We use a class here to hide the inference parameters + from the outside caller.""" + + def __init__(self, model, max_batch_size, max_sequence_length): + """Set values so we don't need to do it multiple times.""" + # Make sure model is in eval mode. + assert not isinstance(model, Iterable), \ + 'interleaving schedule is not supported for inference' + model.eval() + self.model = model + # Initialize inference parameters. + self.inference_params = InferenceParams(max_batch_size, + max_sequence_length) + # Pipelining arguments. + args = get_args() + self.pipeline_size_larger_than_one = ( + args.pipeline_model_parallel_size > 1) + # Threshold of pipelining. + self.pipelining_batch_x_seqlen = \ + args.inference_batch_times_seqlen_threshold + + def _forward(self, tokens, position_ids, attention_mask): + return self.model(tokens, position_ids, attention_mask, inference_params=self.inference_params) + + def __call__(self, tokens, position_ids, attention_mask): + """Invocation of the forward methods. Note that self.inference_params + is being modified by the forward step.""" + # Pipelining case. + if self.pipeline_size_larger_than_one: + current_batch_x_seqlen = tokens.size(0) * tokens.size(1) + if current_batch_x_seqlen >= self.pipelining_batch_x_seqlen: + micro_batch_size = \ + max(1, self.pipelining_batch_x_seqlen // tokens.size(1)) + return self._with_pipelining_forward_step(tokens, + position_ids, + attention_mask, + micro_batch_size) + + return self._no_pipelining_forward_step(tokens, + position_ids, + attention_mask) + + + def _forward_step_helper(self, tokens, position_ids, attention_mask, recv_buffer=None): + """Single forward step. Update the allocate memory flag so + only the first time the memory is allocated.""" + batch_size = tokens.size(0) + sequence_length = tokens.size(1) + if recv_buffer is None: + recv_buffer = _allocate_recv_buffer(batch_size, sequence_length) + + # Receive from previous stage. + recv_from_prev_pipeline_rank_(recv_buffer) + + # Forward pass through the model. + self.model.set_input_tensor(recv_buffer) + output_tensor = self._forward(tokens, position_ids, attention_mask) + + # Send output to the next stage. + send_to_next_pipeline_rank(output_tensor) + + return output_tensor + + + + def _no_pipelining_forward_step(self, tokens, position_ids, attention_mask, + recv_buffer=None): + """If recv_buffer is none, we will allocate one on the fly.""" + # Run a simple forward pass. + output_tensor = self._forward_step_helper(tokens, position_ids, + attention_mask, recv_buffer=recv_buffer) + # Update the sequence length offset. + self.inference_params.sequence_len_offset += tokens.size(1) + + logits = None + if mpu.is_pipeline_last_stage(): + logits = output_tensor + + return logits + + + def _with_pipelining_forward_step(self, tokens, position_ids, attention_mask, micro_batch_size): + """No interleaving is supported.""" + sequence_length = tokens.size(1) + batch_size = tokens.size(0) + + # Divide the batch dimension into micro batches. + num_micro_batches, last_chunk = divmod(batch_size, + micro_batch_size) + if last_chunk > 0: + num_micro_batches += 1 + + # Preallocate memory for output logits. + logits = None + if mpu.is_pipeline_last_stage(): + args = get_args() + logits = torch.empty( + (batch_size, sequence_length, args.padded_vocab_size), + dtype=torch.float32, device=torch.cuda.current_device()) + + # Preallocate recv buffer. + recv_buffer = _allocate_recv_buffer(micro_batch_size, sequence_length) + + for micro_batch_index in range(num_micro_batches): + # Slice among the batch dimenion. + start = micro_batch_index * micro_batch_size + end = min(start + micro_batch_size, batch_size) + this_micro_batch_size = end - start + tokens2use = tokens[start:end, ...] + position_ids2use = position_ids[start:end, ...] + + # Run a simple forward pass. + if this_micro_batch_size != micro_batch_size: + recv_buffer = None + output = self._forward_step_helper(tokens2use, position_ids2use, attention_mask, recv_buffer=recv_buffer) + + # Adjust the batch size offset to account for the micro-batch. + self.inference_params.batch_size_offset += this_micro_batch_size + + # Copy logits. + if mpu.is_pipeline_last_stage(): + logits[start:end, ...] = output + + # Once we are done with all the micro-batches, we can + # adjust the sequence length offset. + self.inference_params.sequence_len_offset += sequence_length + # and reset the batch size offset + self.inference_params.batch_size_offset = 0 + + return logits + + +def _get_recv_buffer_dtype(args): + """Receive happens between the layers.""" + if args.fp32_residual_connection: + return torch.float + return args.params_dtype + +def _allocate_recv_buffer(batch_size, sequence_length): + """Receive happens between the layers with size [s, b, h].""" + if mpu.is_pipeline_first_stage(): + return None + args = get_args() + recv_size = (sequence_length, batch_size, args.hidden_size) + return torch.empty(recv_size, + dtype=_get_recv_buffer_dtype(args), + device=torch.cuda.current_device()) diff --git a/megatron/inference/text_generation/generation.py b/megatron/inference/text_generation/generation.py new file mode 100644 index 0000000..5e4c238 --- /dev/null +++ b/megatron/inference/text_generation/generation.py @@ -0,0 +1,437 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Generation utilities.""" + +import torch +import torch.nn.functional as F + +from megatron.training import get_args, get_tokenizer +from megatron.core import mpu +from megatron.training.utils import get_ltor_masks_and_position_ids +from .communication import ( + copy_from_last_to_first_pipeline_stage, + broadcast_from_last_pipeline_stage, + broadcast_from_last_to_first_pipeline_stage) +from .forward_step import ForwardStep +from .sampling import sample +from .beam_utils import BeamHypotheses + +def score_and_return_on_first_stage(model, tokens, lengths): + """Function for just scoring. + + Args: + model: no interleaving is supported. + tokens: prompt tokens extended to be of size [b, max_prompt_length] + lengths: original prompt length, size: [b] + Note: Outside of model, other parameters only need to be available on + rank 0. + + Returns: + output_log_probs: log probability of the selected tokens. size: [b, s] + """ + + args = get_args() + + batch_size = tokens.size(0) + max_prompt_length = lengths.max().item() + assert max_prompt_length == tokens.size(1) + + if max_prompt_length > args.max_position_embeddings: + raise ValueError("Length of prompt + tokens_to_generate longer than allowed") + + if max_prompt_length * batch_size > args.max_tokens_to_oom: + raise ValueError("Too many tokens. " + str(max_prompt_length*batch_size)+ " is greater than "+str(args.max_tokens_to_oom)) + + # forward step. + forward_step = ForwardStep(model, batch_size, max_prompt_length) + + # =================== + # Pre-allocate memory + # =================== + + # Log probability of the sequence (prompt + generated tokens). + output_log_probs = None + output_log_probs_size = (batch_size, max_prompt_length - 1) + + if mpu.is_pipeline_last_stage(): + output_log_probs = torch.empty(output_log_probs_size, + dtype=torch.float32, + device=torch.cuda.current_device()) + + # ============= + # Run infernece + # ============= + with torch.no_grad(): + attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens) + + # logits will be meanigful only in the last pipeline stage. + logits = forward_step(tokens, position_ids, attention_mask) + + if mpu.is_pipeline_last_stage(): + # Always the last stage should have an output. + assert logits is not None + log_probs = F.log_softmax(logits, dim=2) + + # Pick the tokens that we need to get the log + # probabilities for. Note that next input token is + # the token which we selected in the current logits, + # so shift by 1. + indices = torch.unsqueeze(tokens[:, 1:], 2) + output_log_probs = torch.gather(log_probs, 2, indices).squeeze(2) + + # ====================================== + # Broadcast to the first pipeline stage. + # ====================================== + output_log_probs = broadcast_from_last_to_first_pipeline_stage( + output_log_probs_size, torch.float32, output_log_probs) + + return tokens, lengths, output_log_probs, logits + +def generate_tokens_probs_and_return_on_first_stage( + model, forward_step, tokens, lengths, + return_output_log_probs=False, + top_k=0, top_p=0.0, top_p_decay=0.0, top_p_bound=0.0, + temperature=1.0, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False, + prevent_newline_after_colon=True + ): + """Main token generation function. + + Args: + model: no interleaving is supported. + forward_step (ForwardStep): Class for running the model forward step. + tokens: prompt tokens extended to be of size [b, max-sequence-length] + lengths: original prompt length, size: [b] + return_output_log_probs: flag to calculate the log probability of + the generated tokens. Note that the log probability is the one + from the original logit. + top_k, top_p: top-k and top-p sampling parameters. + Note that top-k = 1 is gready. Also, these paramters are + exclusive meaning that: + if top-k > 0 then we expect top-p=0. + if top-p > 0 then we check for top-k=0. + temperature: sampling temperature. + use_eod_token_for_early_termination: if True, do early termination if + all the sequences have reached this token. + prevent_newline_after_colon: if True, it will disable generating new line \n after : + Note: Outside of model, other parameters only need to be available on + rank 0. + + Returns: Note that is size is adjusted to a lower value than + max-sequence-length if generation is terminated early. + tokens: prompt and generated tokens. size: [b, :] + generated_sequence_lengths: total length (including prompt) of + the generated sequence. size: [b] + output_log_probs: log probability of the selected tokens. size: [b, s] + """ + + args = get_args() + tokenizer = get_tokenizer() + + batch_size = tokens.size(0) + min_prompt_length = lengths.min().item() + max_sequence_length = tokens.size(1) + + if max_sequence_length > args.max_position_embeddings: + raise ValueError("Length of prompt + tokens_to_generate longer than allowed") + + if max_sequence_length * batch_size > args.max_tokens_to_oom: + raise ValueError("Too many tokens. " + str(max_sequence_length*batch_size)+ " is greater than "+str(args.max_tokens_to_oom)) + + # forward step. + forward_step = forward_step(model, batch_size, max_sequence_length) + + # Added termination_id to support the case that we want to terminate the + # generation once that id is generated. + if hasattr(args, 'eos_id'): + termination_id = args.eos_id + elif hasattr(tokenizer, 'eod'): + termination_id = tokenizer.eod + elif hasattr(tokenizer, 'eos_id'): + termination_id = tokenizer.eos_id + else: + raise AttributeError('No eod token found in tokenizer or args') + + # =================== + # Pre-allocate memory + # =================== + + # Log probability of the sequence (prompt + generated tokens). + output_log_probs = None + output_log_probs_size = (batch_size, max_sequence_length - 1) + # Lengths of generated seuquence including including prompts. + generated_sequence_lengths = None + if mpu.is_pipeline_last_stage(): + if return_output_log_probs: + output_log_probs = torch.empty(output_log_probs_size, + dtype=torch.float32, + device=torch.cuda.current_device()) + generated_sequence_lengths = torch.ones( + batch_size, dtype=torch.int64, + device=torch.cuda.current_device()) * max_sequence_length + + # Whether we have reached a termination id. + is_generation_done = torch.zeros(batch_size, dtype=torch.uint8, + device=torch.cuda.current_device()) + + # ============= + # Run infernece + # ============= + + with torch.no_grad(): + attention_mask, position_ids = _build_attention_mask_and_position_ids( + tokens) + prev_context_length = 0 + for context_length in range(min_prompt_length, max_sequence_length): + + # Pick the slice that we need to pass through the network. + tokens2use = tokens[:, prev_context_length:context_length] + positions2use = position_ids[:, prev_context_length:context_length] + attention_mask2use = attention_mask[ + ..., prev_context_length:context_length, :context_length] + + # logits will be meanigful only in the last pipeline stage. + logits = forward_step(tokens2use, positions2use, attention_mask2use) + + if mpu.is_pipeline_last_stage(): + if prevent_newline_after_colon: + logits[tokens2use[:, -1] == tokenizer.tokenize(':')[0], -1, tokenizer.tokenize('\n')[0]] = -1e10 # disable "\n" after ":" + # Always the last stage should have an output. + assert logits is not None + + # Sample. + last_token_logits = logits[:, -1, :] + new_sample = sample(last_token_logits, + top_k=top_k, + top_p=top_p, + temperature=temperature, + vocab_size=tokenizer.vocab_size) + if top_p > 0.0 and top_p_decay > 0.0: + top_p = top_p * top_p_decay + if top_p_bound > 0.0: + top_p = max(top_p, top_p_bound) + + # If a prompt length is smaller or equal th current context + # length, it means we have started generating tokens + started = lengths <= context_length + # Update the tokens. + tokens[started, context_length] = new_sample[started] + + # Calculate the log probabilities. + if return_output_log_probs: + log_probs = F.log_softmax(logits, dim=2) + if return_output_log_probs: + # Pick the tokens that we need to get the log + # probabilities for. Note that next input token is + # the token which we selected in the current logits, + # so shift by 1. + indices = torch.unsqueeze( + tokens[ + :, + (prev_context_length + 1):(context_length + 1)], + 2) + output_log_probs[:, + prev_context_length:context_length] = \ + torch.gather(log_probs, 2, indices).squeeze(2) + + # Update the tokens on the first stage so the next input to + # the network is correct. + copy_from_last_to_first_pipeline_stage(batch_size, torch.int64, + tokens[:, context_length]) + + # Update the context length for the next token generation. + prev_context_length = context_length + + # Check if all the sequences have hit the termination_id. + done = None + if mpu.is_pipeline_last_stage(): + # TODO(rprenger) These stopping methods are tokenizer dependent + # instead tokenization should be in the inference loop so stop sequences can be used + if stop_on_double_eol: + hit_double_eol = (new_sample == 628).byte() & started.byte() + hit_two_eols = (new_sample == 198).byte() & (tokens[:, context_length-1] == 198).byte() & started.byte() + done_token = hit_double_eol | hit_two_eols + elif stop_on_eol: + hit_double_eol = (new_sample == 628).byte() & started.byte() + hit_eol = (new_sample == 198).byte() & started.byte() + done_token = hit_double_eol | hit_eol + else: + done_token = (new_sample == termination_id).byte() & \ + started.byte() + + just_finished = (done_token & ~is_generation_done).bool() + generated_sequence_lengths[just_finished.view(-1)] = \ + context_length + 1 + is_generation_done = is_generation_done | done_token + done = torch.all(is_generation_done) + done = broadcast_from_last_pipeline_stage(1, torch.uint8, + tensor=done) + if use_eod_token_for_early_termination and done: + break + + # =================================================== + # Update the length of based on max generated length. + # =================================================== + + tokens = tokens[:, :(context_length + 1)] + if mpu.is_pipeline_last_stage(): + if return_output_log_probs: + output_log_probs = output_log_probs[:, :context_length] + + # ====================================== + # Broadcast to the first pipeline stage. + # ====================================== + + generated_sequence_lengths = broadcast_from_last_to_first_pipeline_stage( + batch_size, torch.int64, generated_sequence_lengths) + if return_output_log_probs: + output_log_probs_size = (batch_size, context_length) + output_log_probs = broadcast_from_last_to_first_pipeline_stage( + output_log_probs_size, torch.float32, output_log_probs) + + return tokens, generated_sequence_lengths, output_log_probs, None + +def beam_search_and_return_on_first_stage(model, forward_step, tokens, lengths, beam_size, stop_token, num_return_gen, length_penalty, prevent_newline_after_colon=True): + args = get_args() + tokenizer = get_tokenizer() + + batch_size = tokens.size(0) + assert(batch_size == 1) + prompt_length = lengths.item() + final_sequence_length = tokens.size(1) + final_sequence_length = min(final_sequence_length, args.max_position_embeddings) + + # If the context is too big, this happens + if prompt_length >= final_sequence_length: + raise ValueError("context length + tokens_to_generate too large") + + # forward step. + forward_step = forward_step(model, beam_size, final_sequence_length) + + beam_hyp = BeamHypotheses(beam_size, length_penalty) + best_batches = None + done = torch.zeros(1, dtype=torch.uint8, device=torch.cuda.current_device()) + scores = torch.zeros(beam_size, + dtype=torch.float32, + device=torch.cuda.current_device()).unsqueeze(1) + scores_size_tensor, tokens_size_tensor = None, None + # ============= + # Run infernece + # ============= + with torch.no_grad(): + tokens = tokens.repeat(beam_size, 1) + attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens) + prev_context_length = 0 + for context_length in range(prompt_length, final_sequence_length): + + # Pick the slice that we need to pass through the network. + tokens2use = tokens[:, prev_context_length:context_length] + positions2use = position_ids[:, prev_context_length:context_length] + attention_mask2use = attention_mask[ + ..., prev_context_length:context_length, :context_length] + + # logits will be meanigful only in the last pipeline stage. + logits = forward_step(tokens2use, positions2use, attention_mask2use) + + if mpu.is_pipeline_last_stage(): + if prevent_newline_after_colon: + logits[tokens2use[:, -1] == tokenizer.tokenize(':')[0], -1, tokenizer.tokenize('\n')[0]] = -1e10 # disable "\n" after ":" + vocab_size = logits.size(2) + log_probs = F.log_softmax(logits, dim=2) + new_scores = log_probs[:, -1, :] + scores + + if context_length == prompt_length: # if this is the first one + sorted_scores, indices = torch.sort(new_scores[0,:], descending=True) + else: + sorted_scores, indices = torch.sort(new_scores.view(-1), descending=True) + + best_beam_ids = torch.div(indices[: 2 * beam_size], vocab_size).trunc().long() + best_words = indices[:2 * beam_size] % vocab_size + best_scores = sorted_scores[: 2 * beam_size] + + next_beams = [] + for beam_token_rank, (token_id, beam_score, beam_id) in enumerate( + zip(best_words, best_scores, best_beam_ids) + ): + if token_id.item() == stop_token: + # if beam_token does not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= beam_size + if is_beam_token_worse_than_top_num_beams: + continue + beam_hyp.add( + tokens[beam_id].clone(), + beam_score, + context_length + 1 - prompt_length + ) + else: + # add next predicted token since it is not eos_token + next_beams.append((token_id, beam_score, beam_id)) + + if len(next_beams) == beam_size: + break + + if beam_hyp.is_done(best_scores.max().item(), context_length + 1 - prompt_length): + done = torch.ones(1, dtype=torch.uint8, device=torch.cuda.current_device()) + + best_batches = tokens.new([item[2] for item in next_beams]) + tokens = tokens[best_batches,:] + tokens[:, context_length] = tokens.new([item[0] for item in next_beams]) + scores = scores.new([item[1] for item in next_beams]).unsqueeze(1) + + # torch.distributed.barrier() + done = broadcast_from_last_pipeline_stage(1, torch.uint8, done) + if done: + break + + # Update the tokens on the first stage so the next input to + # the network is correct. + copy_from_last_to_first_pipeline_stage(tokens.size(), torch.int64, + tokens) + + # set inference key values to make it consistent with best beam index + best_batches = broadcast_from_last_pipeline_stage(beam_size, torch.int64, best_batches) + forward_step.inference_params.swap_key_value_dict(best_batches) + + # Update the context length for the next token generation. + prev_context_length = context_length + + if mpu.is_pipeline_last_stage(): + # if cannot find stop token, add open beams to hyps + if not done: + for beam_id in range(beam_size): + beam_hyp.add(tokens[beam_id].clone(), scores[beam_id].squeeze(), context_length + 1 - prompt_length) + + # rank based on scores + sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0], reverse=True) + num_return_gen = min(num_return_gen, len(sorted_hyps)) + scores = [sorted_hyps[i][0] for i in range(num_return_gen)] + tokens = [sorted_hyps[i][1] for i in range(num_return_gen)] + scores = torch.stack(scores, dim=0) + tokens = torch.stack(tokens, dim=0) + scores_size_tensor = torch.tensor(scores.shape, dtype=torch.int64, device=torch.cuda.current_device()) + tokens_size_tensor = torch.tensor(tokens.shape, dtype=torch.int64, device=torch.cuda.current_device()) + + scores_size_tensor = broadcast_from_last_pipeline_stage(1, torch.int64, scores_size_tensor) + tokens_size_tensor = broadcast_from_last_pipeline_stage(2, torch.int64, tokens_size_tensor) + + scores = broadcast_from_last_to_first_pipeline_stage(tuple(scores_size_tensor), torch.float32, scores) + tokens = broadcast_from_last_to_first_pipeline_stage(tuple(tokens_size_tensor), torch.int64, tokens) + + return tokens, scores + + +def _build_attention_mask_and_position_ids(tokens): + """Build the attention mask and postition ids for the input tokens.""" + + # Since we are not interested in loss-mask and reset attention/position + # is also False, eod_token is not used so it is safe to set it to None. + attention_mask, _, position_ids = get_ltor_masks_and_position_ids( + data=tokens, + eod_token=None, + reset_position_ids=False, + reset_attention_mask=False, + eod_mask_loss=False) + + return attention_mask, position_ids diff --git a/megatron/inference/text_generation/sampling.py b/megatron/inference/text_generation/sampling.py new file mode 100644 index 0000000..370773a --- /dev/null +++ b/megatron/inference/text_generation/sampling.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Sampling utilities. +Part of this code is inspired by: + - https://github.com/ari-holtzman/degen/blob/master/gen.py + - https://huggingface.co/transformers/_modules/transformers/generation_logits_process.html +""" + + +import torch + + + +def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf.""" + + filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits.masked_fill_(filter_, float('-Inf')) + + + +def modify_logits_for_top_p_filtering(logits, top_p): + """Set the logits for none top-p values to -inf.""" + + # First sort and calculate cumulative sum of probabilities. + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Filteration based on the cumulative sum. + filter_ = cumulative_probs > top_p + # This shift by 1 is weird and I cannot justify it. This existed + # in the original implementation: + # https://github.com/ari-holtzman/degen/blob/master/gen.py + # and I guess it is needed so keeping it for now. + filter_[:, 1:] = filter_[:, :-1].clone() + # Make sure we at least have one token to select from. + filter_[..., 0] = 0 + + # Fill in the filtered part + filter_ = filter_.scatter(1, sorted_indices, filter_) + logits.masked_fill_(filter_, float('-Inf')) + + + +def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None): + """ Sample and generate a token. + Note: logits has the dimension [b, v] where b is the batch size + and v is the vocabulary size. + If vocab_size is provided, we will make sure the sample that is + generated is in [0, vocab-size). This will avoid out of vocabulary + generations due to padding. + """ + + # Check logits for consistency. + assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.' + assert logits.type() == 'torch.cuda.FloatTensor', \ + 'input logits should be floats.' + + + # Greedy is just simple argmax. + if top_k == 1: + assert top_p == 0.0, 'cannot set both greedy and top-p samplings.' + samples = torch.argmax(logits, dim=-1) + + # Top-k or top-p sampling. + else: + # Clone so we do not modify the inputs, + logits = logits.clone() + # Apply temperature in place. + if temperature != 1.0: + logits.div_(temperature) + + if top_k > 1: + assert top_p == 0.0, 'cannot set both top-k and top-p samplings.' + assert top_k <= logits.size(1), 'top-k is larger than logit size.' + if vocab_size: + assert top_k < vocab_size, 'top-k is larger than vocab size.' + modify_logits_for_top_k_filtering(logits, top_k) + + elif top_p > 0.0: + assert top_p <= 1.0, 'top-p should be in (0, 1].' + modify_logits_for_top_p_filtering(logits, top_p) + + # After filtering, we need to recalculate the distribution. + probs = logits.softmax(dim=-1) + samples = torch.multinomial(probs, num_samples=1).view(-1) + + # If vocab size is provided, make sure the samples are in + # in the range [0, vocab-size). + if vocab_size: + samples = torch.clamp(samples, min=0, max=(vocab_size - 1)) + + return samples diff --git a/megatron/inference/text_generation/tokenization.py b/megatron/inference/text_generation/tokenization.py new file mode 100644 index 0000000..78bd303 --- /dev/null +++ b/megatron/inference/text_generation/tokenization.py @@ -0,0 +1,135 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Tokenization utilities.""" + + +import torch + + +from megatron.training import get_args, get_tokenizer +from .communication import broadcast_int_list, broadcast_tensor + + +def detokenize_generations(tokens_gpu_tensor, + lengths_gpu_tensor, + return_segments): + """Detokenize the generated tokens.""" + + args = get_args() + tokenizer = get_tokenizer(args) + prompts_plus_generations = [] + if return_segments: + prompts_plus_generations_segments = [] + + tokens = tokens_gpu_tensor.cpu().numpy().tolist() + lengths = lengths_gpu_tensor.cpu().numpy().tolist() + for sequence_tokens, length in zip(tokens, lengths): + sequence_tokens = sequence_tokens[:length] + prompts_plus_generations.append( + tokenizer.detokenize(sequence_tokens)) + if return_segments: + words = [] + for token in sequence_tokens: + if args.tokenizer_type in ['SentencePieceTokenizer', + 'GPTSentencePieceTokenizer', + 'HuggingFaceTokenizer', + 'Llama2Tokenizer']: + word = tokenizer.decoder[token] + elif args.tokenizer_type in ['Llama3Tokenizer', 'MistralTokenizer']: + word = tokenizer.decode([token]) + elif args.tokenizer_type == 'NullTokenizer': + word = str(token) + else: + word = tokenizer.tokenizer.decoder[token] + word = bytearray( + [tokenizer.tokenizer.byte_decoder[c] for c in word]).decode( + 'utf-8', errors='replace') + words.append(word) + prompts_plus_generations_segments.append(words) + + if return_segments: + return tokens, prompts_plus_generations, \ + prompts_plus_generations_segments + + return tokens, prompts_plus_generations + + +def tokenize_prompts(prompts=None, tokens_to_generate=None, + add_BOS=None, rank=0): + """Tokenize prompts and make them avaiable on all ranks.""" + + # On all ranks set to None so we can pass them to functions + sizes_list = None + prompts_tokens_cuda_long_tensor = None + prompts_length_cuda_long_tensor = None + + # On the specified rank, build the above. + if torch.distributed.get_rank() == rank: + assert prompts is not None + assert tokens_to_generate is not None + # Tensor of tokens padded and their unpadded length. + prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \ + _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS) + # We need the sizes of these tensors for the boradcast + sizes_list = [prompts_tokens_cuda_long_tensor.size(0), # Batch size + prompts_tokens_cuda_long_tensor.size(1)] # Sequence lenght + + # First, broadcast the sizes. + sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank) + + # Now that we have the sizes, we can boradcast the tokens + # and length tensors. + sizes = sizes_tensor.tolist() + prompts_tokens_cuda_long_tensor = broadcast_tensor( + sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank) + prompts_length_cuda_long_tensor = broadcast_tensor( + sizes[0], torch.int64, tensor=prompts_length_cuda_long_tensor, + rank=rank) + + return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor + + +def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS): + """Given a set of prompts and number of tokens to generate: + - tokenize prompts + - set the sequence length to be the max of length of prompts + plus the number of tokens we would like to generate + - pad all the sequences to this length so we can convert them + into a 2D tensor. + """ + + # Tokenize all the prompts. + args = get_args() + tokenizer = get_tokenizer(args) + if hasattr(tokenizer, 'eod'): + eod_token = tokenizer.eod + elif hasattr(tokenizer, 'eos_id'): + eod_token = tokenizer.eos_id + else: + raise AttributeError('No eod token found in Tokenizer') + if add_BOS: + prompts_tokens = [[eod_token] + tokenizer.tokenize(prompt) + for prompt in prompts] + else: + prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts] + + # Now we have a list of list of tokens which each list has a different + # size. We want to extend this list to: + # - incorporate the tokens that need to be generated + # - make all the sequences equal length. + # Get the prompts length. + prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens] + # Get the max prompts length. + max_prompt_len = max(prompts_length) + # Number of tokens in the each sample of the batch. + samples_length = max_prompt_len + tokens_to_generate + # Now update the list of list to be of the same size: samples_length. + for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length): + padding_size = samples_length - prompt_length + prompt_tokens.extend([eod_token] * padding_size) + + # Now we are in a structured format, we can convert to tensors. + prompts_tokens_tensor = torch.tensor(prompts_tokens, dtype=torch.long, device='cuda') + prompts_length_tensor = torch.tensor(prompts_length, dtype=torch.long, device='cuda') + + return prompts_tokens_tensor, prompts_length_tensor diff --git a/megatron/inference/text_generation_server.py b/megatron/inference/text_generation_server.py new file mode 100644 index 0000000..2eba2e2 --- /dev/null +++ b/megatron/inference/text_generation_server.py @@ -0,0 +1,241 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +import datetime +import torch +import json +import threading +from flask import Flask, request, jsonify, current_app +from flask_restful import Resource, Api +from megatron.training import get_args +from megatron.inference.text_generation import generate_and_post_process +from megatron.inference.text_generation import beam_search_and_post_process + + +GENERATE_NUM = 0 +BEAM_NUM = 1 +lock = threading.Lock() + +class MegatronGenerate(Resource): + def __init__(self, model): + self.model = model + + @staticmethod + def send_do_generate(): + choice = torch.tensor([GENERATE_NUM], dtype=torch.long, device='cuda') + torch.distributed.broadcast(choice, 0) + + @staticmethod + def send_do_beam_search(): + choice = torch.tensor([BEAM_NUM], dtype=torch.long, device='cuda') + torch.distributed.broadcast(choice, 0) + + def put(self): + args = get_args() + + if not "prompts" in request.get_json(): + return "prompts argument required", 400 + + if "max_len" in request.get_json(): + return "max_len is no longer used. Replace with tokens_to_generate", 400 + + if "sentences" in request.get_json(): + return "sentences is no longer used. Replace with prompts", 400 + + prompts = request.get_json()["prompts"] + if not isinstance(prompts, list): + return "prompts is not a list of strings", 400 + + if len(prompts) == 0: + return "prompts is empty", 400 + + if len(prompts) > 128: + return "Maximum number of prompts is 128", 400 + + tokens_to_generate = 64 # Choosing hopefully sane default. Full sequence is slow + if "tokens_to_generate" in request.get_json(): + tokens_to_generate = request.get_json()["tokens_to_generate"] + if not isinstance(tokens_to_generate, int): + return "tokens_to_generate must be an integer greater than 0" + if tokens_to_generate < 0: + return "tokens_to_generate must be an integer greater than or equal to 0" + + logprobs = False + if "logprobs" in request.get_json(): + logprobs = request.get_json()["logprobs"] + if not isinstance(logprobs, bool): + return "logprobs must be a boolean value" + + if tokens_to_generate == 0 and not logprobs: + return "tokens_to_generate=0 implies logprobs should be True" + + temperature = 1.0 + if "temperature" in request.get_json(): + temperature = request.get_json()["temperature"] + if not (type(temperature) == int or type(temperature) == float): + return "temperature must be a positive number less than or equal to 100.0" + if not (0.0 < temperature <= 100.0): + return "temperature must be a positive number less than or equal to 100.0" + + top_k = 0.0 + if "top_k" in request.get_json(): + top_k = request.get_json()["top_k"] + if not (type(top_k) == int): + return "top_k must be an integer equal to or greater than 0 and less than or equal to 1000" + if not (0 <= top_k <= 1000): + return "top_k must be equal to or greater than 0 and less than or equal to 1000" + + top_p = 0.0 + if "top_p" in request.get_json(): + top_p = request.get_json()["top_p"] + if not (type(top_p) == float): + return "top_p must be a positive float less than or equal to 1.0" + if top_p > 0.0 and top_k > 0.0: + return "cannot set both top-k and top-p samplings." + if not (0 <= top_p <= 1.0): + return "top_p must be less than or equal to 1.0" + + top_p_decay = 0.0 + if "top_p_decay" in request.get_json(): + top_p_decay = request.get_json()["top_p_decay"] + if not (type(top_p_decay) == float): + return "top_p_decay must be a positive float less than or equal to 1.0" + if top_p == 0.0: + return "top_p_decay cannot be set without top_p" + if not (0 <= top_p_decay <= 1.0): + return "top_p_decay must be less than or equal to 1.0" + + top_p_bound = 0.0 + if "top_p_bound" in request.get_json(): + top_p_bound = request.get_json()["top_p_bound"] + if not (type(top_p_bound) == float): + return "top_p_bound must be a positive float less than or equal to top_p" + if top_p == 0.0: + return "top_p_bound cannot be set without top_p" + if not (0.0 < top_p_bound <= top_p): + return "top_p_bound must be greater than 0 and less than top_p" + + add_BOS = False + if "add_BOS" in request.get_json(): + add_BOS = request.get_json()["add_BOS"] + if not isinstance(add_BOS, bool): + return "add_BOS must be a boolean value" + + if any([len(prompt) == 0 for prompt in prompts]) and not add_BOS: + return "Empty prompts require add_BOS=true" + + stop_on_double_eol = False + if "stop_on_double_eol" in request.get_json(): + stop_on_double_eol = request.get_json()["stop_on_double_eol"] + if not isinstance(stop_on_double_eol, bool): + return "stop_on_double_eol must be a boolean value" + + stop_on_eol = False + if "stop_on_eol" in request.get_json(): + stop_on_eol = request.get_json()["stop_on_eol"] + if not isinstance(stop_on_eol, bool): + return "stop_on_eol must be a boolean value" + + prevent_newline_after_colon = False + if "prevent_newline_after_colon" in request.get_json(): + prevent_newline_after_colon = request.get_json()["prevent_newline_after_colon"] + if not isinstance(prevent_newline_after_colon, bool): + return "prevent_newline_after_colon must be a boolean value" + + random_seed = -1 + if "random_seed" in request.get_json(): + random_seed = request.get_json()["random_seed"] + if not isinstance(random_seed, int): + return "random_seed must be integer" + if random_seed < 0: + return "random_seed must be a positive integer" + + no_log = False + if "no_log" in request.get_json(): + no_log = request.get_json()["no_log"] + if not isinstance(no_log, bool): + return "no_log must be a boolean value" + + beam_width = None + if "beam_width" in request.get_json(): + beam_width = request.get_json()["beam_width"] + if not isinstance(beam_width, int): + return "beam_width must be integer" + if beam_width < 1: + return "beam_width must be an integer > 1" + if len(prompts) > 1: + return "When doing beam_search, batch size must be 1" + + stop_token=50256 + if "stop_token" in request.get_json(): + stop_token = request.get_json()["stop_token"] + if not isinstance(stop_token, int): + return "stop_token must be an integer" + + length_penalty = 1 + if "length_penalty" in request.get_json(): + length_penalty = request.get_json()["length_penalty"] + if not isinstance(length_penalty, float): + return "length_penalty must be a float" + + with lock: # Need to get lock to keep multiple threads from hitting code + + if not no_log: + print("request IP: " + str(request.remote_addr)) + print(json.dumps(request.get_json()),flush=True) + print("start time: ", datetime.datetime.now()) + + try: + if beam_width is not None: + MegatronGenerate.send_do_beam_search() # Tell other ranks we're doing beam_search + response, response_seg, response_scores = \ + beam_search_and_post_process( + self.model, + prompts=prompts, + tokens_to_generate=tokens_to_generate, + beam_size = beam_width, + add_BOS=add_BOS, + stop_token=stop_token, + num_return_gen=beam_width, # Returning whole beam + length_penalty=length_penalty, + prevent_newline_after_colon=prevent_newline_after_colon + ) + + return jsonify({"text": response, + "segments": response_seg, + "scores": response_scores}) + else: + MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate + response, response_seg, response_logprobs, _ = \ + generate_and_post_process( + self.model, + prompts=prompts, + tokens_to_generate=tokens_to_generate, + return_output_log_probs=logprobs, + top_k_sampling=top_k, + top_p_sampling=top_p, + top_p_decay=top_p_decay, + top_p_bound=top_p_bound, + temperature=temperature, + add_BOS=add_BOS, + use_eod_token_for_early_termination=True, + stop_on_double_eol=stop_on_double_eol, + stop_on_eol=stop_on_eol, + prevent_newline_after_colon=prevent_newline_after_colon, + random_seed=random_seed) + + return jsonify({"text": response, + "segments": response_seg, + "logprobs": response_logprobs}) + + except ValueError as ve: + return ve.args[0] + print("end time: ", datetime.datetime.now()) + + +class MegatronServer(object): + def __init__(self, model): + self.app = Flask(__name__, static_url_path='') + api = Api(self.app) + api.add_resource(MegatronGenerate, '/api', resource_class_args=[model]) + + def run(self, url, port): + self.app.run(url, threaded=True, debug=False, port=port) diff --git a/megatron/legacy/data/__init__.py b/megatron/legacy/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/legacy/data/autoaugment.py b/megatron/legacy/data/autoaugment.py new file mode 100644 index 0000000..7f988c5 --- /dev/null +++ b/megatron/legacy/data/autoaugment.py @@ -0,0 +1,320 @@ +"""AutoAugment data augmentation policy for ImageNet. + +-- Begin license text. + +MIT License + +Copyright (c) 2018 Philip Popien + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-- End license text. + +Code adapted from https://github.com/DeepVoltaire/AutoAugment. + +This module implements the fixed AutoAugment data augmentation policy for ImageNet provided in +Appendix A, Table 9 of reference [1]. It does not include any of the search code for augmentation +policies. + +Reference: +[1] https://arxiv.org/abs/1805.09501 +""" + +import random + +import numpy as np +from PIL import Image +from PIL import ImageEnhance +from PIL import ImageOps + +_MAX_LEVEL = 10 # Maximum integer strength of an augmentation, if applicable. + + +class ImageNetPolicy: + """Definition of an ImageNetPolicy. + + Implements a fixed AutoAugment data augmentation policy targeted at + ImageNet training by randomly applying at runtime one of the 25 pre-defined + data augmentation sub-policies provided in Reference [1]. + + Usage example as a Pytorch Transform: + >>> transform=transforms.Compose([transforms.Resize(256), + >>> ImageNetPolicy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + """Initialize an ImageNetPolicy. + + Args: + fillcolor (tuple): RGB color components of the color to be used for + filling when needed (default: (128, 128, 128), which + corresponds to gray). + """ + # Instantiate a list of sub-policies. + # Each entry of the list is a SubPolicy which consists of + # two augmentation operations, + # each of those parametrized as operation, probability, magnitude. + # Those two operations are applied sequentially on the image upon call. + self.policies = [ + SubPolicy("posterize", 0.4, 8, "rotate", 0.6, 9, fillcolor), + SubPolicy("solarize", 0.6, 5, "autocontrast", 0.6, 5, fillcolor), + SubPolicy("equalize", 0.8, 8, "equalize", 0.6, 3, fillcolor), + SubPolicy("posterize", 0.6, 7, "posterize", 0.6, 6, fillcolor), + SubPolicy("equalize", 0.4, 7, "solarize", 0.2, 4, fillcolor), + SubPolicy("equalize", 0.4, 4, "rotate", 0.8, 8, fillcolor), + SubPolicy("solarize", 0.6, 3, "equalize", 0.6, 7, fillcolor), + SubPolicy("posterize", 0.8, 5, "equalize", 1.0, 2, fillcolor), + SubPolicy("rotate", 0.2, 3, "solarize", 0.6, 8, fillcolor), + SubPolicy("equalize", 0.6, 8, "posterize", 0.4, 6, fillcolor), + SubPolicy("rotate", 0.8, 8, "color", 0.4, 0, fillcolor), + SubPolicy("rotate", 0.4, 9, "equalize", 0.6, 2, fillcolor), + SubPolicy("equalize", 0.0, 7, "equalize", 0.8, 8, fillcolor), + SubPolicy("invert", 0.6, 4, "equalize", 1.0, 8, fillcolor), + SubPolicy("color", 0.6, 4, "contrast", 1.0, 8, fillcolor), + SubPolicy("rotate", 0.8, 8, "color", 1.0, 2, fillcolor), + SubPolicy("color", 0.8, 8, "solarize", 0.8, 7, fillcolor), + SubPolicy("sharpness", 0.4, 7, "invert", 0.6, 8, fillcolor), + SubPolicy("shearX", 0.6, 5, "equalize", 1.0, 9, fillcolor), + SubPolicy("color", 0.4, 0, "equalize", 0.6, 3, fillcolor), + SubPolicy("equalize", 0.4, 7, "solarize", 0.2, 4, fillcolor), + SubPolicy("solarize", 0.6, 5, "autocontrast", 0.6, 5, fillcolor), + SubPolicy("invert", 0.6, 4, "equalize", 1.0, 8, fillcolor), + SubPolicy("color", 0.6, 4, "contrast", 1.0, 8, fillcolor), + SubPolicy("equalize", 0.8, 8, "equalize", 0.6, 3, fillcolor), + ] + + def __call__(self, img): + """Define call method for ImageNetPolicy class.""" + policy_idx = random.randint(0, len(self.policies) - 1) + return self.policies[policy_idx](img) + + def __repr__(self): + """Define repr method for ImageNetPolicy class.""" + return "ImageNetPolicy" + + +class SubPolicy: + """Definition of a SubPolicy. + + A SubPolicy consists of two augmentation operations, + each of those parametrized as operation, probability, magnitude. + The two operations are applied sequentially on the image upon call. + """ + + def __init__( + self, + operation1, + probability1, + magnitude_idx1, + operation2, + probability2, + magnitude_idx2, + fillcolor, + ): + """Initialize a SubPolicy. + + Args: + operation1 (str): Key specifying the first augmentation operation. + There are fourteen key values altogether (see supported_ops below + listing supported operations). probability1 (float): Probability + within [0., 1.] of applying the first augmentation operation. + magnitude_idx1 (int): Integer specifiying the strength of the first + operation as an index further used to derive the magnitude from a + range of possible values. + operation2 (str): Key specifying the second augmentation operation. + probability2 (float): Probability within [0., 1.] of applying the + second augmentation operation. + magnitude_idx2 (int): Integer specifiying the strength of the + second operation as an index further used to derive the magnitude + from a range of possible values. + fillcolor (tuple): RGB color components of the color to be used for + filling. + Returns: + """ + # List of supported operations for operation1 and operation2. + supported_ops = [ + "shearX", + "shearY", + "translateX", + "translateY", + "rotate", + "color", + "posterize", + "solarize", + "contrast", + "sharpness", + "brightness", + "autocontrast", + "equalize", + "invert", + ] + assert (operation1 in supported_ops) and ( + operation2 in supported_ops + ), "SubPolicy:one of oper1 or oper2 refers to an unsupported operation." + + assert ( + 0.0 <= probability1 <= 1.0 and 0.0 <= probability2 <= 1.0 + ), "SubPolicy: prob1 and prob2 should be within [0., 1.]." + + assert ( + isinstance(magnitude_idx1, int) and 0 <= magnitude_idx1 <= 10 + ), "SubPolicy: idx1 should be specified as an integer within [0, 10]." + + assert ( + isinstance(magnitude_idx2, int) and 0 <= magnitude_idx2 <= 10 + ), "SubPolicy: idx2 should be specified as an integer within [0, 10]." + + # Define a dictionary where each key refers to a specific type of + # augmentation and the corresponding value is a range of ten possible + # magnitude values for that augmentation. + num_levels = _MAX_LEVEL + 1 + ranges = { + "shearX": np.linspace(0, 0.3, num_levels), + "shearY": np.linspace(0, 0.3, num_levels), + "translateX": np.linspace(0, 150 / 331, num_levels), + "translateY": np.linspace(0, 150 / 331, num_levels), + "rotate": np.linspace(0, 30, num_levels), + "color": np.linspace(0.0, 0.9, num_levels), + "posterize": np.round(np.linspace(8, 4, num_levels), 0).astype( + np.int32 + ), + "solarize": np.linspace(256, 0, num_levels), # range [0, 256] + "contrast": np.linspace(0.0, 0.9, num_levels), + "sharpness": np.linspace(0.0, 0.9, num_levels), + "brightness": np.linspace(0.0, 0.9, num_levels), + "autocontrast": [0] + * num_levels, # This augmentation doesn't use magnitude parameter. + "equalize": [0] + * num_levels, # This augmentation doesn't use magnitude parameter. + "invert": [0] + * num_levels, # This augmentation doesn't use magnitude parameter. + } + + def rotate_with_fill(img, magnitude): + """Define rotation transformation with fill. + + The input image is first rotated, then it is blended together with + a gray mask of the same size. Note that fillcolor as defined + elsewhere in this module doesn't apply here. + + Args: + magnitude (float): rotation angle in degrees. + Returns: + rotated_filled (PIL Image): rotated image with gray filling for + disoccluded areas unveiled by the rotation. + """ + rotated = img.convert("RGBA").rotate(magnitude) + rotated_filled = Image.composite( + rotated, Image.new("RGBA", rotated.size, (128,) * 4), rotated + ) + return rotated_filled.convert(img.mode) + + # Define a dictionary of augmentation functions where each key refers + # to a specific type of augmentation and the corresponding value defines + # the augmentation itself using a lambda function. + # pylint: disable=unnecessary-lambda + func_dict = { + "shearX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, + fillcolor=fillcolor, + ), + "shearY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0), + Image.BICUBIC, + fillcolor=fillcolor, + ), + "translateX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + ( + 1, + 0, + magnitude * img.size[0] * random.choice([-1, 1]), + 0, + 1, + 0, + ), + fillcolor=fillcolor, + ), + "translateY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + ( + 1, + 0, + 0, + 0, + 1, + magnitude * img.size[1] * random.choice([-1, 1]), + ), + fillcolor=fillcolor, + ), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance( + 1 + magnitude * random.choice([-1, 1]) + ), + "posterize": lambda img, magnitude: ImageOps.posterize( + img, magnitude + ), + "solarize": lambda img, magnitude: ImageOps.solarize( + img, magnitude + ), + "contrast": lambda img, magnitude: ImageEnhance.Contrast( + img + ).enhance(1 + magnitude * random.choice([-1, 1])), + "sharpness": lambda img, magnitude: ImageEnhance.Sharpness( + img + ).enhance(1 + magnitude * random.choice([-1, 1])), + "brightness": lambda img, magnitude: ImageEnhance.Brightness( + img + ).enhance(1 + magnitude * random.choice([-1, 1])), + "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img), + "equalize": lambda img, magnitude: ImageOps.equalize(img), + "invert": lambda img, magnitude: ImageOps.invert(img), + } + + # Store probability, function and magnitude of the first augmentation + # for the sub-policy. + self.probability1 = probability1 + self.operation1 = func_dict[operation1] + self.magnitude1 = ranges[operation1][magnitude_idx1] + + # Store probability, function and magnitude of the second augmentation + # for the sub-policy. + self.probability2 = probability2 + self.operation2 = func_dict[operation2] + self.magnitude2 = ranges[operation2][magnitude_idx2] + + def __call__(self, img): + """Define call method for SubPolicy class.""" + # Randomly apply operation 1. + if random.random() < self.probability1: + img = self.operation1(img, self.magnitude1) + + # Randomly apply operation 2. + if random.random() < self.probability2: + img = self.operation2(img, self.magnitude2) + + return img diff --git a/megatron/legacy/data/biencoder_dataset_utils.py b/megatron/legacy/data/biencoder_dataset_utils.py new file mode 100644 index 0000000..4ea43cd --- /dev/null +++ b/megatron/legacy/data/biencoder_dataset_utils.py @@ -0,0 +1,209 @@ +import os +import time + +import numpy as np +import torch + +from megatron.training import get_args, get_tokenizer, print_rank_0 +from megatron.core import mpu, tensor_parallel +from megatron.legacy.data.dataset_utils import create_masked_lm_predictions, \ + pad_and_convert_to_numpy +from megatron.legacy.data.data_samplers import MegatronPretrainingSampler + +def make_attention_mask(source_block, target_block): + """ + Returns a 2-dimensional (2-D) attention mask + :param source_block: 1-D array + :param target_block: 1-D array + """ + mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1) + mask = mask.astype(np.int64) + # (source_length, target_length) + return mask + +def get_one_epoch_dataloader(dataset, micro_batch_size=None): + """Specifically one epoch to be used in an indexing job.""" + args = get_args() + + if micro_batch_size is None: + micro_batch_size = args.micro_batch_size + num_workers = args.num_workers + + # Use megatron's sampler with consumed samples set to 0 as + # this is only for evaluation and don't intend to resume half way. + # Also, set the drop last to false as don't intend to remove + # the last batch + batch_sampler = MegatronPretrainingSampler( + total_samples=len(dataset), + consumed_samples=0, + micro_batch_size=args.micro_batch_size, + data_parallel_rank=mpu.get_data_parallel_rank(), + data_parallel_size=mpu.get_data_parallel_world_size(), + drop_last=False) + + return torch.utils.data.DataLoader(dataset, + batch_sampler=batch_sampler, + num_workers=num_workers, + pin_memory=True) + + +def get_ict_batch(data_iterator): + # Items and their type. + keys = ['query_tokens', 'query_mask', + 'context_tokens', 'context_mask', 'block_data'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is None: + data = None + else: + data = next(data_iterator) + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + query_tokens = data_b['query_tokens'].long() + query_mask = data_b['query_mask'] < 0.5 + context_tokens = data_b['context_tokens'].long() + context_mask = data_b['context_mask'] < 0.5 + block_indices = data_b['block_data'].long() + + return query_tokens, query_mask,\ + context_tokens, context_mask, block_indices + + +def join_str_list(str_list): + """Join a list of strings, handling spaces appropriately""" + result = "" + for s in str_list: + if s.startswith("##"): + result += s[2:] + else: + result += " " + s + return result + + +class BlockSampleData(object): + """A struct for fully describing a fixed-size block of data as used in REALM + + :param start_idx: for first sentence of the block + :param end_idx: for last sentence of the block (may be partially truncated in sample construction) + :param doc_idx: the index of the document from which the block comes in the original indexed dataset + :param block_idx: a unique integer identifier given to every block. + """ + def __init__(self, start_idx, end_idx, doc_idx, block_idx): + self.start_idx = start_idx + self.end_idx = end_idx + self.doc_idx = doc_idx + self.block_idx = block_idx + + def as_array(self): + return np.array([self.start_idx, self.end_idx, self.doc_idx, self.block_idx]).astype(np.int64) + + def as_tuple(self): + return self.start_idx, self.end_idx, self.doc_idx, self.block_idx + + +class BlockSamplesMapping(object): + def __init__(self, mapping_array): + # make sure that the array is compatible with BlockSampleData + assert mapping_array.shape[1] == 4 + self.mapping_array = mapping_array + + def __len__(self): + return self.mapping_array.shape[0] + + def __getitem__(self, idx): + """Get the data associated with an indexed sample.""" + sample_data = BlockSampleData(*self.mapping_array[idx]) + return sample_data + + +def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epochs, + max_num_samples, max_seq_length, seed, name, use_one_sent_docs=False): + """Get samples mapping for a dataset over fixed size blocks. This function also requires + a dataset of the titles for the source documents since their lengths must be taken into account. + + :return: samples_mapping (BlockSamplesMapping) + """ + + if not num_epochs: + if not max_num_samples: + raise ValueError("Need to specify either max_num_samples " + "or num_epochs") + num_epochs = np.iinfo(np.int32).max - 1 + if not max_num_samples: + max_num_samples = np.iinfo(np.int64).max - 1 + + # Filename of the index mapping + indexmap_filename = data_prefix + indexmap_filename += '_{}_indexmap'.format(name) + if num_epochs != (np.iinfo(np.int32).max - 1): + indexmap_filename += '_{}ep'.format(num_epochs) + if max_num_samples != (np.iinfo(np.int64).max - 1): + indexmap_filename += '_{}mns'.format(max_num_samples) + indexmap_filename += '_{}msl'.format(max_seq_length) + indexmap_filename += '_{}s'.format(seed) + if use_one_sent_docs: + indexmap_filename += '_1sentok' + indexmap_filename += '.npy' + + # Build the indexed mapping if not exist. + if mpu.get_data_parallel_rank() == 0 and \ + not os.path.isfile(indexmap_filename): + print(' > WARNING: could not find index map file {}, building ' + 'the indices on rank 0 ...'.format(indexmap_filename)) + + # Make sure the types match the helpers input types. + assert block_dataset.document_indices.dtype == np.int64 + assert block_dataset.sequence_lengths.dtype == np.int32 + + # Build samples mapping + verbose = torch.distributed.get_rank() == 0 + start_time = time.time() + print_rank_0(' > building samples index mapping for {} ...'.format( + name)) + + from megatron.core.datasets import helpers + mapping_array = helpers.build_blocks_mapping( + block_dataset.document_indices, + block_dataset.sequence_lengths, + title_dataset.sequence_lengths, + num_epochs, + max_num_samples, + max_seq_length - 3, # account for added tokens + seed, + verbose, + use_one_sent_docs) + + + print_rank_0(' > done building samples index mapping') + np.save(indexmap_filename, mapping_array, allow_pickle=True) + print_rank_0(' > saved the index mapping in {}'.format( + indexmap_filename)) + # Make sure all the ranks have built the mapping + print_rank_0(' > elapsed time to build and save samples mapping ' + '(seconds): {:4f}'.format( + time.time() - start_time)) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.tensor([1], dtype=torch.long, device='cuda') + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + assert counts[0].item() == torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + + # Load indexed dataset. + print_rank_0(' > loading indexed mapping from {}'.format( + indexmap_filename)) + start_time = time.time() + + mapping_array = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r') + samples_mapping = BlockSamplesMapping(mapping_array) + + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + mapping_array.shape[0])) + + return samples_mapping diff --git a/megatron/legacy/data/data_samplers.py b/megatron/legacy/data/data_samplers.py new file mode 100644 index 0000000..78c7e1a --- /dev/null +++ b/megatron/legacy/data/data_samplers.py @@ -0,0 +1,192 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Dataloaders.""" + + +import random +import torch +import numpy as np +from torch.utils.data import Dataset +from megatron.training import get_args +from megatron.core import mpu + + +def build_pretraining_data_loader(dataset, consumed_samples): + """Build dataloader given an input dataset.""" + + if dataset is None: + return None + args = get_args() + + # Megatron sampler + if args.dataloader_type == 'single': + batch_sampler = MegatronPretrainingSampler( + total_samples=len(dataset), + consumed_samples=consumed_samples, + micro_batch_size=args.micro_batch_size, + data_parallel_rank=mpu.get_data_parallel_rank(), + data_parallel_size=mpu.get_data_parallel_world_size()) + elif args.dataloader_type == 'cyclic': + batch_sampler = MegatronPretrainingRandomSampler( + dataset, + total_samples=len(dataset), + consumed_samples=consumed_samples, + micro_batch_size=args.micro_batch_size, + data_parallel_rank=mpu.get_data_parallel_rank(), + data_parallel_size=mpu.get_data_parallel_world_size(), + data_sharding=args.data_sharding) + elif args.dataloader_type == "external": + # External dataloaders are passed through. User is expected to provide a + # torch-compatible dataloader and define samplers, if needed. + return dataset + else: + raise Exception('{} dataloader type is not supported.'.format( + args.dataloader_type)) + + # Torch dataloader. + return torch.utils.data.DataLoader(dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True, + persistent_workers=True if args.num_workers > 0 else False, + ) + +class MegatronPretrainingSampler: + + def __init__(self, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, drop_last=True): + # Keep a copy of input params for later use. + self.total_samples = total_samples + self.consumed_samples = consumed_samples + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.drop_last = drop_last + + # Sanity checks. + assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.consumed_samples < self.total_samples, \ + 'no samples left to consume: {}, {}'.format(self.consumed_samples, + self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples + + def get_start_end_idx(self): + start_idx = self.data_parallel_rank * self.micro_batch_size + end_idx = start_idx + self.micro_batch_size + return start_idx, end_idx + + def __iter__(self): + batch = [] + # Last batch will be dropped if drop_last is not set False + for idx in range(self.consumed_samples, self.total_samples): + batch.append(idx) + if len(batch) == self.micro_batch_times_data_parallel_size: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + batch = [] + + # Check the last partial batch and see drop_last is set + if len(batch) > 0 and not self.drop_last: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + + +class RandomSeedDataset(Dataset): + + def __init__(self, dataset): + args = get_args() + self.base_seed = args.seed + self.curr_seed = args.seed + self.dataset = dataset + + def __len__(self): + return len(self.dataset) + + def set_epoch(self, epoch): + self.curr_seed = self.base_seed + epoch + + def __getitem__(self, idx): + seed = idx + self.curr_seed + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + return self.dataset[idx] + + +class MegatronPretrainingRandomSampler: + + def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, data_sharding): + # Keep a copy of input params for later use. + self.dataset = dataset + self.total_samples = total_samples + self.consumed_samples = consumed_samples + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.data_parallel_size = data_parallel_size + self.data_sharding = data_sharding + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.last_batch_size = \ + self.total_samples % self.micro_batch_times_data_parallel_size + + # Sanity checks. + assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples + + def __iter__(self): + active_total_samples = self.total_samples - self.last_batch_size + self.epoch = self.consumed_samples // active_total_samples + current_epoch_samples = self.consumed_samples % active_total_samples + assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 + + if isinstance(self.dataset, RandomSeedDataset): + self.dataset.set_epoch(self.epoch) + + # data sharding and random sampling + if self.data_sharding: + bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ + * self.micro_batch_size + bucket_offset = current_epoch_samples // self.data_parallel_size + start_idx = self.data_parallel_rank * bucket_size + + g = torch.Generator() + g.manual_seed(self.epoch) + random_idx = torch.randperm(bucket_size, generator=g).tolist() + idx_range = [start_idx + x for x in random_idx[bucket_offset:]] + else: + full_bucket_size = (self.total_samples // self.micro_batch_size) \ + * self.micro_batch_size + full_bucket_offset = current_epoch_samples + g = torch.Generator() + g.manual_seed(self.epoch) + idx_range_total = \ + torch.randperm(full_bucket_size, generator=g).tolist() + idx_range_active = idx_range_total[full_bucket_offset:] + idx_range = idx_range_active[self.data_parallel_rank::self.data_parallel_size] + + batch = [] + # Last batch if not complete will be dropped. + for idx in idx_range: + batch.append(idx) + if len(batch) == self.micro_batch_size: + self.consumed_samples += self.micro_batch_times_data_parallel_size + yield batch + batch = [] diff --git a/megatron/legacy/data/dataset_utils.py b/megatron/legacy/data/dataset_utils.py new file mode 100644 index 0000000..f6ff472 --- /dev/null +++ b/megatron/legacy/data/dataset_utils.py @@ -0,0 +1,726 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors, and NVIDIA. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Most of the code here has been copied from: +# https://github.com/google-research/albert/blob/master/create_pretraining_data.py +# with some modifications. + +import math +import os +import time +import collections + +import numpy as np +import torch + +from megatron.training import ( + get_args, + print_rank_0 +) +from megatron.core import mpu +from megatron.core.datasets.indexed_dataset import IndexedDataset + + +DSET_TYPE_BERT = 'standard_bert' +DSET_TYPE_ICT = 'ict' +DSET_TYPE_T5 = 't5' +DSET_TYPE_MULTIMODAL = 'multimodal' + +DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5, DSET_TYPE_MULTIMODAL] + + +def get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples): + + # The data prefix should be in the format of: + # weight-1, data-prefix-1, weight-2, data-prefix-2, .. + assert len(data_prefix) % 2 == 0 + num_datasets = len(data_prefix) // 2 + weights = [0]*num_datasets + prefixes = [0]*num_datasets + for i in range(num_datasets): + weights[i] = float(data_prefix[2*i]) + prefixes[i] = (data_prefix[2*i+1]).strip() + # Normalize weights + weight_sum = 0.0 + for weight in weights: + weight_sum += weight + assert weight_sum > 0.0 + weights = [weight / weight_sum for weight in weights] + + # Add 0.5% (the 1.005 factor) so in case the bleding dataset does + # not uniformly distribute the number of samples, we still have + # samples left to feed to the network. + if isinstance(train_valid_test_num_samples, list): + datasets_train_valid_test_num_samples = [] + for weight in weights: + datasets_train_valid_test_num_samples.append( + [int(math.ceil(val * weight * 1.005)) + for val in train_valid_test_num_samples]) + else: + # Used when separate dataset files are provided for train, + # valid and test + datasets_train_valid_test_num_samples = [ + int(math.ceil(train_valid_test_num_samples * weight * 1.005)) + for weight in weights] + + return prefixes, weights, datasets_train_valid_test_num_samples + + +def get_a_and_b_segments(sample, np_rng): + """Divide sample into a and b segments.""" + + # Number of sentences in the sample. + n_sentences = len(sample) + # Make sure we always have two sentences. + assert n_sentences > 1, 'make sure each sample has at least two sentences.' + + # First part: + # `a_end` is how many sentences go into the `A`. + a_end = 1 + if n_sentences >= 3: + # Note that randin in numpy is exclusive. + a_end = np_rng.randint(1, n_sentences) + tokens_a = [] + for j in range(a_end): + tokens_a.extend(sample[j]) + + # Second part: + tokens_b = [] + for j in range(a_end, n_sentences): + tokens_b.extend(sample[j]) + + # Random next: + is_next_random = False + if np_rng.random() < 0.5: + is_next_random = True + tokens_a, tokens_b = tokens_b, tokens_a + + return tokens_a, tokens_b, is_next_random + + +def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng): + """Truncates a pair of sequences to a maximum sequence length.""" + #print(len_a, len_b, max_num_tokens) + assert len_a > 0 + if len_a + len_b <= max_num_tokens: + return False + while len_a + len_b > max_num_tokens: + if len_a > len_b: + len_a -= 1 + tokens = tokens_a + else: + len_b -= 1 + tokens = tokens_b + if np_rng.random() < 0.5: + del tokens[0] + else: + tokens.pop() + return True + + +def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id): + """Merge segments A and B, add [CLS] and [SEP] and build tokentypes.""" + + tokens = [] + tokentypes = [] + # [CLS]. + tokens.append(cls_id) + tokentypes.append(0) + # Segment A. + for token in tokens_a: + tokens.append(token) + tokentypes.append(0) + # [SEP]. + tokens.append(sep_id) + tokentypes.append(0) + # Segment B. + for token in tokens_b: + tokens.append(token) + tokentypes.append(1) + if tokens_b: + # [SEP]. + tokens.append(sep_id) + tokentypes.append(1) + + return tokens, tokentypes + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def is_start_piece(piece): + """Check if the current word piece is the starting piece (BERT).""" + # When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + return not piece.startswith("##") + + +def create_masked_lm_predictions(tokens, + vocab_id_list, vocab_id_to_token_dict, + masked_lm_prob, + cls_id, sep_id, mask_id, + max_predictions_per_seq, + np_rng, + max_ngrams=3, + do_whole_word_mask=True, + favor_longer_ngram=False, + do_permutation=False, + geometric_dist=False, + masking_style="bert"): + """Creates the predictions for the masked LM objective. + Note: Tokens here are vocab ids and not text tokens.""" + + cand_indexes = [] + # Note(mingdachen): We create a list for recording if the piece is + # the starting piece of current token, where 1 means true, so that + # on-the-fly whole word masking is possible. + token_boundary = [0] * len(tokens) + + for (i, token) in enumerate(tokens): + if token == cls_id or token == sep_id: + token_boundary[i] = 1 + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. + if (do_whole_word_mask and len(cand_indexes) >= 1 and + not is_start_piece(vocab_id_to_token_dict[token])): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + if is_start_piece(vocab_id_to_token_dict[token]): + token_boundary[i] = 1 + + output_tokens = list(tokens) + + masked_lm_positions = [] + masked_lm_labels = [] + + if masked_lm_prob == 0: + return (output_tokens, masked_lm_positions, + masked_lm_labels, token_boundary) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) + if not geometric_dist: + # Note(mingdachen): + # By default, we set the probilities to favor shorter ngram sequences. + pvals = 1. / np.arange(1, max_ngrams + 1) + pvals /= pvals.sum(keepdims=True) + if favor_longer_ngram: + pvals = pvals[::-1] + + ngram_indexes = [] + for idx in range(len(cand_indexes)): + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + ngram_indexes.append(ngram_index) + + np_rng.shuffle(ngram_indexes) + + (masked_lms, masked_spans) = ([], []) + covered_indexes = set() + for cand_index_set in ngram_indexes: + if len(masked_lms) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. + for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes: + continue + + if not geometric_dist: + n = np_rng.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + else: + # Sampling "n" from the geometric distribution and clipping it to + # the max_ngrams. Using p=0.2 default from the SpanBERT paper + # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) + n = min(np_rng.geometric(0.2), max_ngrams) + + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # Note(mingdachen): + # Repeatedly looking for a candidate that does not exceed the + # maximum number of predictions by trying shorter ngrams. + while len(masked_lms) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + masked_token = None + if masking_style == "bert": + # 80% of the time, replace with [MASK] + if np_rng.random() < 0.8: + masked_token = mask_id + else: + # 10% of the time, keep original + if np_rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))] + elif masking_style == "t5": + masked_token = mask_id + else: + raise ValueError("invalid value of masking style") + + output_tokens[index] = masked_token + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + + masked_spans.append(MaskedLmInstance( + index=index_set, + label=[tokens[index] for index in index_set])) + + assert len(masked_lms) <= num_to_predict + np_rng.shuffle(ngram_indexes) + + select_indexes = set() + if do_permutation: + for cand_index_set in ngram_indexes: + if len(select_indexes) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. + for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes or index in select_indexes: + continue + + n = np.random.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + + while len(select_indexes) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + if len(select_indexes) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes or index in select_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + select_indexes.add(index) + assert len(select_indexes) <= num_to_predict + + select_indexes = sorted(select_indexes) + permute_indexes = list(select_indexes) + np_rng.shuffle(permute_indexes) + orig_token = list(output_tokens) + + for src_i, tgt_i in zip(select_indexes, permute_indexes): + output_tokens[src_i] = orig_token[tgt_i] + masked_lms.append(MaskedLmInstance(index=src_i, label=orig_token[src_i])) + + masked_lms = sorted(masked_lms, key=lambda x: x.index) + # Sort the spans by the index of the first span + masked_spans = sorted(masked_spans, key=lambda x: x.index[0]) + + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary, masked_spans) + + +def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, + masked_labels, pad_id, max_seq_length): + """Pad sequences and convert them to numpy.""" + + # Some checks. + num_tokens = len(tokens) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + assert len(tokentypes) == num_tokens + assert len(masked_positions) == len(masked_labels) + + # Tokens and token types. + filler = [pad_id] * padding_length + tokens_np = np.array(tokens + filler, dtype=np.int64) + tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) + + # Padding mask. + padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, + dtype=np.int64) + + # Lables and loss mask. + labels = [-1] * max_seq_length + loss_mask = [0] * max_seq_length + for i in range(len(masked_positions)): + assert masked_positions[i] < num_tokens + labels[masked_positions[i]] = masked_labels[i] + loss_mask[masked_positions[i]] = 1 + labels_np = np.array(labels, dtype=np.int64) + loss_mask_np = np.array(loss_mask, dtype=np.int64) + + return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np + + +def build_train_valid_test_datasets_with_prefixes(train_valid_test_num_samples, + max_seq_length, + seed, + train_data_prefix=None, + valid_data_prefix=None, + test_data_prefix=None, + binary_head=False, + max_seq_length_dec=None, + dataset_type='standard_bert'): + print_rank_0("Separate data paths provided for train, valid & test.") + + train_dataset, valid_dataset, test_dataset = None, None, None + # Single dataset. + if train_data_prefix is not None: + train_dataset = build_dataset("train", train_data_prefix, + train_valid_test_num_samples[0], + max_seq_length, seed, + binary_head, max_seq_length_dec, + dataset_type=dataset_type) + + if valid_data_prefix is not None: + valid_dataset = build_dataset("valid", valid_data_prefix, + train_valid_test_num_samples[1], + max_seq_length, seed, False, + binary_head, max_seq_length_dec, + dataset_type=dataset_type) + + if test_data_prefix is not None: + test_dataset = build_dataset("test", test_data_prefix, + train_valid_test_num_samples[2], + max_seq_length, seed, False, + binary_head, max_seq_length_dec, + dataset_type=dataset_type) + + return (train_dataset, valid_dataset, test_dataset) + + +def build_train_valid_test_datasets(data_prefix, splits_string, + train_valid_test_num_samples, + max_seq_length, seed, + binary_head=False, + max_seq_length_dec=None, + dataset_type='standard_bert'): + + if len(data_prefix) == 1: + return _build_train_valid_test_datasets(data_prefix[0], + splits_string, + train_valid_test_num_samples, + max_seq_length, seed, + binary_head, + max_seq_length_dec, + dataset_type=dataset_type) + + raise NotImplementedError("Blending currently unsupported for non-GPT dataset instances") + + +def _build_train_valid_test_datasets(data_prefix, splits_string, + train_valid_test_num_samples, + max_seq_length, seed, + binary_head, + max_seq_length_dec, + dataset_type='standard_bert'): + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + dataset_type) + + # Get start and end indices of train/valid/train into doc-idx + # Note that doc-idx is desinged to be num-docs + 1 so we can + # easily iterate over it. + total_num_of_documents = indexed_dataset.document_indices.shape[0] - 1 + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + + # Print stats about the splits. + print_rank_0(' > dataset split:') + + def print_split_stats(name, index): + print_rank_0(' {}:'.format(name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], + splits[index + 1] - splits[index])) + start_index = indexed_dataset.document_indices[splits[index]] + end_index = indexed_dataset.document_indices[splits[index + 1]] + print_rank_0(' sentence indices in [{}, {}) total of {} ' + 'sentences'.format(start_index, end_index, + end_index - start_index)) + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_split_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + # Get the pointer to the original doc-idx so we can set it later. + doc_idx_ptr = indexed_dataset.get_document_indices() + # Slice the doc-idx + start_index = splits[index] + # Add +1 so we can index into the dataset to get the upper bound. + end_index = splits[index + 1] + 1 + # New doc_idx view. + indexed_dataset.set_document_indices(doc_idx_ptr[start_index:end_index]) + + dataset = build_dataset( + name, data_prefix, + train_valid_test_num_samples[index], max_seq_length, + seed, binary_head, max_seq_length_dec, + dataset_type, indexed_dataset) + + # Set the original pointer so dataset remains the main dataset. + indexed_dataset.set_document_indices(doc_idx_ptr) + # Checks. + assert indexed_dataset.document_indices[0] == 0 + assert indexed_dataset.document_indices.shape[0] == \ + (total_num_of_documents + 1) + return dataset + + train_dataset = build_split_dataset(0, 'train') + valid_dataset = build_split_dataset(1, 'valid') + test_dataset = build_split_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +def build_dataset(name, data_prefix, max_num_samples, + max_seq_length, seed, binary_head, + max_seq_length_dec, dataset_type='standard_bert', + indexed_dataset=None): + + from megatron.legacy.data.ict_dataset import ICTDataset + from megatron.legacy.data.multimodal_dataset import MultiModalDataset + + if dataset_type == DSET_TYPE_BERT or dataset_type == DSET_TYPE_T5: + raise ValueError("The Megatron-LM BERT and T5 datasets are deprecated.") + + if dataset_type not in DSET_TYPES: + raise ValueError("Invalid dataset_type: ", dataset_type) + + if indexed_dataset is None: + indexed_dataset = get_indexed_dataset_(data_prefix, + dataset_type) + + kwargs = dict( + name=name, + data_prefix=data_prefix, + num_epochs=None, + max_num_samples=max_num_samples, + max_seq_length=max_seq_length, + seed=seed, + ) + + if dataset_type == DSET_TYPE_ICT: + args = get_args() + + title_dataset = get_indexed_dataset_( + args.titles_data_path, + dataset_type) + + dataset = ICTDataset( + block_dataset=indexed_dataset, + title_dataset=title_dataset, + query_in_block_prob=args.query_in_block_prob, + use_one_sent_docs=args.use_one_sent_docs, + binary_head=binary_head, + **kwargs + ) + elif dataset_type == DSET_TYPE_MULTIMODAL: + args = get_args() + dataset = MultiModalDataset( + name=name, + data_prefix=data_prefix, + indexed_dataset=indexed_dataset, + num_samples=max_num_samples, + seq_length=max_seq_length, + seed=seed, + img_h=args.img_h, + img_w=args.img_w, + ) + else: + raise NotImplementedError("Dataset type not fully implemented.") + + return dataset + + +def get_indexed_dataset_(data_prefix, dataset_type): + + print_rank_0(' > building dataset index ...') + + start_time = time.time() + multimodal = dataset_type == DSET_TYPE_MULTIMODAL + indexed_dataset = IndexedDataset(data_prefix, multimodal) + assert indexed_dataset.sequence_lengths.shape[0] == indexed_dataset.document_indices[-1] + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + + print_rank_0(' > indexed dataset stats:') + print_rank_0(' number of documents: {}'.format( + indexed_dataset.document_indices.shape[0] - 1)) + print_rank_0(' number of sentences: {}'.format( + indexed_dataset.sequence_lengths.shape[0])) + + return indexed_dataset + + +def get_train_valid_test_split_(splits_string, size): + """ Get dataset splits from comma or '/' separated string list.""" + + splits = [] + if splits_string.find(',') != -1: + splits = [float(s) for s in splits_string.split(',')] + elif splits_string.find('/') != -1: + splits = [float(s) for s in splits_string.split('/')] + else: + splits = [float(splits_string)] + while len(splits) < 3: + splits.append(0.) + splits = splits[:3] + splits_sum = sum(splits) + assert splits_sum > 0.0 + splits = [split / splits_sum for split in splits] + splits_index = [0] + for index, split in enumerate(splits): + splits_index.append(splits_index[index] + + int(round(split * float(size)))) + diff = splits_index[-1] - size + for index in range(1, len(splits_index)): + splits_index[index] -= diff + assert len(splits_index) == 4 + assert splits_index[-1] == size + return splits_index + +def get_samples_mapping(indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + max_seq_length, + short_seq_prob, + seed, + name, + binary_head): + """Get a list that maps a sample index to a starting sentence index, end sentence index, and length""" + + if not num_epochs: + if not max_num_samples: + raise ValueError("Need to specify either max_num_samples " + "or num_epochs") + num_epochs = np.iinfo(np.int32).max - 1 + if not max_num_samples: + max_num_samples = np.iinfo(np.int64).max - 1 + + # Filename of the index mapping + indexmap_filename = data_prefix + indexmap_filename += '_{}_indexmap'.format(name) + if num_epochs != (np.iinfo(np.int32).max - 1): + indexmap_filename += '_{}ep'.format(num_epochs) + if max_num_samples != (np.iinfo(np.int64).max - 1): + indexmap_filename += '_{}mns'.format(max_num_samples) + indexmap_filename += '_{}msl'.format(max_seq_length) + indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob) + indexmap_filename += '_{}s'.format(seed) + indexmap_filename += '.npy' + + # Build the indexed mapping if not exist. + if torch.distributed.get_rank() == 0 and \ + not os.path.isfile(indexmap_filename): + print(' > WARNING: could not find index map file {}, building ' + 'the indices on rank 0 ...'.format(indexmap_filename)) + + # Make sure the types match the helpers input types. + assert indexed_dataset.document_indices.dtype == np.int64 + assert indexed_dataset.sequence_lengths.dtype == np.int32 + + # Build samples mapping + verbose = torch.distributed.get_rank() == 0 + start_time = time.time() + print_rank_0(' > building samples index mapping for {} ...'.format( + name)) + # First compile and then import. + from megatron.core.datasets import helpers + samples_mapping = helpers.build_mapping( + indexed_dataset.document_indices, + indexed_dataset.sequence_lengths, + num_epochs, + max_num_samples, + max_seq_length, + short_seq_prob, + seed, + verbose, + 2 if binary_head else 1) + print_rank_0(' > done building samples index maping') + np.save(indexmap_filename, samples_mapping, allow_pickle=True) + print_rank_0(' > saved the index mapping in {}'.format( + indexmap_filename)) + # Make sure all the ranks have built the mapping + print_rank_0(' > elasped time to build and save samples mapping ' + '(seconds): {:4f}'.format( + time.time() - start_time)) + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.tensor([1], dtype=torch.long, device='cuda') + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + + # Load indexed dataset. + print_rank_0(' > loading indexed mapping from {}'.format( + indexmap_filename)) + start_time = time.time() + samples_mapping = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + samples_mapping.shape[0])) + + return samples_mapping diff --git a/megatron/legacy/data/ict_dataset.py b/megatron/legacy/data/ict_dataset.py new file mode 100644 index 0000000..2c65f2c --- /dev/null +++ b/megatron/legacy/data/ict_dataset.py @@ -0,0 +1,156 @@ +import itertools +import random + +import numpy as np +from torch.utils.data import Dataset + +from megatron.training import get_tokenizer +from megatron.training import get_args +from megatron.legacy.data.dataset_utils import get_indexed_dataset_ +from megatron.legacy.data.realm_dataset_utils import get_block_samples_mapping + +def make_attention_mask(source_block, target_block): + """ + Returns a 2-dimensional (2-D) attention mask + :param source_block: 1-D array + :param target_block: 1-D array + """ + mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1) + mask = mask.astype(np.int64) + # (source_length, target_length) + return mask + +def get_ict_dataset(use_titles=True, query_in_block_prob=1): + """Get a dataset which uses block samples mappings to get ICT/block indexing data (via get_block()) + rather than for training, since it is only built with a single epoch sample mapping. + """ + args = get_args() + block_dataset = get_indexed_dataset_(args.data_path, 'mmap', True) + titles_dataset = get_indexed_dataset_(args.titles_data_path, 'mmap', True) + + kwargs = dict( + name='full', + block_dataset=block_dataset, + title_dataset=titles_dataset, + data_prefix=args.data_path, + num_epochs=1, + max_num_samples=None, + max_seq_length=args.seq_length, + seed=1, + query_in_block_prob=query_in_block_prob, + use_titles=use_titles, + use_one_sent_docs=args.use_one_sent_docs + ) + dataset = ICTDataset(**kwargs) + return dataset + + +class ICTDataset(Dataset): + """Dataset containing sentences and their blocks for an inverse cloze task.""" + def __init__(self, name, block_dataset, title_dataset, data_prefix, + num_epochs, max_num_samples, max_seq_length, query_in_block_prob, + seed, use_titles=True, use_one_sent_docs=False, binary_head=False): + self.name = name + self.seed = seed + self.max_seq_length = max_seq_length + self.query_in_block_prob = query_in_block_prob + self.block_dataset = block_dataset + self.title_dataset = title_dataset + self.rng = random.Random(self.seed) + self.use_titles = use_titles + self.use_one_sent_docs = use_one_sent_docs + + self.samples_mapping = get_block_samples_mapping( + block_dataset, title_dataset, data_prefix, num_epochs, + max_num_samples, max_seq_length, seed, name, use_one_sent_docs) + self.tokenizer = get_tokenizer() + self.vocab_id_list = list(self.tokenizer.inv_vocab.keys()) + self.vocab_id_to_token_list = self.tokenizer.inv_vocab + self.cls_id = self.tokenizer.cls + self.sep_id = self.tokenizer.sep + self.mask_id = self.tokenizer.mask + self.pad_id = self.tokenizer.pad + + def __len__(self): + return len(self.samples_mapping) + + def __getitem__(self, idx): + """Get an ICT example of a pseudo-query and the block of text from which it was extracted""" + sample_data = self.samples_mapping[idx] + start_idx, end_idx, doc_idx, block_idx = sample_data.as_tuple() + + if self.use_titles: + title = self.title_dataset[int(doc_idx)] + title_pad_offset = 3 + len(title) + else: + title = None + title_pad_offset = 2 + block = [self.block_dataset[i] for i in range(start_idx, end_idx)] + assert len(block) > 1 or self.use_one_sent_docs or self.query_in_block_prob == 1 + + # randint() is inclusive for Python rng + rand_sent_idx = self.rng.randint(0, len(block) - 1) + + # keep the query in the context query_in_block_prob fraction of the time. + if self.rng.random() < self.query_in_block_prob: + query = block[rand_sent_idx].copy() + else: + query = block.pop(rand_sent_idx) + + # still need to truncate because blocks are concluded when + # the sentence lengths have exceeded max_seq_length. + query = query[:self.max_seq_length - 2] + block = list(itertools.chain(*block))[:self.max_seq_length - title_pad_offset] + + query_tokens, query_pad_mask = self.concat_and_pad_tokens(query) + context_tokens, context_pad_mask = self.concat_and_pad_tokens(block, title) + + query_mask = make_attention_mask(query_tokens, query_tokens) + context_mask = make_attention_mask(context_tokens, context_tokens) + + block_data = sample_data.as_array() + + sample = { + 'query_tokens': query_tokens, + 'query_mask': query_mask, + 'query_pad_mask': query_pad_mask, + 'context_tokens': context_tokens, + 'context_mask': context_mask, + 'context_pad_mask': context_pad_mask, + 'block_data': block_data, + } + + return sample + + def get_block(self, start_idx, end_idx, doc_idx): + """Get the IDs for an evidence block plus the title of the corresponding document""" + block = [self.block_dataset[i] for i in range(start_idx, end_idx)] + title = self.title_dataset[int(doc_idx)] + + block = list(itertools.chain(*block))[:self.max_seq_length - (3 + len(title))] + block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title) + + return block_tokens, block_pad_mask + + def get_null_block(self): + """Get empty block and title - used in REALM pretraining""" + block, title = [], [] + block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title) + + return block_tokens, block_pad_mask + + def concat_and_pad_tokens(self, tokens, title=None): + """Concat with special tokens and pad sequence to self.max_seq_length""" + tokens = list(tokens) + if title is None: + tokens = [self.cls_id] + tokens + [self.sep_id] + else: + title = list(title) + tokens = [self.cls_id] + title + [self.sep_id] + tokens + [self.sep_id] + assert len(tokens) <= self.max_seq_length + + num_pad = self.max_seq_length - len(tokens) + pad_mask = [1] * len(tokens) + [0] * num_pad + tokens += [self.pad_id] * num_pad + + return np.array(tokens), np.array(pad_mask) diff --git a/megatron/legacy/data/image_folder.py b/megatron/legacy/data/image_folder.py new file mode 100644 index 0000000..de15b29 --- /dev/null +++ b/megatron/legacy/data/image_folder.py @@ -0,0 +1,302 @@ +# BSD 3-Clause License +# +# Copyright (c) Soumith Chintala 2016, +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# code taken from +# https://github.com/pytorch/vision/blob/main/torchvision/datasets/folder.py +# added support for classes_fraction and data_per_class_fraction + +from torchvision.datasets import VisionDataset +from PIL import Image + +import os +import os.path +from typing import Any, Callable, cast, Dict, List, Optional, Tuple +import numpy as np + +def has_file_allowed_extension(filename: str, extensions: Tuple[str, ...]) -> bool: + """Checks if a file is an allowed extension. + Args: + filename (string): path to a file + extensions (tuple of strings): extensions to consider (lowercase) + Returns: + bool: True if the filename ends with one of given extensions + """ + return filename.lower().endswith(extensions) + + +def is_image_file(filename: str) -> bool: + """Checks if a file is an allowed image extension. + Args: + filename (string): path to a file + Returns: + bool: True if the filename ends with a known image extension + """ + return has_file_allowed_extension(filename, IMG_EXTENSIONS) + + +def make_dataset( + directory: str, + class_to_idx: Dict[str, int], + data_per_class_fraction: float, + extensions: Optional[Tuple[str, ...]] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, +) -> List[Tuple[str, int]]: + """Generates a list of samples of a form (path_to_sample, class). + Args: + directory (str): root dataset directory + class_to_idx (Dict[str, int]): dictionary mapping class name to class index + extensions (optional): A list of allowed extensions. + Either extensions or is_valid_file should be passed. Defaults to None. + is_valid_file (optional): A function that takes path of a file + and checks if the file is a valid file + (used to check of corrupt files) both extensions and + is_valid_file should not be passed. Defaults to None. + Raises: + ValueError: In case ``extensions`` and ``is_valid_file`` are None or both are not None. + Returns: + List[Tuple[str, int]]: samples of a form (path_to_sample, class) + """ + instances = [] + directory = os.path.expanduser(directory) + both_none = extensions is None and is_valid_file is None + both_something = extensions is not None and is_valid_file is not None + if both_none or both_something: + raise ValueError("Both extensions and is_valid_file cannot be None or not None at the same time") + if extensions is not None: + def is_valid_file(x: str) -> bool: + return has_file_allowed_extension(x, cast(Tuple[str, ...], extensions)) + is_valid_file = cast(Callable[[str], bool], is_valid_file) + for target_class in sorted(class_to_idx.keys()): + class_index = class_to_idx[target_class] + target_dir = os.path.join(directory, target_class) + if not os.path.isdir(target_dir): + continue + local_instances = [] + for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)): + for fname in sorted(fnames): + path = os.path.join(root, fname) + if is_valid_file(path): + item = path, class_index + local_instances.append(item) + + instances.extend(local_instances[0:int(len(local_instances) * data_per_class_fraction)]) + + return instances + + +class DatasetFolder(VisionDataset): + """A generic data loader where the samples are arranged in this way: :: + root/class_x/xxx.ext + root/class_x/xxy.ext + root/class_x/[...]/xxz.ext + root/class_y/123.ext + root/class_y/nsdf3.ext + root/class_y/[...]/asd932_.ext + Args: + root (string): Root directory path. + loader (callable): A function to load a sample given its path. + extensions (tuple[string]): A list of allowed extensions. + both extensions and is_valid_file should not be passed. + transform (callable, optional): A function/transform that takes in + a sample and returns a transformed version. + E.g, ``transforms.RandomCrop`` for images. + target_transform (callable, optional): A function/transform that takes + in the target and transforms it. + is_valid_file (callable, optional): A function that takes path of a file + and check if the file is a valid file (used to check of corrupt files) + both extensions and is_valid_file should not be passed. + Attributes: + classes (list): List of the class names sorted alphabetically. + class_to_idx (dict): Dict with items (class_name, class_index). + samples (list): List of (sample path, class_index) tuples + targets (list): The class_index value for each image in the dataset + """ + + def __init__( + self, + root: str, + loader: Callable[[str], Any], + extensions: Optional[Tuple[str, ...]] = None, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + classes_fraction=1.0, + data_per_class_fraction=1.0, + is_valid_file: Optional[Callable[[str], bool]] = None, + ) -> None: + super(DatasetFolder, self).__init__(root, transform=transform, + target_transform=target_transform) + self.classes_fraction = classes_fraction + self.data_per_class_fraction = data_per_class_fraction + classes, class_to_idx = self._find_classes(self.root) + samples = self.make_dataset(self.root, + class_to_idx, + self.data_per_class_fraction, + extensions, + is_valid_file) + if len(samples) == 0: + msg = "Found 0 files in subfolders of: {}\n".format(self.root) + if extensions is not None: + msg += "Supported extensions are: {}".format(",".join(extensions)) + raise RuntimeError(msg) + + self.loader = loader + self.extensions = extensions + self.total = len(samples) + self.classes = classes + self.class_to_idx = class_to_idx + self.samples = samples + self.targets = [s[1] for s in samples] + + @staticmethod + def make_dataset( + directory: str, + class_to_idx: Dict[str, int], + data_per_class_fraction: float, + extensions: Optional[Tuple[str, ...]] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, + ) -> List[Tuple[str, int]]: + return make_dataset(directory, + class_to_idx, + data_per_class_fraction, + extensions=extensions, + is_valid_file=is_valid_file) + + def _find_classes(self, dir: str) -> Tuple[List[str], Dict[str, int]]: + """ + Finds the class folders in a dataset. + Args: + dir (string): Root directory path. + Returns: + tuple: (classes, class_to_idx) where classes are relative to (dir), and class_to_idx is a dictionary. + Ensures: + No class is a subdirectory of another. + """ + all_classes = [d.name for d in os.scandir(dir) if d.is_dir()] + classes = all_classes[0:int(len(all_classes) * self.classes_fraction)] + classes.sort() + class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} + return classes, class_to_idx + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + """ + Args: + index (int): Index + Returns: + tuple: (sample, target) where target is class_index of the target class. + """ + curr_index = index + for x in range(self.total): + try: + path, target = self.samples[curr_index] + sample = self.loader(path) + break + except Exception as e: + curr_index = np.random.randint(0, self.total) + + if self.transform is not None: + sample = self.transform(sample) + if self.target_transform is not None: + target = self.target_transform(target) + + return sample, target + + def __len__(self) -> int: + return len(self.samples) + + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp') + + +def pil_loader(path: str) -> Image.Image: + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') + + +# TODO: specify the return type +def accimage_loader(path: str) -> Any: + import accimage + try: + return accimage.Image(path) + except IOError: + # Potentially a decoding problem, fall back to PIL.Image + return pil_loader(path) + + +def default_loader(path: str) -> Any: + from torchvision import get_image_backend + if get_image_backend() == 'accimage': + return accimage_loader(path) + else: + return pil_loader(path) + + +class ImageFolder(DatasetFolder): + """A generic data loader where the images are arranged in this way: :: + root/dog/xxx.png + root/dog/xxy.png + root/dog/[...]/xxz.png + root/cat/123.png + root/cat/nsdf3.png + root/cat/[...]/asd932_.png + Args: + root (string): Root directory path. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + loader (callable, optional): A function to load an image given its path. + is_valid_file (callable, optional): A function that takes path of an Image file + and check if the file is a valid file (used to check of corrupt files) + Attributes: + classes (list): List of the class names sorted alphabetically. + class_to_idx (dict): Dict with items (class_name, class_index). + imgs (list): List of (image path, class_index) tuples + """ + + def __init__( + self, + root: str, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + classes_fraction=1.0, + data_per_class_fraction=1.0, + loader: Callable[[str], Any] = default_loader, + is_valid_file: Optional[Callable[[str], bool]] = None, + ): + super(ImageFolder, self).__init__(root, loader, IMG_EXTENSIONS if is_valid_file is None else None, + transform=transform, + target_transform=target_transform, + classes_fraction=classes_fraction, + data_per_class_fraction=data_per_class_fraction, + is_valid_file=is_valid_file) + self.imgs = self.samples + diff --git a/megatron/legacy/data/multimodal_dataset.py b/megatron/legacy/data/multimodal_dataset.py new file mode 100644 index 0000000..93ea790 --- /dev/null +++ b/megatron/legacy/data/multimodal_dataset.py @@ -0,0 +1,54 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from PIL import Image, UnidentifiedImageError +import numpy as np +import io +import torch + +try: + from torchvision.transforms import InterpolationMode + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + +from torchvision.transforms import Compose, ToTensor, Normalize, ToPILImage, RandomResizedCrop, Resize + +def _convert_image_to_rgb(image): + return image.convert("RGB") + +def _transform(img_h, img_w): + return Compose([ + ToPILImage(), + RandomResizedCrop((img_h, img_w), scale=(0.5, 1.0), interpolation=BICUBIC), + _convert_image_to_rgb, + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ]) + +class MultiModalDataset(torch.utils.data.Dataset): + + def __init__(self, name, data_prefix, indexed_dataset, + num_samples, seq_length, seed, img_h, img_w): + + self.name = name + self.indexed_dataset = indexed_dataset + self.doc_idx = indexed_dataset.get_document_indices() + self.visual_transform = _transform(img_h, img_w) + + def __len__(self): + return self.indexed_dataset.sequence_lengths.shape[0] + + def __getitem__(self, idx): + text_sample, mode = self.indexed_dataset.get(self.doc_idx[idx]) + assert mode == 0 + img_sample, mode = self.indexed_dataset.get(self.doc_idx[idx]+1) + assert mode == 1 + img_pad = img_sample[0].item() + xs = img_sample[1:].tobytes(order='C') + xs = xs[:len(xs)-img_pad] + + img_sample = np.array(Image.open(io.BytesIO(xs))) + img_sample = self.visual_transform(img_sample).reshape(-1) + + return {'text': np.array(text_sample, dtype=np.int64), + 'img': np.array(img_sample, dtype=np.float32)} diff --git a/megatron/legacy/data/orqa_wiki_dataset.py b/megatron/legacy/data/orqa_wiki_dataset.py new file mode 100644 index 0000000..99217d6 --- /dev/null +++ b/megatron/legacy/data/orqa_wiki_dataset.py @@ -0,0 +1,193 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Wikipedia dataset from DPR code for ORQA.""" + +from abc import ABC +import csv +import numpy as np +import random +import torch +from torch.utils.data import Dataset + +from megatron.training import print_rank_0, get_args, get_tokenizer +from megatron.core import tensor_parallel +from megatron.legacy.data.biencoder_dataset_utils import make_attention_mask + +def get_open_retrieval_wiki_dataset(): + args = get_args() + tokenizer = get_tokenizer() + + dataset = OpenRetrievalEvidenceDataset('2018 Wikipedia from DPR codebase', + 'evidence', + args.evidence_data_path, + tokenizer, + args.retriever_seq_length) + return dataset + + +def get_open_retrieval_batch(data_iterator): + # Items and their type. + keys = ['row_id', 'context', 'context_mask', 'context_types', + 'context_pad_mask'] + datatype = torch.int64 + + # Broadcast data. + data = None if data_iterator is None else next(data_iterator) + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + row_id = data_b['row_id'].long() + context = data_b['context'].long() + + # TODO: make the context mask a binary one + context_mask = (data_b['context_mask'] < 0.5) + + context_types = data_b['context_types'].long() + context_pad_mask = data_b['context_pad_mask'].long() + + return row_id, context, context_mask, context_types, context_pad_mask + + +def build_tokens_types_paddings_from_text(row, tokenizer, max_seq_length): + """Build token types and paddings, trim if needed, and pad if needed.""" + + title_ids = tokenizer.tokenize(row['title']) + context_ids = tokenizer.tokenize(row['text']) + + # Appending the title of the context at front + extended_context_ids = title_ids + [tokenizer.sep_id] + context_ids + + context_ids, context_types, context_pad_mask = \ + build_tokens_types_paddings_from_ids(extended_context_ids, + max_seq_length, tokenizer.cls, tokenizer.sep, tokenizer.pad) + + return context_ids, context_types, context_pad_mask + + +# noinspection DuplicatedCode +def build_tokens_types_paddings_from_ids(text_ids, max_seq_length, + cls_id, sep_id, pad_id): + """Build token types and paddings, trim if needed, and pad if needed.""" + enc_ids = [] + tokentypes_enc = [] + + # [CLS]. + enc_ids.append(cls_id) + tokentypes_enc.append(0) + + # A. + len_src = len(text_ids) + enc_ids.extend(text_ids) + tokentypes_enc.extend([0] * len_src) + + # Cap the size. + if len(enc_ids) > max_seq_length - 1: + enc_ids = enc_ids[0: max_seq_length - 1] + tokentypes_enc = tokentypes_enc[0: max_seq_length - 1] + + # [SEP]. + enc_ids.append(sep_id) + tokentypes_enc.append(0) + + num_tokens_enc = len(enc_ids) + # Padding. + padding_length = max_seq_length - len(enc_ids) + if padding_length > 0: + enc_ids.extend([pad_id] * padding_length) + tokentypes_enc.extend([pad_id] * padding_length) + + pad_mask = ([1] * num_tokens_enc) + ([0] * padding_length) + pad_mask = np.array(pad_mask, dtype=np.int64) + + return enc_ids, tokentypes_enc, pad_mask + + +def build_sample(row_id, context_ids, context_types, context_pad_mask): + """Convert to numpy and return a sample consumed by the batch producer.""" + + context_ids = np.array(context_ids, dtype=np.int64) + context_types = np.array(context_types, dtype=np.int64) + context_mask = make_attention_mask(context_ids, context_ids) + + sample = ({ + 'row_id': row_id, + 'context': context_ids, + 'context_mask': context_mask, + 'context_types': context_types, + 'context_pad_mask': context_pad_mask + }) + return sample + + +class OpenRetrievalEvidenceDataset(ABC, Dataset): + """Open Retrieval Evidence dataset class.""" + + def __init__(self, task_name, dataset_name, datapath, tokenizer, + max_seq_length): + # Store inputs. + self.task_name = task_name + self.dataset_name = dataset_name + self.tokenizer = tokenizer + self.max_seq_length = max_seq_length + print_rank_0(' > building {} dataset for {}:'.format(self.task_name, + self.dataset_name)) + # Process the files. + print_rank_0(datapath) + self.samples, self.id2text = self.process_samples_from_single_path( + datapath) + + args = get_args() + if args.sample_rate < 1: # subsample + k = int(len(self.samples) * args.sample_rate) + self.samples = random.sample(self.samples, k) + + print_rank_0(' >> total number of samples: {}'.format( + len(self.samples))) + + def __len__(self): + return len(self.samples) + + def __getitem__(self, idx): + row = self.samples[idx] + + context_ids, context_types, context_pad_mask = \ + build_tokens_types_paddings_from_text(row, self.tokenizer, + self.max_seq_length) + + sample = build_sample(row['doc_id'], + context_ids, + context_types, + context_pad_mask) + return sample + + @staticmethod + def process_samples_from_single_path(filename): + print_rank_0(' > Processing {} ...'.format(filename)) + total = 0 + + rows = [] + id2text = {} + + with open(filename) as tsvfile: + reader = csv.reader(tsvfile, delimiter='\t') + next(reader, None) # skip the headers + for row in reader: + # file format: doc_id, doc_text, title + doc_id = int(row[0]) + text = row[1] + title = row[2] + + rows.append({'doc_id': doc_id, + 'text': text, + 'title': title}) + + assert doc_id not in id2text + id2text[doc_id] = (text, title) + + total += 1 + if total % 100000 == 0: + print_rank_0(' > processed {} rows so far ...'.format( + total)) + + print_rank_0(' >> processed {} samples.'.format(len(rows))) + return rows, id2text diff --git a/megatron/legacy/data/realm_dataset_utils.py b/megatron/legacy/data/realm_dataset_utils.py new file mode 100644 index 0000000..50bf9bd --- /dev/null +++ b/megatron/legacy/data/realm_dataset_utils.py @@ -0,0 +1,199 @@ +import os +import time + +import numpy as np +import torch + +from megatron.training import print_rank_0 +from megatron.core import mpu, tensor_parallel +from megatron.legacy.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy +from megatron.training import get_args, get_tokenizer, print_rank_0 + + +def get_one_epoch_dataloader(dataset, micro_batch_size=None): + """Specifically one epoch to be used in an indexing job.""" + args = get_args() + + world_size = mpu.get_data_parallel_world_size() + rank = mpu.get_data_parallel_rank() + if micro_batch_size is None: + micro_batch_size = args.micro_batch_size + global_batch_size = micro_batch_size * world_size + num_workers = args.num_workers + + sampler = torch.utils.data.SequentialSampler(dataset) + # importantly, drop_last must be False to get all the data. + assert False, 'DistributedBatchSampler deprecated, change the implementation' + from megatron.legacy.data.samplers import DistributedBatchSampler + batch_sampler = DistributedBatchSampler(sampler, + batch_size=global_batch_size, + drop_last=False, + rank=rank, + world_size=world_size) + + return torch.utils.data.DataLoader(dataset, + batch_sampler=batch_sampler, + num_workers=num_workers, + pin_memory=True) + + +def get_ict_batch(data_iterator): + # Items and their type. + keys = ['query_tokens', 'query_pad_mask', + 'block_tokens', 'block_pad_mask', 'block_data'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is None: + data = None + else: + data = next(data_iterator) + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + query_tokens = data_b['query_tokens'].long() + query_pad_mask = data_b['query_pad_mask'].long() + block_tokens = data_b['block_tokens'].long() + block_pad_mask = data_b['block_pad_mask'].long() + block_indices = data_b['block_data'].long() + + return query_tokens, query_pad_mask,\ + block_tokens, block_pad_mask, block_indices + + +def join_str_list(str_list): + """Join a list of strings, handling spaces appropriately""" + result = "" + for s in str_list: + if s.startswith("##"): + result += s[2:] + else: + result += " " + s + return result + + +class BlockSampleData(object): + """A struct for fully describing a fixed-size block of data as used in REALM + + :param start_idx: for first sentence of the block + :param end_idx: for last sentence of the block (may be partially truncated in sample construction) + :param doc_idx: the index of the document from which the block comes in the original indexed dataset + :param block_idx: a unique integer identifier given to every block. + """ + def __init__(self, start_idx, end_idx, doc_idx, block_idx): + self.start_idx = start_idx + self.end_idx = end_idx + self.doc_idx = doc_idx + self.block_idx = block_idx + + def as_array(self): + return np.array([self.start_idx, self.end_idx, self.doc_idx, self.block_idx]).astype(np.int64) + + def as_tuple(self): + return self.start_idx, self.end_idx, self.doc_idx, self.block_idx + + +class BlockSamplesMapping(object): + def __init__(self, mapping_array): + # make sure that the array is compatible with BlockSampleData + assert mapping_array.shape[1] == 4 + self.mapping_array = mapping_array + + def __len__(self): + return self.mapping_array.shape[0] + + def __getitem__(self, idx): + """Get the data associated with an indexed sample.""" + sample_data = BlockSampleData(*self.mapping_array[idx]) + return sample_data + + +def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epochs, + max_num_samples, max_seq_length, seed, name, use_one_sent_docs=False): + """Get samples mapping for a dataset over fixed size blocks. This function also requires + a dataset of the titles for the source documents since their lengths must be taken into account. + + :return: samples_mapping (BlockSamplesMapping) + """ + + if not num_epochs: + if not max_num_samples: + raise ValueError("Need to specify either max_num_samples " + "or num_epochs") + num_epochs = np.iinfo(np.int32).max - 1 + if not max_num_samples: + max_num_samples = np.iinfo(np.int64).max - 1 + + # Filename of the index mapping + indexmap_filename = data_prefix + indexmap_filename += '_{}_indexmap'.format(name) + if num_epochs != (np.iinfo(np.int32).max - 1): + indexmap_filename += '_{}ep'.format(num_epochs) + if max_num_samples != (np.iinfo(np.int64).max - 1): + indexmap_filename += '_{}mns'.format(max_num_samples) + indexmap_filename += '_{}msl'.format(max_seq_length) + indexmap_filename += '_{}s'.format(seed) + if use_one_sent_docs: + indexmap_filename += '_1sentok' + indexmap_filename += '.npy' + + # Build the indexed mapping if not exist. + if mpu.get_data_parallel_rank() == 0 and \ + not os.path.isfile(indexmap_filename): + print(' > WARNING: could not find index map file {}, building ' + 'the indices on rank 0 ...'.format(indexmap_filename)) + + # Make sure the types match the helpers input types. + assert block_dataset.document_indices.dtype == np.int64 + assert block_dataset.sequence_lengths.dtype == np.int32 + + # Build samples mapping + verbose = torch.distributed.get_rank() == 0 + start_time = time.time() + print_rank_0(' > building samples index mapping for {} ...'.format( + name)) + + from megatron.core.datasets import helpers + mapping_array = helpers.build_blocks_mapping( + block_dataset.document_indices, + block_dataset.sequence_lengths, + title_dataset.sequence_lengths, + num_epochs, + max_num_samples, + max_seq_length - 3, # account for added tokens + seed, + verbose, + use_one_sent_docs) + + + print_rank_0(' > done building samples index mapping') + np.save(indexmap_filename, mapping_array, allow_pickle=True) + print_rank_0(' > saved the index mapping in {}'.format( + indexmap_filename)) + # Make sure all the ranks have built the mapping + print_rank_0(' > elapsed time to build and save samples mapping ' + '(seconds): {:4f}'.format( + time.time() - start_time)) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.tensor([1], dtype=torch.long, device='cuda') + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + assert counts[0].item() == torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + + # Load indexed dataset. + print_rank_0(' > loading indexed mapping from {}'.format( + indexmap_filename)) + start_time = time.time() + + mapping_array = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r') + samples_mapping = BlockSamplesMapping(mapping_array) + + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + mapping_array.shape[0])) + + return samples_mapping diff --git a/megatron/legacy/data/realm_index.py b/megatron/legacy/data/realm_index.py new file mode 100644 index 0000000..2575af7 --- /dev/null +++ b/megatron/legacy/data/realm_index.py @@ -0,0 +1,224 @@ +import itertools +import os +import pickle +import shutil + +import numpy as np +import torch + +from megatron.training import get_args +from megatron.core import mpu + + +def detach(tensor): + return tensor.detach().cpu().numpy() + + +class OpenRetreivalDataStore(object): + """ + Serializable data structure for holding data for blocks -- + embeddings and necessary metadata for Retriever + """ + def __init__(self, embedding_path=None, load_from_path=True, rank=None): + self.embed_data = dict() + if embedding_path is None: + args = get_args() + embedding_path = args.embedding_path + rank = args.rank + self.embedding_path = embedding_path + self.rank = rank + + if load_from_path: + self.load_from_file() + + block_data_name = os.path.splitext(self.embedding_path)[0] + self.temp_dir_name = block_data_name + '_tmp' + + def state(self): + return { + 'embed_data': self.embed_data, + } + + def clear(self): + """ + Clear the embedding data structures to save memory. + The metadata ends up getting used, and is also much smaller in + dimensionality so it isn't really worth clearing. + """ + self.embed_data = dict() + + def load_from_file(self): + """Populate members from instance saved to file""" + + if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0: + print("\n> Unpickling BlockData", flush=True) + state_dict = pickle.load(open(self.embedding_path, 'rb')) + if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0: + print(">> Finished unpickling BlockData\n", flush=True) + + self.embed_data = state_dict['embed_data'] + + def add_block_data(self, row_id, block_embeds, allow_overwrite=False): + """ + Add data for set of blocks + :param row_id: 1D array of unique int ids for the blocks + :param block_embeds: 2D array of embeddings of the blocks + In the case of retriever this will be [start_idx, end_idx, doc_idx] + """ + for idx, embed in zip(row_id, block_embeds): + if not allow_overwrite and idx in self.embed_data: + raise ValueError("Unexpectedly tried to overwrite block data") + + self.embed_data[idx] = np.float16(embed) + + def save_shard(self): + """ + Save the block data that was created this in this process + """ + if not os.path.isdir(self.temp_dir_name): + os.makedirs(self.temp_dir_name, exist_ok=True) + + # save the data for each shard + with open('{}/{}.pkl'.format(self.temp_dir_name, self.rank), 'wb') \ + as writer: + pickle.dump(self.state(), writer) + + def merge_shards_and_save(self): + #Combine all the shards made using save_shard + shard_names = os.listdir(self.temp_dir_name) + seen_own_shard = False + + for fname in os.listdir(self.temp_dir_name): + shard_rank = int(os.path.splitext(fname)[0]) + if shard_rank == self.rank: + seen_own_shard = True + continue + + with open('{}/{}'.format(self.temp_dir_name, fname), 'rb') as f: + data = pickle.load(f) + old_size = len(self.embed_data) + shard_size = len(data['embed_data']) + + # add the shard's data and check to make sure there + # is no overlap + self.embed_data.update(data['embed_data']) + assert len(self.embed_data) == old_size + shard_size + + assert seen_own_shard + + # save the consolidated shards and remove temporary directory + with open(self.embedding_path, 'wb') as final_file: + pickle.dump(self.state(), final_file) + shutil.rmtree(self.temp_dir_name, ignore_errors=True) + + print("Finished merging {} shards for a total of {} embeds".format( + len(shard_names), len(self.embed_data)), flush=True) + + +class FaissMIPSIndex(object): + """ + Wrapper object for a BlockData which similarity search via FAISS under the hood + """ + def __init__(self, embed_size, embed_data=None, use_gpu=False): + self.embed_size = embed_size + self.embed_data = embed_data + self.use_gpu = use_gpu + + self.mips_index = None + self._set_mips_index() + + def _set_mips_index(self): + """ + Create a Faiss Flat index with inner product as the metric + to search against + """ + try: + import faiss + except ImportError: + raise Exception("Error: Please install faiss to use FaissMIPSIndex") + + if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0: + print("\n> Building index", flush=True) + + cpu_index = faiss.IndexFlatIP(self.embed_size) + + if self.use_gpu: + # create resources and config for GpuIndex + config = faiss.GpuMultipleClonerOptions() + config.shard = True + config.useFloat16 = True + gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=config) + self.mips_index = faiss.IndexIDMap(gpu_index) + if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0: + print(">> Initialized index on GPU", flush=True) + else: + # CPU index supports IDs so wrap with IDMap + self.mips_index = faiss.IndexIDMap(cpu_index) + if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0: + print(">> Initialized index on CPU", flush=True) + + # if we were constructed with a BlockData, then automatically load it + # when the FAISS structure is built + if self.embed_data is not None: + self.add_embed_data(self.embed_data) + + def reset_index(self): + """Delete existing index and create a new""" + del self.mips_index + + # reset the block data so that _set_block_index will reload it as well + if self.embed_data is not None: + embed_data_path = self.embed_data.embedding_path + del self.embed_data + self.embed_data = OpenRetreivalDataStore(embed_data_path) + + self._set_mips_index() + + def update_index(self): + """Delete existing index and create a new""" + del self.mips_index + + # reset the block data so that _set_mips_index will reload it as well + if self.embed_data is not None: + self.embed_data.load_from_file() + self._set_mips_index() + + def add_embed_data(self, all_embed_data): + """Add the embedding of each block to the underlying FAISS index""" + + # this assumes the embed_data is a dict : {int: np.array} + block_indices, block_embeds = zip(*all_embed_data.embed_data.items()) + + # the embeddings have to be entered in as float32 even though the math + # internally is done with float16. + embeds_arr = np.float32(np.array(block_embeds)) + indices_arr = np.array(block_indices) + + # we no longer need the embedding data since it's in the index now + all_embed_data.clear() + + self.mips_index.add_with_ids(embeds_arr, indices_arr) + + if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0: + print(">>> Finished adding block data to index", flush=True) + + def search_mips_index(self, query_embeds, top_k, reconstruct=True): + """ + Get the top-k blocks by the index distance metric. + + :param reconstruct: if True: return a [num_queries x k x embed_dim] + array of blocks + if False: return [num_queries x k] array of + distances, and another for indices + """ + query_embeds = np.float32(detach(query_embeds)) + + if reconstruct: + # get the vectors themselves + top_k_block_embeds = self.mips_index.search_and_reconstruct(\ + query_embeds, top_k) + return top_k_block_embeds + else: + # get distances and indices of closest vectors + distances, block_indices = self.mips_index.search(query_embeds, top_k) + return distances, block_indices diff --git a/megatron/legacy/data/vit_dataset.py b/megatron/legacy/data/vit_dataset.py new file mode 100644 index 0000000..e65c536 --- /dev/null +++ b/megatron/legacy/data/vit_dataset.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +import os +import random +import numpy as np +import torch +import torchvision.transforms as T +from torchvision import datasets +from megatron.training import get_args +from megatron.legacy.data.image_folder import ImageFolder +from megatron.legacy.data.autoaugment import ImageNetPolicy +from megatron.legacy.data.data_samplers import RandomSeedDataset +from PIL import Image, ImageFilter, ImageOps + + +class GaussianBlur(object): + """ + Apply Gaussian Blur to the PIL image. + """ + def __init__(self, p=0.5, radius_min=0.1, radius_max=2.): + self.prob = p + self.radius_min = radius_min + self.radius_max = radius_max + + def __call__(self, img): + do_it = random.random() <= self.prob + if not do_it: + return img + + return img.filter( + ImageFilter.GaussianBlur( + radius=random.uniform(self.radius_min, self.radius_max) + ) + ) + + +class Solarization(object): + """ + Apply Solarization to the PIL image. + """ + def __init__(self, p): + self.p = p + + def __call__(self, img): + if random.random() < self.p: + return ImageOps.solarize(img) + else: + return img + + +class ClassificationTransform(): + def __init__(self, image_size, train=True): + args = get_args() + assert args.fp16 or args.bf16 + self.data_type = torch.half if args.fp16 else torch.bfloat16 + if train: + self.transform = T.Compose([ + T.RandomResizedCrop(image_size), + T.RandomHorizontalFlip(), + T.ColorJitter(0.4, 0.4, 0.4, 0.1), + ImageNetPolicy(), + T.ToTensor(), + T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + T.ConvertImageDtype(self.data_type) + ]) + else: + self.transform = T.Compose([ + T.Resize(image_size), + T.CenterCrop(image_size), + T.ToTensor(), + T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + T.ConvertImageDtype(self.data_type) + ]) + + def __call__(self, input): + output = self.transform(input) + return output + + +class InpaintingTransform(): + def __init__(self, image_size, train=True): + + args = get_args() + self.mask_factor = args.mask_factor + self.mask_type = args.mask_type + self.image_size = image_size + self.patch_size = args.patch_dim + self.mask_size = int(self.mask_factor*(image_size[0]/self.patch_size)*(image_size[1]/self.patch_size)) + self.train = train + assert args.fp16 or args.bf16 + self.data_type = torch.half if args.fp16 else torch.bfloat16 + + if self.train: + self.transform = T.Compose([ + T.RandomResizedCrop(self.image_size), + T.RandomHorizontalFlip(), + T.ColorJitter(0.4, 0.4, 0.4, 0.1), + ImageNetPolicy(), + T.ToTensor(), + T.ConvertImageDtype(self.data_type) + ]) + else: + self.transform = T.Compose([ + T.Resize(self.image_size, interpolation=2), + T.CenterCrop(self.image_size), + T.ToTensor(), + T.ConvertImageDtype(self.data_type) + ]) + + def gen_mask(self, image_size, mask_size, mask_type, patch_size): + # output: mask as a list with indices for missing patches + action_list = [[0, 1], [0, -1], [1, 0], [-1, 0]] + assert image_size[0] == image_size[1] + img_size_patch = image_size[0] // patch_size + + # drop masked patches + mask = torch.zeros((image_size[0], image_size[1]), dtype=torch.float) + + if mask_type == 'random': + x = torch.randint(0, img_size_patch, ()) + y = torch.randint(0, img_size_patch, ()) + for i in range(mask_size): + r = torch.randint(0, len(action_list), ()) + x = torch.clamp(x + action_list[r][0], min=0, max=img_size_patch - 1) + y = torch.clamp(y + action_list[r][1], min=0, max=img_size_patch - 1) + x_offset = x * patch_size + y_offset = y * patch_size + mask[x_offset:x_offset+patch_size, y_offset:y_offset+patch_size] = 1 + else: + assert mask_type == 'row' + count = 0 + for x in reversed(range(img_size_patch)): + for y in reversed(range(img_size_patch)): + if (count < mask_size): + count += 1 + x_offset = x * patch_size + y_offset = y * patch_size + mask[x_offset:x_offset+patch_size, y_offset:y_offset+patch_size] = 1 + return mask + + def __call__(self, input): + trans_input = self.transform(input) + mask = self.gen_mask(self.image_size, self.mask_size, + self.mask_type, self.patch_size) + mask = mask.unsqueeze(dim=0) + return trans_input, mask + + +class DinoTransform(object): + def __init__(self, image_size, train=True): + args = get_args() + self.data_type = torch.half if args.fp16 else torch.bfloat16 + + flip_and_color_jitter = T.Compose([ + T.RandomHorizontalFlip(p=0.5), + T.RandomApply( + [T.ColorJitter(brightness=0.4, contrast=0.4, + saturation=0.2, hue=0.1)], + p=0.8 + ), + T.RandomGrayscale(p=0.2), + ]) + + if args.fp16 or args.bf16: + normalize = T.Compose([ + T.ToTensor(), + T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + T.ConvertImageDtype(self.data_type) + ]) + else: + normalize = T.Compose([ + T.ToTensor(), + T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) + + # first global crop + scale_const = 0.4 + self.global_transform1 = T.Compose([ + T.RandomResizedCrop(image_size, + scale=(scale_const, 1), + interpolation=Image.BICUBIC), + flip_and_color_jitter, + GaussianBlur(1.0), + normalize + ]) + # second global crop + self.global_transform2 = T.Compose([ + T.RandomResizedCrop(image_size, + scale=(scale_const, 1), + interpolation=Image.BICUBIC), + flip_and_color_jitter, + GaussianBlur(0.1), + Solarization(0.2), + normalize + ]) + # transformation for the local small crops + self.local_crops_number = args.dino_local_crops_number + self.local_transform = T.Compose([ + T.RandomResizedCrop(args.dino_local_img_size, + scale=(0.05, scale_const), + interpolation=Image.BICUBIC), + flip_and_color_jitter, + GaussianBlur(p=0.5), + normalize + ]) + + def __call__(self, image): + crops = [] + crops.append(self.global_transform1(image)) + crops.append(self.global_transform2(image)) + for _ in range(self.local_crops_number): + crops.append(self.local_transform(image)) + return crops + + +def build_train_valid_datasets(data_path, image_size=224): + args = get_args() + + if args.vision_pretraining_type == 'classify': + train_transform = ClassificationTransform(image_size) + val_transform = ClassificationTransform(image_size, train=False) + elif args.vision_pretraining_type == 'inpaint': + train_transform = InpaintingTransform(image_size, train=False) + val_transform = InpaintingTransform(image_size, train=False) + elif args.vision_pretraining_type == 'dino': + train_transform = DinoTransform(image_size, train=True) + val_transform = ClassificationTransform(image_size, train=False) + else: + raise Exception('{} vit pretraining type is not supported.'.format( + args.vit_pretraining_type)) + + # training dataset + train_data_path = data_path[0] if len(data_path) <= 2 else data_path[2] + train_data = ImageFolder( + root=train_data_path, + transform=train_transform, + classes_fraction=args.classes_fraction, + data_per_class_fraction=args.data_per_class_fraction + ) + train_data = RandomSeedDataset(train_data) + + # validation dataset + val_data_path = data_path[1] + val_data = ImageFolder( + root=val_data_path, + transform=val_transform + ) + val_data = RandomSeedDataset(val_data) + + return train_data, val_data diff --git a/megatron/legacy/fp16_deprecated/loss_scaler.py b/megatron/legacy/fp16_deprecated/loss_scaler.py new file mode 100644 index 0000000..e31d00a --- /dev/null +++ b/megatron/legacy/fp16_deprecated/loss_scaler.py @@ -0,0 +1,26 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""For backward compatibility, we need the class definitions to deserialize.""" + +class LossScaler: + def __init__(self, scale=1): + self.cur_scale = scale + +class DynamicLossScaler: + def __init__(self, + init_scale=2**32, + scale_factor=2., + scale_window=1000, + min_scale=1, + delayed_shift=1, + consecutive_hysteresis=False): + self.cur_scale = init_scale + self.cur_iter = 0 + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + self.min_scale = min_scale + self.delayed_shift = delayed_shift + self.cur_hysteresis = delayed_shift + self.consecutive_hysteresis = consecutive_hysteresis + diff --git a/megatron/legacy/fused_kernels/__init__.py b/megatron/legacy/fused_kernels/__init__.py new file mode 100644 index 0000000..87cceac --- /dev/null +++ b/megatron/legacy/fused_kernels/__init__.py @@ -0,0 +1,75 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import os +import pathlib +import subprocess + +from torch.utils import cpp_extension + +# Setting this param to a list has a problem of generating different +# compilation commands (with diferent order of architectures) and +# leading to recompilation of fused kernels. Set it to empty string +# to avoid recompilation and assign arch flags explicity in +# extra_cuda_cflags below +os.environ["TORCH_CUDA_ARCH_LIST"] = "" + + +def load(args): + + # Check if cuda 11 is installed for compute capability 8.0 + cc_flag = [] + _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version( + cpp_extension.CUDA_HOME + ) + if int(bare_metal_major) >= 11: + cc_flag.append('-gencode') + cc_flag.append('arch=compute_80,code=sm_80') + if int(bare_metal_minor) >= 8: + cc_flag.append('-gencode') + cc_flag.append('arch=compute_90,code=sm_90') + + # Build path + srcpath = pathlib.Path(__file__).parent.absolute() + buildpath = srcpath / "build" + _create_build_dir(buildpath) + + # Helper function to build the kernels. + def _cpp_extention_load_helper(name, sources, extra_cuda_flags): + return cpp_extension.load( + name=name, + sources=sources, + build_directory=buildpath, + extra_cflags=[ + "-O3", + ], + extra_cuda_cflags=[ + "-O3", + "-gencode", + "arch=compute_70,code=sm_70", + "--use_fast_math", + ] + + extra_cuda_flags + + cc_flag, + verbose=(args.rank == 0), + ) + + +def _get_cuda_bare_metal_version(cuda_dir): + raw_output = subprocess.check_output( + [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True + ) + output = raw_output.split() + release_idx = output.index("release") + 1 + release = output[release_idx].split(".") + bare_metal_major = release[0] + bare_metal_minor = release[1][0] + + return raw_output, bare_metal_major, bare_metal_minor + + +def _create_build_dir(buildpath): + try: + os.mkdir(buildpath) + except OSError: + if not os.path.isdir(buildpath): + print(f"Creation of the build directory {buildpath} failed") diff --git a/megatron/legacy/fused_kernels/compat.h b/megatron/legacy/fused_kernels/compat.h new file mode 100644 index 0000000..5495d78 --- /dev/null +++ b/megatron/legacy/fused_kernels/compat.h @@ -0,0 +1,17 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ + +/*This code is copied fron NVIDIA apex: + * https://github.com/NVIDIA/apex + * with minor changes. */ + + + +#ifndef TORCH_CHECK +#define TORCH_CHECK AT_CHECK +#endif + +#ifdef VERSION_GE_1_3 +#define DATA_PTR data_ptr +#else +#define DATA_PTR data +#endif diff --git a/megatron/legacy/fused_kernels/tests/__init__.py b/megatron/legacy/fused_kernels/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/legacy/fused_kernels/tests/test_fused_kernels.py b/megatron/legacy/fused_kernels/tests/test_fused_kernels.py new file mode 100644 index 0000000..adb9ac6 --- /dev/null +++ b/megatron/legacy/fused_kernels/tests/test_fused_kernels.py @@ -0,0 +1,388 @@ +import math + +import torch +from torch.nn import LayerNorm + +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.fused_layer_norm import MixedFusedLayerNorm +from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.legacy.model.utils import attention_mask_func +from megatron.legacy.fused_kernels import load + +def test_load_fused_kernels(): + try: + import fused_layer_norm_cuda + import scaled_masked_softmax_cuda + import scaled_upper_triang_masked_softmax_cuda + import torch + + print("[Success] load_fused_kernels") + except ImportError as e: + print("[Fail] load_fused_kernels") + raise e + +def test_fused_softmax(): + bert = BertModel.from_pretrained("bert-base-cased").cuda().half() + tokenizer = BertTokenizer.from_pretrained("bert-base-cased") + test_text = ( + "Hello. How are you? I am fine thank you and you? yes Good. " + "hi hi hi hi hi hi hi hi hi hi hi hi hi" # 32 + ) + + tokens = tokenizer( + [test_text] * 4, + return_tensors="pt", + ) + + embedding_output = bert.embeddings( + input_ids=tokens["input_ids"].cuda(), + position_ids=None, + token_type_ids=tokens["token_type_ids"].cuda(), + inputs_embeds=None, + past_key_values_length=0, + ) + + # (bsz, 1, 1, seq_len) + mask = bert.get_extended_attention_mask( + attention_mask=tokens["attention_mask"].cuda(), + input_shape=tokens["input_ids"].shape, + device=bert.device, + ) + # (bsz, 1, seq_len, seq_len) + mask = mask.repeat(1, 1, mask.size()[-1], 1) + + attention = bert.encoder.layer[0].attention.self + key_layer = attention.transpose_for_scores(attention.key(embedding_output)) + query_layer = attention.transpose_for_scores(attention.query(embedding_output)) + + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores /= math.sqrt(key_layer.size()[-1]) + + fused_softmax = ( + FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + mask_func=attention_mask_func, + scale=None, + softmax_in_fp32=False, + attn_mask_type=AttnMaskType.padding, + scaled_masked_softmax_fusion=True, + ) + .cuda() + .half() + ) + + fused_softmax_output = fused_softmax( + attention_scores, + (mask != 0), + ) + + torch_softmax = ( + FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + mask_func=attention_mask_func, + scale=None, + softmax_in_fp32=False, + attn_mask_type=AttnMaskType.padding, + scaled_masked_softmax_fusion=False, + ) + .cuda() + .half() + ) + + torch_softmax_output = torch_softmax( + attention_scores, + (mask != 0), + ) + + test_result = (fused_softmax_output - torch_softmax_output).abs() + + while test_result.dim() != 1: + test_result = test_result.mean(dim=-1) + + diff = test_result.mean(dim=-1) + + if diff <= 1e-3: + print( + f"\n[Success] test_fused_softmax" + f"\n > mean_difference={diff}" + f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}" + f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" + ) + else: + print( + f"\n[Fail] test_fused_softmax" + f"\n > mean_difference={diff}, " + f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, " + f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" + ) + + +def test_fused_upper_triangle_mask_softmax(): + gpt = GPT2Model.from_pretrained("gpt2").cuda().half() + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + test_text = ( + "Hello. How are you? I am fine thank you and you? yes Good. " + "hi hi hi hi hi hi hi" # 24 + ) + + tokens = tokenizer( + [test_text] * 4, + return_tensors="pt", + ) + + attention_mask = tokens["attention_mask"].cuda() + attention_mask = attention_mask.view(attention_mask.size(0), -1) + attention_mask = attention_mask[:, None, None, :] + attention_mask = (1.0 - attention_mask) * -10000.0 + attention_mask = attention_mask.repeat(1, 1, attention_mask.size()[-1], 1) + attn = gpt.h[0] + + hidden_states = gpt.wte(tokens["input_ids"].cuda()) + q, k, v = attn.attn.c_attn(hidden_states).split(768, dim=-1) + q = attn.attn._split_heads(q, attn.attn.num_heads, attn.attn.head_dim) + k = attn.attn._split_heads(k, attn.attn.num_heads, attn.attn.head_dim) + attn_weights = torch.matmul(q, k.transpose(-1, -2)) + + sq, sk = q.size(-2), k.size(-2) + causal_mask = attn.attn.bias[:, :, sk - sq : sk, :sk].bool() + total_mask = ~(causal_mask & (attention_mask == 0)) + """ + tensor([[[[False, True, True, ..., True, True, True], + [False, False, True, ..., True, True, True], + [False, False, False, ..., True, True, True], + ..., + [False, False, False, ..., False, True, True], + [False, False, False, ..., False, False, True], + [False, False, False, ..., False, False, False]]] + """ + + fused_softmax = ( + FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + mask_func=attention_mask_func, + scale=None, + softmax_in_fp32=False, + attn_mask_type=AttnMaskType.causal, + scaled_masked_softmax_fusion=True, + ) + .cuda() + .half() + ) + + fused_softmax_output = fused_softmax( + attn_weights, + total_mask, + ) + + torch_softmax = ( + FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + mask_func=attention_mask_func, + scale=None, + softmax_in_fp32=False, + attn_mask_type=AttnMaskType.causal, + scaled_masked_softmax_fusion=False, + ) + .cuda() + .half() + ) + + torch_softmax_output = torch_softmax( + attn_weights, + total_mask, + ) + + test_result = (fused_softmax_output - torch_softmax_output).abs() + + while test_result.dim() != 1: + test_result = test_result.mean(dim=-1) + + diff = test_result.mean(dim=-1) + + if diff <= 1e-3: + print( + f"\n[Success] test_fused_upper_triangle_mask_softmax" + f"\n > mean_difference={diff}" + f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}" + f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" + ) + else: + print( + f"\n[Fail] test_fused_upper_triangle_mask_softmax" + f"\n > mean_difference={diff}, " + f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, " + f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" + ) + + +def test_layer_norm(): + bert = BertModel.from_pretrained("bert-base-cased").cuda().half() + tokenizer = BertTokenizer.from_pretrained("bert-base-cased") + test_text = ( + "Hello. How are you? I am fine thank you and you? yes Good. " + "hi hi hi hi hi hi hi hi hi hi hi hi hi" # 32 + ) + + tokens = tokenizer( + [test_text] * 4, + return_tensors="pt", + ) + + # [bsz, seq_len, d_model] + embedding_output = ( + bert.embeddings( + input_ids=tokens["input_ids"].cuda(), + position_ids=None, + token_type_ids=tokens["token_type_ids"].cuda(), + inputs_embeds=None, + past_key_values_length=0, + ) + .cuda() + .half() + ) + + fused_layernorm_layer = ( + MixedFusedLayerNorm(normalized_shape=embedding_output.size(-1)).cuda().half() + ) + + torch_layernorm_layer = ( + LayerNorm(normalized_shape=embedding_output.size(-1)).cuda().half() + ) + + fused_output = fused_layernorm_layer(embedding_output) + torch_output = torch_layernorm_layer(embedding_output) + test_result = (fused_output - torch_output).abs() + + while test_result.dim() != 1: + test_result = test_result.mean(dim=-1) + + diff = test_result.mean(dim=-1) + + if diff <= 1e-3: + print( + f"\n[Success] test_layer_norm" + f"\n > mean_difference={diff}" + f"\n > fused_values={fused_output[-1][-1][:5].tolist()}" + f"\n > torch_values={torch_output[-1][-1][:5].tolist()}" + ) + else: + print( + f"\n[Fail] test_layer_norm" + f"\n > mean_difference={diff}, " + f"\n > fused_values={fused_output[-1][-1][:5].tolist()}, " + f"\n > torch_values={torch_output[-1][-1][:5].tolist()}" + ) + + +def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + + +def forward_torch_softmax(input, mask, scale): + input = input * scale + mask_output = attention_mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + return probs + + +def test_masked_softmax_forward(): + import scaled_masked_softmax_cuda + + batch = 2 + attn = 16 + scale_t = torch.tensor([1.0]) + for qlen in [128, 256, 1024, 2048, 4096]: + for klen in [128, 256, 1024, 2048]: + inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0') + masks = torch.randint(0, 2, (batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0') + softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item()) + softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item()) + error = (softmax_results_torch - softmax_results).abs().max() + assert error < 1e-3 + +def test_masked_softmax_backward(): + import scaled_masked_softmax_cuda + + batch = 2 + attn = 16 + scale_t = torch.tensor([1.0]) + for qlen in [128, 256, 1024, 2048, 4096]: + for klen in [128, 256, 1024, 2048]: + inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0') + backward = torch.rand_like(inputs, dtype=torch.float16, device='cuda:0') + masks = torch.randint(0, 2, (batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0') + softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item()) + back_grad = scaled_masked_softmax_cuda.backward(backward, softmax_results, scale_t[0].item()) + + inputs.requires_grad = True + softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item()) + softmax_results_torch.backward(backward) + error = (back_grad - inputs.grad).abs().max() + assert error < 1e-3 + + +def test_allmasked_softmax_forward(): + import scaled_masked_softmax_cuda + + batch = 2 + attn = 16 + scale_t = torch.tensor([1.0]) + for qlen in [128, 256, 1024, 2048, 4096]: + for klen in [128, 256, 1024, 2048]: + inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0') + masks = torch.ones((batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0') + softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item()) + softmax_results_torch = torch.zeros_like(inputs) + error = (softmax_results_torch - softmax_results).abs().max() + assert error == 0.0 + + +def test_allmasked_softmax_backward(): + import scaled_masked_softmax_cuda + + batch = 2 + attn = 16 + scale_t = torch.tensor([1.0]) + for qlen in [128, 256, 1024, 2048, 4096]: + for klen in [128, 256, 1024, 2048]: + inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0') + backward = torch.rand_like(inputs, dtype=torch.float16, device='cuda:0') + masks = torch.ones((batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0') + softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item()) + back_grad = scaled_masked_softmax_cuda.backward(backward, softmax_results, scale_t[0].item()) + inputs.requires_grad = True + softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item()) + softmax_results_torch.backward(backward) + error = (back_grad - inputs.grad).abs().max() + assert error < 1e-3 + + +if __name__ == "__main__": + try: + from transformers import BertTokenizer, GPT2Tokenizer + from transformers.models.bert.modeling_bert import BertModel + from transformers.models.gpt2.modeling_gpt2 import GPT2Model + import transformers + + transformers.logging.set_verbosity( + transformers.logging.FATAL, + ) + + except: + print("\n[Fail] Please install `transformers` package to test fused kernels\n") + exit(-1) + + load() + test_masked_softmax_forward() + test_masked_softmax_backward() + test_allmasked_softmax_forward() + test_allmasked_softmax_backward() + test_load_fused_kernels() + test_fused_softmax() + test_fused_upper_triangle_mask_softmax() + test_layer_norm() diff --git a/megatron/legacy/fused_kernels/type_shim.h b/megatron/legacy/fused_kernels/type_shim.h new file mode 100644 index 0000000..d60a6f8 --- /dev/null +++ b/megatron/legacy/fused_kernels/type_shim.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ + + +#include +#include "compat.h" + + +#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \ + switch(TYPE) \ + { \ + case at::ScalarType::Half: \ + { \ + using scalar_t = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } + + +#define DISPATCH_HALF_BFLOAT_AND_FLOAT(TYPE, NAME, ...) \ + switch(TYPE) \ + { \ + case at::ScalarType::Half: \ + { \ + using scalar_t = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::Float: \ + { \ + using scalar_t = float; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } + + + +#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \ + switch(TYPEIN) \ + { \ + case at::ScalarType::Float: \ + { \ + using scalar_t_in = float; \ + switch(TYPEOUT) \ + { \ + case at::ScalarType::Float: \ + { \ + using scalar_t_out = float; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::Half: \ + { \ + using scalar_t_out = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t_out = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ + } \ + break; \ + } \ + case at::ScalarType::Half: \ + { \ + using scalar_t_in = at::Half; \ + using scalar_t_out = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t_in = at::BFloat16; \ + using scalar_t_out = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ + } + diff --git a/megatron/legacy/indexer.py b/megatron/legacy/indexer.py new file mode 100644 index 0000000..75851ad --- /dev/null +++ b/megatron/legacy/indexer.py @@ -0,0 +1,129 @@ +import sys +import time +import torch +import torch.distributed as dist + +from megatron.training import get_args, print_rank_0 +from megatron.core import mpu +from megatron.training.checkpointing import load_biencoder_checkpoint +from megatron.legacy.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset +from megatron.legacy.data.orqa_wiki_dataset import get_open_retrieval_batch +from megatron.legacy.data.biencoder_dataset_utils import get_one_epoch_dataloader +from megatron.legacy.data.realm_index import detach, OpenRetreivalDataStore +from megatron.legacy.model.biencoder_model import get_model_provider +from megatron.training import get_model + + +class IndexBuilder(object): + """ + Object for taking one pass over a dataset and creating a BlockData of its + embeddings + """ + def __init__(self): + args = get_args() + self.model = None + self.dataloader = None + self.evidence_embedder_obj = None + self.biencoder_shared_query_context_model = \ + args.biencoder_shared_query_context_model + + # need to know whether we're using a REALM checkpoint (args.load) + # or ICT checkpoint + assert not (args.load and args.ict_load) + + self.log_interval = args.indexer_log_interval + self.batch_size = args.indexer_batch_size + + self.load_attributes() + self.is_main_builder = mpu.get_data_parallel_rank() == 0 + self.num_total_builders = mpu.get_data_parallel_world_size() + self.iteration = self.total_processed = 0 + + def load_attributes(self): + """ + Load the necessary attributes: model, dataloader and empty BlockData + """ + only_context_model = True + if self.biencoder_shared_query_context_model: + only_context_model = False + + model = get_model(get_model_provider(only_context_model=\ + only_context_model, biencoder_shared_query_context_model=\ + self.biencoder_shared_query_context_model)) + + self.model = load_biencoder_checkpoint(model, + only_context_model=only_context_model) + + assert len(self.model) == 1 + self.model[0].eval() + + self.dataset = get_open_retrieval_wiki_dataset() + self.dataloader = iter(get_one_epoch_dataloader(self.dataset, \ + self.batch_size)) + + self.evidence_embedder_obj = OpenRetreivalDataStore( \ + load_from_path=False) + + def track_and_report_progress(self, batch_size): + """ + Utility function for tracking progress + """ + self.iteration += 1 + self.total_processed += batch_size * self.num_total_builders + if self.is_main_builder and self.iteration % self.log_interval == 0: + print('Batch {:10d} | Total {:10d}'.format(self.iteration, + self.total_processed), flush=True) + + def build_and_save_index(self): + """ + Goes through one epoch of the dataloader and adds all data to this + instance's BlockData. + + The copy of BlockData is saved as a shard, which when run in a + distributed setting will be consolidated by the rank 0 process + and saved as a final pickled BlockData. + """ + assert len(self.model) == 1 + unwrapped_model = self.model[0] + + while not hasattr(unwrapped_model, 'embed_text'): + unwrapped_model = unwrapped_model.module + + while True: + try: + # batch also has query_tokens and query_pad_data + row_id, context_tokens, context_mask, context_types, \ + context_pad_mask = get_open_retrieval_batch( \ + self.dataloader) + except (StopIteration, IndexError): + break + + # TODO: can we add with torch.no_grad() to reduce memory usage + # detach, separate fields and add to BlockData + assert context_mask.dtype == torch.bool + context_logits = unwrapped_model.embed_text( + unwrapped_model.context_model, context_tokens, context_mask, + context_types) + + context_logits = detach(context_logits) + row_id = detach(row_id) + + self.evidence_embedder_obj.add_block_data(row_id, context_logits) + self.track_and_report_progress(batch_size=len(row_id)) + + # This process signals to finalize its shard and then synchronize with + # the other processes + self.evidence_embedder_obj.save_shard() + torch.distributed.barrier() + del self.model + + # rank 0 process builds the final copy + if self.is_main_builder: + self.evidence_embedder_obj.merge_shards_and_save() + # make sure that every single piece of data was embedded + assert len(self.evidence_embedder_obj.embed_data) == \ + len(self.dataset) + self.evidence_embedder_obj.clear() + + # complete building the final copy + torch.distributed.barrier() diff --git a/megatron/legacy/model/__init__.py b/megatron/legacy/model/__init__.py new file mode 100644 index 0000000..cb010e5 --- /dev/null +++ b/megatron/legacy/model/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm +from .rms_norm import RMSNorm + +from .bert_model import BertModel +from .gpt_model import GPTModel +from .t5_model import T5Model +from .language_model import get_language_model +from .module import Float16Module diff --git a/megatron/legacy/model/bert_model.py b/megatron/legacy/model/bert_model.py new file mode 100644 index 0000000..eca22f0 --- /dev/null +++ b/megatron/legacy/model/bert_model.py @@ -0,0 +1,257 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""BERT model.""" + +import torch + +from megatron.training import get_args +from megatron.core import tensor_parallel +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.language_model import parallel_lm_logits +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import get_norm +from megatron.legacy.model.utils import openai_gelu, erf_gelu +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.utils import scaled_init_method_normal +from .module import MegatronModule + + +def bert_extended_attention_mask(attention_mask): + # We create a 3D attention mask from a 2D tensor mask. + # [b, 1, s] + attention_mask_b1s = attention_mask.unsqueeze(1) + # [b, s, 1] + attention_mask_bs1 = attention_mask.unsqueeze(2) + # [b, s, s] + attention_mask_bss = attention_mask_b1s * attention_mask_bs1 + # [b, 1, s, s] + extended_attention_mask = attention_mask_bss.unsqueeze(1) + + # Convert attention mask to binary: + extended_attention_mask = (extended_attention_mask < 0.5) + + return extended_attention_mask + +def bert_position_ids(token_ids): + # Create position ids + seq_length = token_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, + device=token_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(token_ids) + + return position_ids + + +class BertLMHead(MegatronModule): + """Masked LM head for Bert + + Args: + config: TransformerConfig object + mpu_vocab_size: model parallel size of vocabulary. + parallel_output: whether output logits being distributed or not. + """ + + def __init__(self, mpu_vocab_size, config, parallel_output): + super().__init__(config=config) + + args = get_args() + self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size)) + tensor_parallel.set_tensor_model_parallel_attributes(self.bias, True, 0, 1) + self.parallel_output = parallel_output + + self.dense = get_linear_layer(config.hidden_size, config.hidden_size, config.init_method) + setattr(self.dense.weight, 'sequence_parallel', config.sequence_parallel) + setattr(self.dense.bias, 'sequence_parallel', config.sequence_parallel) + + self.norm = get_norm(config) + self.gelu = torch.nn.functional.gelu + if args.openai_gelu: + self.gelu = openai_gelu + elif args.onnx_safe: + self.gelu = erf_gelu + + def forward(self, hidden_states, word_embeddings_weight): + hidden_states = self.dense(hidden_states) + hidden_states = self.gelu(hidden_states) + hidden_states = self.norm(hidden_states) + output = parallel_lm_logits(hidden_states, + word_embeddings_weight, + self.parallel_output, + bias=self.bias) + return output + + def load_state_dict(self, state_dict, strict=True): + """Customize load.""" + + # Handle renaming layernorm -> norm in component names + state_dict_ = {} + for key in state_dict.keys(): + newkey = key.replace("layernorm", "norm") + state_dict_[newkey] = state_dict[key] + + super().load_state_dict(state_dict_, strict) + + +def post_language_model_processing(lm_output, pooled_output, + lm_head, binary_head, + lm_labels, + logit_weights, + fp16_lm_cross_entropy): + # Output. + lm_logits = lm_head( + lm_output, logit_weights) + + binary_logits = None + if binary_head is not None: + binary_logits = binary_head(pooled_output) + + if lm_labels is None: + # [s b h] => [b s h] + return lm_logits.transpose(0,1).contiguous(), binary_logits + else: + # [b s] => [s b] + lm_labels = lm_labels.transpose(0,1).contiguous() + # lm_logits : [s, b, h] and lm_labels: [s, b] + if fp16_lm_cross_entropy: + assert lm_logits.dtype == torch.half + lm_loss = tensor_parallel.vocab_parallel_cross_entropy(lm_logits, lm_labels) + else: + lm_loss = tensor_parallel.vocab_parallel_cross_entropy(lm_logits.float(), + lm_labels) + # [s, b] => [b s] + lm_loss = lm_loss.transpose(0,1).contiguous() + return lm_loss, binary_logits + + +class BertModel(MegatronModule): + """Bert Language model.""" + + def __init__(self, + config, + num_tokentypes=2, + add_binary_head=True, + parallel_output=True, + pre_process=True, + post_process=True): + super().__init__(config=config) + args = get_args() + + # TODO this option is not yet implemented in BERT + assert args.untie_embeddings_and_output_weights is False + + self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy + self.add_binary_head = add_binary_head + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + + self.return_embeddings = args.output_bert_embeddings + if self.return_embeddings: + assert self.post_process and self.add_binary_head + + self.language_model, self._language_model_key = get_language_model( + config=config, + num_tokentypes=num_tokentypes, + add_pooler=self.add_binary_head, + encoder_attn_mask_type=AttnMaskType.padding, + pre_process=self.pre_process, + post_process=self.post_process) + + self.initialize_word_embeddings() + if self.post_process: + self.lm_head = BertLMHead(self.shared_embedding_or_output_weight().size(0), config, parallel_output) + self._lm_head_key = 'lm_head' + self.binary_head = None + if self.add_binary_head: + self.binary_head = get_linear_layer(config.hidden_size, 2, + config.init_method) + self._binary_head_key = 'binary_head' + + def set_input_tensor(self, input_tensor): + """See megatron.legacy.model.transformer.set_input_tensor()""" + self.language_model.set_input_tensor(input_tensor) + + def forward(self, bert_model_input, attention_mask, + tokentype_ids=None, lm_labels=None): + + extended_attention_mask = bert_extended_attention_mask(attention_mask) + input_ids = bert_model_input + position_ids = bert_position_ids(input_ids) + + lm_output = self.language_model( + input_ids, + position_ids, + extended_attention_mask, + tokentype_ids=tokentype_ids + ) + + if self.post_process and self.add_binary_head: + lm_output, pooled_output = lm_output + + # Return pooled output (e.g., when computing Bert embeddings). + if self.return_embeddings: + + # Sum attention mask. + embeddings = torch.transpose(lm_output, 0, 1) + masks = torch.sum(attention_mask, dim=1) + + # Collect masked embeddings. + output = torch.zeros( + size=(embeddings.shape[0], embeddings.shape[2]), + dtype=torch.float32, + device=torch.cuda.current_device()) + for i, (embedding, mask) in enumerate(zip(embeddings, masks)): + output[i, :] = torch.mean(embedding[1: mask - 1], dim=0) + + return output + + else: + pooled_output = None + + if self.post_process: + return post_language_model_processing(lm_output, pooled_output, + self.lm_head, self.binary_head, + lm_labels, + self.shared_embedding_or_output_weight(), + self.fp16_lm_cross_entropy) + else: + return lm_output + + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process: + state_dict_[self._lm_head_key] \ + = self.lm_head.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process and self.add_binary_head: + state_dict_[self._binary_head_key] \ + = self.binary_head.state_dict(prefix=prefix, keep_vars=keep_vars) + # Save word_embeddings. + if self.post_process and not self.pre_process: + state_dict_[self._word_embeddings_for_head_key] \ + = self.word_embeddings.state_dict(prefix=prefix, keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + self.language_model.load_state_dict( + state_dict[self._language_model_key], strict=strict) + if self.post_process: + self.lm_head.load_state_dict( + state_dict[self._lm_head_key], strict=strict) + if self.post_process and self.add_binary_head: + self.binary_head.load_state_dict( + state_dict[self._binary_head_key], strict=strict) + # Load word_embeddings. + if self.post_process and not self.pre_process: + self.word_embeddings.load_state_dict( + state_dict[self._word_embeddings_for_head_key], strict=strict) diff --git a/megatron/legacy/model/biencoder_model.py b/megatron/legacy/model/biencoder_model.py new file mode 100644 index 0000000..8983cb5 --- /dev/null +++ b/megatron/legacy/model/biencoder_model.py @@ -0,0 +1,328 @@ +import os +import torch +import sys + +from megatron.training import get_args, print_rank_0, get_tokenizer +from megatron.core import mpu +from megatron.training.checkpointing import fix_query_key_value_ordering +from megatron.training.checkpointing import get_checkpoint_tracker_filename +from megatron.training.checkpointing import get_checkpoint_name +from megatron.legacy.model.bert_model import bert_position_ids +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.utils import scaled_init_method_normal +from .module import MegatronModule + +def get_model_provider(only_query_model=False, only_context_model=False, + biencoder_shared_query_context_model=False): + + def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building Bienoder model ...') + model = biencoder_model_provider(only_query_model=only_query_model, + only_context_model = only_context_model, + biencoder_shared_query_context_model = \ + biencoder_shared_query_context_model, + pre_process=pre_process, post_process=post_process) + + return model + + return model_provider + + +def biencoder_model_provider(only_query_model=False, + only_context_model=False, + biencoder_shared_query_context_model=False, + pre_process=True, + post_process=True): + """Build the model.""" + + assert mpu.get_tensor_model_parallel_world_size() == 1 and \ + mpu.get_pipeline_model_parallel_world_size() == 1, \ + "Model parallel size > 1 not supported for ICT" + + print_rank_0('building BiEncoderModel...') + + # simpler to just keep using 2 tokentypes since + # the LM we initialize with has 2 tokentypes + model = BiEncoderModel( + num_tokentypes=2, + parallel_output=False, + only_query_model=only_query_model, + only_context_model=only_context_model, + biencoder_shared_query_context_model=\ + biencoder_shared_query_context_model, + pre_process=pre_process, + post_process=post_process) + + return model + + +class BiEncoderModel(MegatronModule): + """Bert-based module for Biencoder model.""" + + def __init__(self, + num_tokentypes=1, + parallel_output=True, + only_query_model=False, + only_context_model=False, + biencoder_shared_query_context_model=False, + pre_process=True, + post_process=True): + super(BiEncoderModel, self).__init__() + args = get_args() + + bert_kwargs = dict( + num_tokentypes=num_tokentypes, + parallel_output=parallel_output, + pre_process=pre_process, + post_process=post_process) + + self.biencoder_shared_query_context_model = \ + biencoder_shared_query_context_model + assert not (only_context_model and only_query_model) + self.use_context_model = not only_query_model + self.use_query_model = not only_context_model + self.biencoder_projection_dim = args.biencoder_projection_dim + + if self.biencoder_shared_query_context_model: + self.model = PretrainedBertModel(**bert_kwargs) + self._model_key = 'shared_model' + self.query_model, self.context_model = self.model, self.model + else: + if self.use_query_model: + # this model embeds (pseudo-)queries - Embed_input in the paper + self.query_model = PretrainedBertModel(**bert_kwargs) + self._query_key = 'query_model' + + if self.use_context_model: + # this model embeds evidence blocks - Embed_doc in the paper + self.context_model = PretrainedBertModel(**bert_kwargs) + self._context_key = 'context_model' + + def set_input_tensor(self, input_tensor): + """See megatron.legacy.model.transformer.set_input_tensor()""" + # this is just a placeholder and will be needed when model + # parallelism will be used + # self.language_model.set_input_tensor(input_tensor) + return + + def forward(self, query_tokens, query_attention_mask, query_types, + context_tokens, context_attention_mask, context_types): + """Run a forward pass for each of the models and + return the respective embeddings.""" + + if self.use_query_model: + query_logits = self.embed_text(self.query_model, + query_tokens, + query_attention_mask, + query_types) + else: + raise ValueError("Cannot embed query without the query model.") + if self.use_context_model: + context_logits = self.embed_text(self.context_model, + context_tokens, + context_attention_mask, + context_types) + else: + raise ValueError("Cannot embed block without the block model.") + return query_logits, context_logits + + @staticmethod + def embed_text(model, tokens, attention_mask, token_types): + """Embed a batch of tokens using the model""" + logits = model(tokens, + attention_mask, + token_types) + return logits + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """Save dict with state dicts of each of the models.""" + state_dict_ = {} + if self.biencoder_shared_query_context_model: + state_dict_[self._model_key] = \ + self.model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + else: + if self.use_query_model: + state_dict_[self._query_key] = \ + self.query_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + + if self.use_context_model: + state_dict_[self._context_key] = \ + self.context_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Load the state dicts of each of the models""" + if self.biencoder_shared_query_context_model: + print_rank_0("Loading shared query-context model") + self.model.load_state_dict(state_dict[self._model_key], \ + strict=strict) + else: + if self.use_query_model: + print_rank_0("Loading query model") + self.query_model.load_state_dict( \ + state_dict[self._query_key], strict=strict) + + if self.use_context_model: + print_rank_0("Loading context model") + self.context_model.load_state_dict( \ + state_dict[self._context_key], strict=strict) + + def init_state_dict_from_bert(self): + """Initialize the state from a pretrained BERT model + on iteration zero of ICT pretraining""" + args = get_args() + + if args.bert_load is None: + print_rank_0("bert-load argument is None") + return + + tracker_filename = get_checkpoint_tracker_filename(args.bert_load) + if not os.path.isfile(tracker_filename): + raise FileNotFoundError("Could not find BERT checkpoint") + with open(tracker_filename, 'r') as f: + iteration = int(f.read().strip()) + assert iteration > 0 + + checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False) + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading BERT checkpoint {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + + # Load the checkpoint. + try: + state_dict = torch.load(checkpoint_name, map_location='cpu') + except ModuleNotFoundError: + from megatron.legacy.fp16_deprecated import loss_scaler + # For backward compatibility. + print_rank_0(' > deserializing using the old code structure ...') + sys.modules['fp16.loss_scaler'] = sys.modules[ + 'megatron.fp16_deprecated.loss_scaler'] + sys.modules['megatron.fp16.loss_scaler'] = sys.modules[ + 'megatron.fp16_deprecated.loss_scaler'] + state_dict = torch.load(checkpoint_name, map_location='cpu') + sys.modules.pop('fp16.loss_scaler', None) + sys.modules.pop('megatron.fp16.loss_scaler', None) + except BaseException: + print_rank_0('could not load the BERT checkpoint') + sys.exit() + + checkpoint_version = state_dict.get('checkpoint_version', 0) + + # load the LM state dict into each model + model_dict = state_dict['model']['language_model'] + + if self.biencoder_shared_query_context_model: + self.model.language_model.load_state_dict(model_dict) + fix_query_key_value_ordering(self.model, checkpoint_version) + else: + if self.use_query_model: + self.query_model.language_model.load_state_dict(model_dict) + # give each model the same ict_head to begin with as well + if self.biencoder_projection_dim > 0: + query_proj_state_dict = \ + self.state_dict_for_save_checkpoint()\ + [self._query_key]['projection_enc'] + fix_query_key_value_ordering(self.query_model, checkpoint_version) + + if self.use_context_model: + self.context_model.language_model.load_state_dict(model_dict) + if self.query_model is not None and \ + self.biencoder_projection_dim > 0: + self.context_model.projection_enc.load_state_dict\ + (query_proj_state_dict) + fix_query_key_value_ordering(self.context_model, checkpoint_version) + + +class PretrainedBertModel(MegatronModule): + """BERT-based encoder for queries or contexts used for + learned information retrieval.""" + + def __init__(self, num_tokentypes=2, + parallel_output=True, pre_process=True, post_process=True): + super(PretrainedBertModel, self).__init__() + + args = get_args() + tokenizer = get_tokenizer() + self.pad_id = tokenizer.pad + self.biencoder_projection_dim = args.biencoder_projection_dim + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + init_method = init_method_normal(args.init_method_std) + scaled_init_method = scaled_init_method_normal( + args.init_method_std, args.num_layers) + + self.language_model, self._language_model_key = get_language_model( + num_tokentypes=num_tokentypes, + add_pooler=False, + encoder_attn_mask_type=AttnMaskType.padding, + init_method=init_method, + scaled_init_method=scaled_init_method, + pre_process=self.pre_process, + post_process=self.post_process) + + if args.biencoder_projection_dim > 0: + self.projection_enc = get_linear_layer(args.hidden_size, + args.biencoder_projection_dim, + init_method) + self._projection_enc_key = 'projection_enc' + + def forward(self, input_ids, attention_mask, tokentype_ids=None): + extended_attention_mask = attention_mask.unsqueeze(1) + #extended_attention_mask = bert_extended_attention_mask(attention_mask) + position_ids = bert_position_ids(input_ids) + + lm_output = self.language_model(input_ids, + position_ids, + extended_attention_mask, + tokentype_ids=tokentype_ids) + # This mask will be used in average-pooling and max-pooling + pool_mask = (input_ids == self.pad_id).unsqueeze(2) + + # Taking the representation of the [CLS] token of BERT + pooled_output = lm_output[0, :, :] + + # Converting to float16 dtype + pooled_output = pooled_output.to(lm_output.dtype) + + # Output. + if self.biencoder_projection_dim: + pooled_output = self.projection_enc(pooled_output) + + return pooled_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + + if self.biencoder_projection_dim > 0: + state_dict_[self._projection_enc_key] = \ + self.projection_enc.state_dict(prefix=prefix, + keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + print_rank_0("loading pretrained weights") + self.language_model.load_state_dict( + state_dict[self._language_model_key], strict=strict) + + if self.biencoder_projection_dim > 0: + print_rank_0("loading projection head weights") + self.projection_enc.load_state_dict( + state_dict[self._projection_enc_key], strict=strict) diff --git a/megatron/legacy/model/classification.py b/megatron/legacy/model/classification.py new file mode 100644 index 0000000..c9fe165 --- /dev/null +++ b/megatron/legacy/model/classification.py @@ -0,0 +1,101 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Classification model.""" + +import torch + +from megatron.training import get_args, print_rank_last +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.bert_model import bert_extended_attention_mask, bert_position_ids +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.utils import scaled_init_method_normal +from .module import MegatronModule + + +class Classification(MegatronModule): + + def __init__(self, + config, + num_classes, + num_tokentypes=2, + pre_process=True, + post_process=True): + super().__init__(config=config, share_embeddings_and_output_weights=False) + args = get_args() + + self.num_classes = num_classes + self.pre_process = pre_process + self.post_process = post_process + + self.language_model, self._language_model_key = get_language_model( + config=config, + num_tokentypes=num_tokentypes, + add_pooler=True, + encoder_attn_mask_type=AttnMaskType.padding, + pre_process=self.pre_process, + post_process=self.post_process) + + # Multi-choice head. + if self.post_process: + self.classification_dropout = torch.nn.Dropout(args.hidden_dropout) + self.classification_head = get_linear_layer(args.hidden_size, + self.num_classes, + config.init_method) + self._classification_head_key = 'classification_head' + + def set_input_tensor(self, input_tensor): + """See megatron.legacy.model.transformer.set_input_tensor()""" + self.language_model.set_input_tensor(input_tensor) + + def forward(self, model_input, attention_mask, tokentype_ids=None): + + extended_attention_mask = bert_extended_attention_mask(attention_mask) + input_ids = model_input + position_ids = bert_position_ids(input_ids) + + lm_output = self.language_model( + input_ids, + position_ids, + extended_attention_mask, + tokentype_ids=tokentype_ids + ) + + if self.post_process: + _, pooled_output = lm_output + classification_output = self.classification_dropout(pooled_output) + classification_logits = self.classification_head(classification_output) + + # Reshape back to separate choices. + classification_logits = classification_logits.view(-1, self.num_classes) + + return classification_logits + return lm_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process: + state_dict_[self._classification_head_key] \ + = self.classification_head.state_dict(prefix=prefix, keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + self.language_model.load_state_dict( + state_dict[self._language_model_key], strict=strict) + if self.post_process: + if self._classification_head_key in state_dict: + self.classification_head.load_state_dict( + state_dict[self._classification_head_key], strict=strict) + else: + print_rank_last('***WARNING*** could not find {} in the checkpoint, ' + 'initializing to random'.format( + self._classification_head_key)) diff --git a/megatron/legacy/model/enums.py b/megatron/legacy/model/enums.py new file mode 100644 index 0000000..bc4e4aa --- /dev/null +++ b/megatron/legacy/model/enums.py @@ -0,0 +1,21 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import enum + +class LayerType(enum.Enum): + encoder = 1 + decoder = 2 + retro_encoder = 3 + retro_decoder = 4 + retro_decoder_with_retriever = 5 + +class AttnType(enum.Enum): + self_attn = 1 + cross_attn = 2 + +class AttnMaskType(enum.Enum): + padding = 1 + causal = 2 + +# For backward compatibility with old model checkpoints +from megatron.core.enums import ModelType diff --git a/megatron/legacy/model/fused_bias_gelu.py b/megatron/legacy/model/fused_bias_gelu.py new file mode 100644 index 0000000..e00e631 --- /dev/null +++ b/megatron/legacy/model/fused_bias_gelu.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch +from megatron.core.jit import jit_fuser + + +###### BIAS GELU FUSION/ NO AUTOGRAD ################ +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) + +@jit_fuser +def bias_gelu(bias, y): + x = bias + y + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@jit_fuser +def bias_gelu_back(g, bias, y): + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff*g + +class GeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(bias, input) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_back(grad_output, bias, input) + return tmp, tmp + +bias_gelu_impl = GeLUFunction.apply diff --git a/megatron/legacy/model/fused_layer_norm.py b/megatron/legacy/model/fused_layer_norm.py new file mode 100644 index 0000000..fcec35a --- /dev/null +++ b/megatron/legacy/model/fused_layer_norm.py @@ -0,0 +1,99 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""This code is copied fron NVIDIA apex: + https://github.com/NVIDIA/apex + with some changes. """ + +import inspect +import numbers +import torch +from torch.nn.parameter import Parameter +from torch.nn import init +import importlib + +from megatron.core.utils import make_viewless_tensor + +try: + from apex.contrib.layer_norm.layer_norm import FastLayerNormFN + HAVE_PERSIST_LAYER_NORM = True +except: + HAVE_PERSIST_LAYER_NORM = False + +try: + from apex.normalization.fused_layer_norm import fused_layer_norm_affine +except: + fused_layer_norm_affine = None + +global fused_layer_norm_cuda +fused_layer_norm_cuda = None + + +class MixedFusedLayerNorm(torch.nn.Module): + + def __init__(self, normalized_shape, eps=1e-5, + no_persist_layer_norm=True, + sequence_parallel=False, + apply_layernorm_1p=False): + super(MixedFusedLayerNorm, self).__init__() + + self.apply_layernorm_1p = apply_layernorm_1p + + global fused_layer_norm_cuda + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + + # List of hiddens sizes supported in the persistent layer norm kernel + # If the hidden size is not supported, fall back to the non-persistent + # kernel. + persist_ln_hidden_sizes = [1024, 1536, 2048, 2304, 3072, 3840, 4096, + 5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480, + 24576, 25600, 30720, 32768, 40960, 49152, 65536] + if normalized_shape not in persist_ln_hidden_sizes or \ + not HAVE_PERSIST_LAYER_NORM: + no_persist_layer_norm = True + + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = (normalized_shape,) + self.normalized_shape = torch.Size(normalized_shape) + self.eps = eps + self.weight = Parameter(torch.Tensor(*normalized_shape)) + self.bias = Parameter(torch.Tensor(*normalized_shape)) + self.reset_parameters() + self.no_persist_layer_norm = no_persist_layer_norm + self.sequence_parallel = sequence_parallel + + # set sequence parallelism flag on weight and bias parameters + setattr(self.weight, 'sequence_parallel', self.sequence_parallel) + setattr(self.bias, 'sequence_parallel', self.sequence_parallel) + + + def reset_parameters(self): + + if self.apply_layernorm_1p: + init.zeros_(self.weight) + init.zeros_(self.bias) + else: + init.ones_(self.weight) + init.zeros_(self.bias) + + def forward(self, input): + + weight = self.weight + 1 if self.apply_layernorm_1p else self.weight + + if self.no_persist_layer_norm: + assert fused_layer_norm_affine is not None, \ + "fused_layer_norm_affine is not available, please install apex from https://github.com/NVIDIA/apex" + return fused_layer_norm_affine(input, weight, self.bias, self.normalized_shape, eps=self.eps) + else: + if 'memory_efficient' in inspect.getfullargspec(FastLayerNormFN.forward).args: + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps, False) + else: + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) + # Apex's fast layer norm function outputs a 'view' tensor (i.e., has + # a populated '_base' field). This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. + output = make_viewless_tensor(inp = output, + requires_grad = input.requires_grad, + keep_graph = True) + + return output diff --git a/megatron/legacy/model/fused_softmax.py b/megatron/legacy/model/fused_softmax.py new file mode 100644 index 0000000..58f900b --- /dev/null +++ b/megatron/legacy/model/fused_softmax.py @@ -0,0 +1,234 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + + +import torch +import torch.nn as nn +from megatron.legacy.model.enums import AttnMaskType + + +class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply upper triangular mask (typically used in gpt models). + 3. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + try: + import scaled_upper_triang_masked_softmax_cuda + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + scale_t = torch.tensor([scale]) + softmax_results = scaled_upper_triang_masked_softmax_cuda.forward( + inputs, scale_t[0] + ) + + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + try: + import scaled_upper_triang_masked_softmax_cuda + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + softmax_results, scale_t = ctx.saved_tensors + input_grads = scaled_upper_triang_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + + return input_grads, None + + +class ScaledMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply the mask. + 3. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, mask, scale): + try: + import scaled_masked_softmax_cuda + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + try: + import scaled_masked_softmax_cuda + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None, None + + +class ScaledSoftmax(torch.autograd.Function): + """ + Fused operation which performs following two operations in sequence + 1. Scale the tensor. + 2. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + try: + import scaled_softmax_cuda + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_softmax_cuda.forward( + inputs, scale_t[0] + ) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + try: + import scaled_softmax_cudaa + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None, None + + +class FusedScaleMaskSoftmax(nn.Module): + """ + fused operation: scaling + mask + softmax + + Args: + input_in_fp16: flag to indicate if input in fp16 data format. + input_in_bf16: flag to indicate if input in bf16 data format. + attn_mask_type: attention mask type (pad or causal) + scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion + mask_func: mask function to be applied. + softmax_in_fp32: if true, softmax in performed at fp32 precision. + scale: scaling factor used in input tensor scaling. + """ + + def __init__( + self, + input_in_fp16, + input_in_bf16, + attn_mask_type, + scaled_masked_softmax_fusion, + mask_func, + softmax_in_fp32, + scale, + ): + super(FusedScaleMaskSoftmax, self).__init__() + self.input_in_fp16 = input_in_fp16 + self.input_in_bf16 = input_in_bf16 + assert not ( + self.input_in_fp16 and self.input_in_bf16 + ), "both fp16 and bf16 flags cannot be active at the same time." + self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16 + self.attn_mask_type = attn_mask_type + self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion + self.mask_func = mask_func + self.softmax_in_fp32 = softmax_in_fp32 + self.scale = scale + + assert ( + self.scale is None or softmax_in_fp32 + ), "softmax should be in fp32 when scaled" + + def forward(self, input, mask): + # [b, np, sq, sk] + assert input.dim() == 4 + + if self.is_kernel_available(mask, *input.size()): + return self.forward_fused_softmax(input, mask) + else: + return self.forward_torch_softmax(input, mask) + + def is_kernel_available(self, mask, b, np, sq, sk): + attn_batches = b * np + + if ( + self.scaled_masked_softmax_fusion # user want to fuse + and self.input_in_float16 # input must be fp16 + and 16 < sk <= 16384 # sk must be 16 ~ 16384 + and sq % 4 == 0 # sq must be divisor of 4 + and sk % 4 == 0 # sk must be divisor of 4 + and attn_batches % 4 == 0 # np * b must be divisor of 4 + ): + if 0 <= sk <= 16384: + batch_per_block = self.get_batch_per_block(sq, sk, b, np) + + if self.attn_mask_type == AttnMaskType.causal: + if attn_batches % batch_per_block == 0: + return True + else: + if sq % batch_per_block == 0: + return True + return False + + def forward_fused_softmax(self, input, mask): + b, np, sq, sk = input.size() + scale = self.scale if self.scale is not None else 1.0 + + if self.attn_mask_type == AttnMaskType.causal: + assert sq == sk, "causal mask is only for self attention" + + # input is 3D tensor (attn_batches, sq, sk) + input = input.view(-1, sq, sk) + probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale) + return probs.view(b, np, sq, sk) + else: + # input is 4D tensor (b, np, sq, sk) + if mask is not None: + return ScaledMaskedSoftmax.apply(input, mask, scale) + else: + return ScaledSoftmax.apply(input, scale) + + def forward_torch_softmax(self, input, mask): + if self.input_in_float16 and self.softmax_in_fp32: + input = input.float() + + if self.scale is not None: + input = input * self.scale + mask_output = self.mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + + if self.input_in_float16 and self.softmax_in_fp32: + if self.input_in_fp16: + probs = probs.half() + else: + probs = probs.bfloat16() + + return probs + + @staticmethod + def get_batch_per_block(sq, sk, b, np): + try: + import scaled_masked_softmax_cuda + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np) diff --git a/megatron/legacy/model/gpt_model.py b/megatron/legacy/model/gpt_model.py new file mode 100644 index 0000000..8e38019 --- /dev/null +++ b/megatron/legacy/model/gpt_model.py @@ -0,0 +1,122 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""GPT-2 model.""" + +import torch + +from megatron.training import get_args +from megatron.core import tensor_parallel +from .module import MegatronModule + +from .enums import AttnMaskType +from .language_model import parallel_lm_logits +from .language_model import get_language_model + + +def post_language_model_processing(lm_output, labels, logit_weights, + parallel_output, + fp16_lm_cross_entropy): + + # Output. Format [s b h] + output = parallel_lm_logits( + lm_output, + logit_weights, + parallel_output) + + if labels is None: + # [s b h] => [b s h] + return output.transpose(0,1).contiguous() + else: + # [b s] => [s b] + labels = labels.transpose(0,1).contiguous() + if fp16_lm_cross_entropy: + assert output.dtype == torch.half + loss = tensor_parallel.vocab_parallel_cross_entropy(output, labels) + else: + loss = tensor_parallel.vocab_parallel_cross_entropy(output.float(), labels) + + # [s b] => [b, s] + loss = loss.transpose(0,1).contiguous() + return loss + + +class GPTModel(MegatronModule): + """GPT-2 Language model.""" + + def __init__(self, + config, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True): + args = get_args() + super().__init__(config=config, share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights) + + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy + self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights + + self.language_model, self._language_model_key = get_language_model( + config=config, + num_tokentypes=num_tokentypes, + add_pooler=False, + encoder_attn_mask_type=AttnMaskType.causal, + pre_process=self.pre_process, + post_process=self.post_process) + + if not args.untie_embeddings_and_output_weights: + self.initialize_word_embeddings() + + def set_input_tensor(self, input_tensor): + """See megatron.legacy.model.transformer.set_input_tensor()""" + self.language_model.set_input_tensor(input_tensor) + + def forward(self, input_ids, position_ids, attention_mask, + retriever_input_ids=None, + retriever_position_ids=None, + retriever_attn_mask=None, + labels=None, tokentype_ids=None, inference_params=None): + + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + retriever_input_ids=retriever_input_ids, + retriever_position_ids=retriever_position_ids, + retriever_attn_mask=retriever_attn_mask, + inference_params=inference_params) + + if self.post_process: + return post_language_model_processing( + lm_output, labels, + self.language_model.output_layer.weight if self.untie_embeddings_and_output_weights else self.shared_embedding_or_output_weight(), + self.parallel_output, + self.fp16_lm_cross_entropy) + else: + return lm_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + # Save word_embeddings. + if self.post_process and not self.pre_process and not self.untie_embeddings_and_output_weights: + state_dict_[self._word_embeddings_for_head_key] \ + = self.word_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Load word_embeddings. + if self.post_process and not self.pre_process and not self.untie_embeddings_and_output_weights: + self.word_embeddings.load_state_dict( + state_dict[self._word_embeddings_for_head_key], strict=strict) + if self._language_model_key in state_dict: + state_dict = state_dict[self._language_model_key] + self.language_model.load_state_dict(state_dict, strict=strict) diff --git a/megatron/legacy/model/language_model.py b/megatron/legacy/model/language_model.py new file mode 100644 index 0000000..1beb5f9 --- /dev/null +++ b/megatron/legacy/model/language_model.py @@ -0,0 +1,627 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Transformer based language model.""" + +import torch +import torch.nn.functional as F + +from megatron.training import get_args +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding + +from .enums import AttnMaskType, LayerType +from .module import MegatronModule +from .transformer import ParallelTransformer +from .utils import get_linear_layer +from .utils import init_method_normal, scaled_init_method_normal + + +def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, + bias=None): + """LM logits using word embedding weights.""" + args = get_args() + # Parallel logits. + model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 + if model_parallel or args.sequence_parallel: + input_parallel = input_ + allreduce_dgrad = model_parallel and not args.sequence_parallel + else: + input_parallel = tensor_parallel.copy_to_tensor_model_parallel_region(input_) + allreduce_dgrad = False + + # Matrix multiply. + logits_parallel = tensor_parallel.linear_with_grad_accumulation_and_async_allreduce( + input=input_parallel, + weight=word_embeddings_weight, + bias=bias, + gradient_accumulation_fusion=args.gradient_accumulation_fusion, + async_grad_allreduce=allreduce_dgrad, + sequence_parallel=args.sequence_parallel, + grad_output_buffer=None, + allreduce_dgrad=allreduce_dgrad, + ) + # Gather if needed. + + if parallel_output: + return logits_parallel + + return tensor_parallel.gather_from_tensor_model_parallel_region(logits_parallel) + + +def get_language_model(config, num_tokentypes, add_pooler, + encoder_attn_mask_type, + add_encoder=True, + add_decoder=False, + decoder_attn_mask_type=AttnMaskType.causal, + pre_process=True, post_process=True): + """Build language model and return along with the key to save.""" + args = get_args() + if config.init_method is None: + config.init_method = init_method_normal(config.init_method_std) + + if config.output_layer_init_method is None: + config.output_layer_init_method = scaled_init_method_normal(config.init_method_std, + config.num_layers) + + # Language model. + language_model = TransformerLanguageModel( + config, + encoder_attn_mask_type, + num_tokentypes=num_tokentypes, + add_encoder=add_encoder, + add_decoder=add_decoder, + decoder_attn_mask_type=decoder_attn_mask_type, + add_pooler=add_pooler, + pre_process=pre_process, + post_process=post_process + ) + # key used for checkpoints. + language_model_key = 'language_model' + + return language_model, language_model_key + + +class Pooler(MegatronModule): + """Pooler layer. + + Pool hidden states of a specific token (for example start of the + sequence) and add a linear transformation followed by a tanh. + + Args: + hidden_size: hidden size + init_method: weight initialization method for the linear layer. + bias is set to zero. + """ + + def __init__(self, hidden_size, init_method): + super(Pooler, self).__init__() + args = get_args() + self.dense = get_linear_layer(hidden_size, hidden_size, init_method) + self.sequence_parallel = args.sequence_parallel + + + def forward(self, hidden_states, sequence_index=0): + # hidden_states: [s, b, h] + # sequence_index: index of the token to pool. + + # gather data along sequence dimensions + # same pooler is run on all tensor parallel nodes + if self.sequence_parallel: + hidden_states = tensor_parallel.gather_from_sequence_parallel_region( + hidden_states, + tensor_parallel_output_grad=False) + + pooled = hidden_states[sequence_index, :, :] + pooled = self.dense(pooled) + pooled = torch.tanh(pooled) + return pooled + + +class Embedding(MegatronModule): + """Language model embeddings. + + Args: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + init_method: weight initialization method + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, + hidden_size, + vocab_size, + max_sequence_length, + embedding_dropout_prob, + config, + num_tokentypes=0): + super(Embedding, self).__init__() + + self.hidden_size = hidden_size + self.init_method = config.init_method + self.num_tokentypes = num_tokentypes + + args = get_args() + + # Word embeddings (parallel). + self.params_dtype = args.params_dtype + self.word_embeddings = tensor_parallel.VocabParallelEmbedding( + vocab_size, self.hidden_size, config=config, init_method=config.init_method) + self._word_embeddings_key = 'word_embeddings' + + # Position embedding (serial). + self.add_position_embedding = args.position_embedding_type == 'learned_absolute' + if self.add_position_embedding: + self.position_embeddings = torch.nn.Embedding( + max_sequence_length, self.hidden_size) + self._position_embeddings_key = 'position_embeddings' + # Initialize the position embeddings. + if args.perform_initialization: + self.init_method(self.position_embeddings.weight) + + # Token type embedding. + # Add this as an optional field that can be added through + # method call so we can load a pretrain model without + # token types and add them as needed. + self._tokentype_embeddings_key = 'tokentype_embeddings' + if self.num_tokentypes > 0: + self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, + self.hidden_size) + # Initialize the token-type embeddings. + if args.perform_initialization: + self.init_method(self.tokentype_embeddings.weight) + else: + self.tokentype_embeddings = None + + self.fp32_residual_connection = args.fp32_residual_connection + self.sequence_parallel = args.sequence_parallel + self.clone_scatter_output_in_embedding = args.clone_scatter_output_in_embedding + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + + def zero_parameters(self): + """Zero out all parameters in embedding.""" + self.word_embeddings.weight.data.fill_(0) + self.word_embeddings.weight.shared = True + if self.add_position_embedding: + self.position_embeddings.weight.data.fill_(0) + self.position_embeddings.weight.shared = True + if self.num_tokentypes > 0: + self.tokentype_embeddings.weight.data.fill_(0) + self.tokentype_embeddings.weight.shared = True + + def add_tokentype_embeddings(self, num_tokentypes): + """Add token-type embedding. This function is provided so we can add + token-type embeddings in case the pretrained model does not have it. + This allows us to load the model normally and then add this embedding. + """ + if self.tokentype_embeddings is not None: + raise Exception('tokentype embeddings is already initialized') + if torch.distributed.get_rank() == 0: + print('adding embedding for {} tokentypes'.format(num_tokentypes), + flush=True) + self.num_tokentypes = num_tokentypes + self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, + self.hidden_size) + # Initialize the token-type embeddings. + args = get_args() + self.init_method(self.tokentype_embeddings.weight) + + def forward(self, input_ids, position_ids, tokentype_ids=None): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + if self.add_position_embedding: + position_embeddings = self.position_embeddings(position_ids) + embeddings = words_embeddings + position_embeddings + else: + embeddings = words_embeddings + + if tokentype_ids is not None: + assert self.tokentype_embeddings is not None + embeddings = embeddings + self.tokentype_embeddings(tokentype_ids) + else: + assert self.tokentype_embeddings is None + + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + + # If the input flag for fp32 residual connection is set, convert for float. + if self.fp32_residual_connection: + embeddings = embeddings.float() + + # Dropout. + if self.sequence_parallel: + embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings) + # `scatter_to_sequence_parallel_region` returns a view, which prevents + # the original tensor from being garbage collected. Clone to facilitate GC. + # Has a small runtime cost (~0.5%). + if self.clone_scatter_output_in_embedding: + embeddings = embeddings.clone() + with tensor_parallel.get_cuda_rng_tracker().fork(): + embeddings = self.embedding_dropout(embeddings) + else: + embeddings = self.embedding_dropout(embeddings) + + return embeddings + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load.""" + + state_dict_ = {} + state_dict_[self._word_embeddings_key] \ + = self.word_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + if self.add_position_embedding: + state_dict_[self._position_embeddings_key] \ + = self.position_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + if self.num_tokentypes > 0: + state_dict_[self._tokentype_embeddings_key] \ + = self.tokentype_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Word embedding. + if self._word_embeddings_key in state_dict: + state_dict_ = state_dict[self._word_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'word_embeddings' in key: + state_dict_[key.split('word_embeddings.')[1]] \ + = state_dict[key] + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + + # Position embedding. + if self.add_position_embedding: + if self._position_embeddings_key in state_dict: + state_dict_ = state_dict[self._position_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'position_embeddings' in key: + state_dict_[key.split('position_embeddings.')[1]] \ + = state_dict[key] + self.position_embeddings.load_state_dict(state_dict_, strict=strict) + + # Tokentype embedding. + if self.num_tokentypes > 0: + state_dict_ = {} + if self._tokentype_embeddings_key in state_dict: + state_dict_ = state_dict[self._tokentype_embeddings_key] + else: + # for backward compatibility. + for key in state_dict.keys(): + if 'tokentype_embeddings' in key: + state_dict_[key.split('tokentype_embeddings.')[1]] \ + = state_dict[key] + if len(state_dict_.keys()) > 0: + self.tokentype_embeddings.load_state_dict(state_dict_, + strict=strict) + else: + print('***WARNING*** expected tokentype embeddings in the ' + 'checkpoint but could not find it', flush=True) + + +class TransformerLanguageModel(MegatronModule): + """Transformer language model. + + Args: + transformer_hparams: transformer hyperparameters + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, + config, + encoder_attn_mask_type, + num_tokentypes=0, + add_encoder=True, + add_decoder=False, + decoder_attn_mask_type=AttnMaskType.causal, + add_pooler=False, + pre_process=True, + post_process=True): + args = get_args() + # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5. + if args.untie_embeddings_and_output_weights: assert not add_decoder + super(TransformerLanguageModel, self).__init__(share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights) + + self.pre_process = pre_process + self.post_process = post_process + self.hidden_size = config.hidden_size + self.num_tokentypes = num_tokentypes + self.init_method = config.init_method + self.add_encoder = add_encoder + self.encoder_attn_mask_type = encoder_attn_mask_type + self.add_decoder = add_decoder + self.decoder_attn_mask_type = decoder_attn_mask_type + self.add_pooler = add_pooler + self.encoder_hidden_state = None + self.add_retriever = args.retro_add_retriever + self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights + + # Embeddings. + if self.pre_process: + self.embedding = Embedding(self.hidden_size, + args.padded_vocab_size, + args.max_position_embeddings, + args.hidden_dropout, + config, + self.num_tokentypes) + self._embedding_key = 'embedding' + + # Rotary positional embeddings + self.use_rotary_position_embeddings = \ + args.position_embedding_type == 'rope' + if self.use_rotary_position_embeddings: + self.seq_length = args.seq_length + rotary_dim = args.hidden_size // args.num_attention_heads \ + if args.kv_channels is None else args.kv_channels + + # partial rotary embeddings, which is better than full rotary + # Wang and Komatsuzaki et al + # https://github.com/kingoflolz/mesh-transformer-jax/ + self.rotary_pos_emb = RotaryEmbedding( + kv_channels=rotary_dim, + rotary_percent=args.rotary_percent, + seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor, + ) + + # Encoder (usually set to True, False if part of an encoder-decoder + # architecture and in encoder-only stage). + if self.add_encoder: + self.encoder = ParallelTransformer( + config, + model_type=args.model_type if not args.retro_add_retriever \ + else ModelType.retro_decoder, + self_attn_mask_type=self.encoder_attn_mask_type, + pre_process=self.pre_process, + post_process=self.post_process, + ) + self._encoder_key = 'encoder' + else: + self.encoder = None + + # Decoder (usually set to False, True if part of an encoder-decoder + # architecture and in decoder-only stage). + if self.add_decoder: + self.decoder = ParallelTransformer( + config, + model_type=args.model_type, + layer_type=LayerType.decoder, + self_attn_mask_type=self.decoder_attn_mask_type, + pre_process=self.pre_process, + post_process=self.post_process) + self._decoder_key = 'decoder' + else: + self.decoder = None + + if self.post_process: + # Pooler. + if self.add_pooler: + self.pooler = Pooler(self.hidden_size, self.init_method) + self._pooler_key = 'pooler' + + if self.untie_embeddings_and_output_weights: + self.output_layer = tensor_parallel.ColumnParallelLinear( + args.hidden_size, + args.padded_vocab_size, + config=config, + init_method=self.init_method, + bias=False) # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias. + self._output_layer_key = 'output_layer' + + def set_input_tensor(self, input_tensor): + """ See megatron.legacy.model.transformer.set_input_tensor()""" + + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + if self.add_encoder and self.add_decoder: + assert len(input_tensor) == 1, \ + 'input_tensor should only be length 1 for stage with both encoder and decoder' + self.encoder.set_input_tensor(input_tensor[0]) + elif self.add_encoder: + assert len(input_tensor) == 1, \ + 'input_tensor should only be length 1 for stage with only encoder' + self.encoder.set_input_tensor(input_tensor[0]) + elif self.add_decoder: + if len(input_tensor) == 2: + self.decoder.set_input_tensor(input_tensor[0]) + self.encoder_hidden_state = input_tensor[1] + elif len(input_tensor) == 1: + self.decoder.set_input_tensor(None) + self.encoder_hidden_state = input_tensor[0] + else: + raise Exception('input_tensor must have either length 1 or 2') + else: + raise Exception('Stage must have at least either encoder or decoder') + + def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, + dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None, + retriever_input_ids=None, + retriever_position_ids=None, + retriever_attn_mask=None, + enc_dec_attn_mask=None, tokentype_ids=None, + inference_params=None, + pooling_sequence_index=0, + enc_hidden_states=None, output_enc_hidden=False): + + # Encoder embedding. + if self.pre_process: + encoder_input = self.embedding(enc_input_ids, enc_position_ids, + tokentype_ids=tokentype_ids) + else: + encoder_input = None + + # Retriever embedding. + if self.add_retriever and self.pre_process: + retriever_input = self.embedding(retriever_input_ids, + retriever_position_ids, + tokentype_ids=tokentype_ids) + else: + retriever_input = None + + # Rotary positional embeddings + rotary_pos_emb = None + if self.use_rotary_position_embeddings: + if inference_params is not None: + rotary_pos_emb = \ + self.rotary_pos_emb(inference_params.max_sequence_length) + else: + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + + # Run encoder. + if enc_hidden_states is None: + if self.encoder is not None: + encoder_output = self.encoder( + encoder_input, + enc_attn_mask, + retriever_input=retriever_input, + retriever_attn_mask=retriever_attn_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb) + else: + encoder_output = self.encoder_hidden_state + else: + encoder_output = enc_hidden_states.to(encoder_input.dtype) + + if self.post_process: + if self.add_pooler: + pooled_output = self.pooler(encoder_output, + pooling_sequence_index) + + # output_enc_hidden refers to when we just need the encoder's + # output. For example, it is helpful to compute + # similarity between two sequences by average pooling + if not self.add_decoder or output_enc_hidden: + if self.add_pooler and self.post_process: + return encoder_output, pooled_output + else: + return encoder_output + + # Decoder embedding. + if self.pre_process: + decoder_input = self.embedding(dec_input_ids, + dec_position_ids) + else: + decoder_input = None + + # Run decoder. + decoder_output = self.decoder( + decoder_input, + dec_attn_mask, + encoder_output=encoder_output, + enc_dec_attn_mask=enc_dec_attn_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb) + + if self.add_pooler and self.post_process: + return decoder_output, encoder_output, pooled_output + else: + return decoder_output, encoder_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load.""" + + state_dict_ = {} + if self.pre_process: + state_dict_[self._embedding_key] \ + = self.embedding.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.add_encoder: + state_dict_[self._encoder_key] \ + = self.encoder.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process: + if self.add_pooler: + state_dict_[self._pooler_key] \ + = self.pooler.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.untie_embeddings_and_output_weights: + state_dict_[self._output_layer_key] \ + = self.output_layer.state_dict(prefix=prefix, keep_vars=keep_vars) + + if self.add_decoder: + state_dict_[self._decoder_key] \ + = self.decoder.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Embedding. + if self.pre_process: + if self._embedding_key in state_dict: + state_dict_ = state_dict[self._embedding_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if '_embeddings' in key: + state_dict_[key] = state_dict[key] + self.embedding.load_state_dict(state_dict_, strict=strict) + + # Encoder. + if self.add_encoder: + if self._encoder_key in state_dict: + state_dict_ = state_dict[self._encoder_key] + # For backward compatibility. + elif 'transformer' in state_dict: + state_dict_ = state_dict['transformer'] + else: + # For backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'transformer.' in key: + state_dict_[key.split('transformer.')[1]] = state_dict[key] + + # For backward compatibility. + state_dict_self_attention = {} + for key in state_dict_.keys(): + if '.attention.' in key: + state_dict_self_attention[key.replace(".attention.", + ".self_attention.")] = state_dict_[key] + else: + state_dict_self_attention[key] = state_dict_[key] + state_dict_ = state_dict_self_attention + + self.encoder.load_state_dict(state_dict_, strict=strict) + + # Pooler. + if self.post_process: + if self.add_pooler: + assert 'pooler' in state_dict, \ + 'could not find data for pooler in the checkpoint' + self.pooler.load_state_dict(state_dict[self._pooler_key], + strict=strict) + if self.untie_embeddings_and_output_weights: + assert 'output_layer' in state_dict, \ + 'could not find data for output_layer in the checkpoint' + self.output_layer.load_state_dict(state_dict[self._output_layer_key], + strict=strict) + # Decoder. + if self.add_decoder: + assert 'decoder' in state_dict, \ + 'could not find data for pooler in the checkpoint' + self.decoder.load_state_dict(state_dict[self._decoder_key], + strict=strict) diff --git a/megatron/legacy/model/module.py b/megatron/legacy/model/module.py new file mode 100644 index 0000000..849fda7 --- /dev/null +++ b/megatron/legacy/model/module.py @@ -0,0 +1,206 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron Module""" + +import torch +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +from megatron.training import get_args +from megatron.core import mpu, tensor_parallel + + +_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) +_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor) + + + +def param_is_not_shared(param): + return not hasattr(param, 'shared') or not param.shared + + + +class MegatronModule(torch.nn.Module): + """Megatron specific extensions of torch Module with support + for pipelining.""" + + def __init__(self, config=None, share_embeddings_and_output_weights=True): + super(MegatronModule, self).__init__() + self.config = config + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """Use this function to override the state dict for + saving checkpoints.""" + return self.state_dict(prefix=prefix, keep_vars=keep_vars) + + + def shared_embedding_or_output_weight(self): + if self.pre_process: + return self.language_model.embedding.word_embeddings.weight + else: + if not self.share_embeddings_and_output_weights: + raise Exception('shared_embedding_or_output_weight() called for last ' + 'stage, but share_embeddings_and_output_weights is false') + return self.word_embeddings.weight + + + def initialize_word_embeddings(self): + args = get_args() + if not self.share_embeddings_and_output_weights: + raise Exception('initialize_word_embeddings() was called but ' + 'share_embeddings_and_output_weights is false') + + # This function just initializes the word embeddings in the final stage + # when we are using pipeline parallelism. Nothing to do if we aren't + # using pipeline parallelism. + if args.pipeline_model_parallel_size == 1: + # Zero out wgrad if sharing embeddings between two layers on same + # pipeline stage to make sure grad accumulation into main_grad is + # correct and does not include garbage values (e.g., from torch.empty). + self.shared_embedding_or_output_weight().zero_out_wgrad = True + return + + if mpu.is_pipeline_first_stage() and self.pre_process and not self.post_process: + self.shared_embedding_or_output_weight().shared_embedding = True + + # Parameters are shared between the word embeddings layers, and the + # heads at the end of the model. In a pipelined setup with more than + # one stage, the initial embedding layer and the head are on different + # workers, so we do the following: + # 1. Create a second copy of word_embeddings on the last stage, with + # initial parameters of 0.0. + # 2. Do an all-reduce between the first and last stage to ensure that + # the two copies of word_embeddings start off with the same + # parameter values. + # 3. In the training loop, before an all-reduce between the grads of + # the two word_embeddings layers to ensure that every applied weight + # update is the same on both stages. + if mpu.is_pipeline_last_stage() and not self.pre_process: + assert not mpu.is_pipeline_first_stage() + self._word_embeddings_for_head_key = 'word_embeddings_for_head' + # set word_embeddings weights to 0 here, then copy first + # stage's weights using all_reduce below. + self.word_embeddings = tensor_parallel.VocabParallelEmbedding( + args.padded_vocab_size, self.config.hidden_size, + config=self.config, init_method=self.config.init_method) + self.word_embeddings.weight.data.fill_(0) + self.word_embeddings.weight.shared = True + self.word_embeddings.weight.shared_embedding = True + + # Zero out initial weights for decoder embedding. + # NOTE: We don't currently support T5 with the interleaved schedule. + if not mpu.is_pipeline_first_stage(ignore_virtual=True) and \ + self.pre_process: + self.language_model.embedding.zero_parameters() + + if not torch.distributed.is_initialized(): + if not getattr(MegatronModule, "embedding_warning_printed", False): + print("WARNING! Distributed processes aren't initialized, so " + "word embeddings in the last layer are not initialized. " + "If you are just manipulating a model this is fine, but " + "this needs to be handled manually. If you are training " + "something is definitely wrong.") + MegatronModule.embedding_warning_printed = True + return + + # Ensure that first and last stages have the same initial parameter + # values. + if mpu.is_rank_in_embedding_group(): + self.shared_embedding_or_output_weight().data = self.shared_embedding_or_output_weight().data.cuda() + torch.distributed.all_reduce(self.shared_embedding_or_output_weight().data, + group=mpu.get_embedding_group()) + + # Ensure that encoder(first stage) and decoder(split stage) position + # embeddings have the same initial parameter values + # NOTE: We don't currently support T5 with the interleaved schedule. + if mpu.is_rank_in_position_embedding_group() and \ + args.pipeline_model_parallel_split_rank is not None: + # TODO: Support tokentype embedding. + self.language_model.embedding.cuda() + position_embeddings = self.language_model.embedding.position_embeddings + torch.distributed.all_reduce(position_embeddings.weight.data, + group=mpu.get_position_embedding_group()) + + +def conversion_helper(val, conversion): + """Apply conversion to val. Recursively apply conversion if `val` + #is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_float16(val, float16_convertor): + """Convert fp32 `val` to fp16/bf16""" + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, _FLOAT_TYPES): + val = float16_convertor(val) + return val + return conversion_helper(val, half_conversion) + + +def float16_to_fp32(val): + """Convert fp16/bf16 `val` to fp32""" + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)): + val = val.float() + return val + return conversion_helper(val, float_conversion) + + + +class Float16Module(MegatronModule): + + def __init__(self, module, args): + super(Float16Module, self).__init__() + + if args.fp16: + self.add_module('module', module.half()) + def float16_convertor(val): + return val.half() + elif args.bf16: + self.add_module('module', module.bfloat16()) + def float16_convertor(val): + return val.bfloat16() + else: + raise Exception('should not be here') + + self.float16_convertor = float16_convertor + + + def set_input_tensor(self, input_tensor): + return self.module.set_input_tensor(input_tensor) + + + def forward(self, *inputs, **kwargs): + if mpu.is_pipeline_first_stage(): + inputs = fp32_to_float16(inputs, self.float16_convertor) + outputs = self.module(*inputs, **kwargs) + if mpu.is_pipeline_last_stage(): + outputs = float16_to_fp32(outputs) + return outputs + + + def state_dict(self, prefix='', keep_vars=False): + return self.module.state_dict(prefix=prefix, keep_vars=keep_vars) + + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + return self.module.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) diff --git a/megatron/legacy/model/multiple_choice.py b/megatron/legacy/model/multiple_choice.py new file mode 100644 index 0000000..bec0548 --- /dev/null +++ b/megatron/legacy/model/multiple_choice.py @@ -0,0 +1,112 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Multiple choice model.""" + +import torch + +from megatron.training import get_args, print_rank_last +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.bert_model import bert_extended_attention_mask, bert_position_ids +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.utils import scaled_init_method_normal +from .module import MegatronModule + + +class MultipleChoice(MegatronModule): + + def __init__(self, + config, + num_tokentypes=2, + pre_process=True, + post_process=True): + super(MultipleChoice, self).__init__(share_embeddings_and_output_weights=False) + args = get_args() + + self.pre_process = pre_process + self.post_process = post_process + + self.language_model, self._language_model_key = get_language_model( + config=config, + num_tokentypes=num_tokentypes, + add_pooler=True, + encoder_attn_mask_type=AttnMaskType.padding, + pre_process=self.pre_process, + post_process=self.post_process) + + # Multi-choice head. + if self.post_process: + self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout) + self.multichoice_head = get_linear_layer(args.hidden_size, 1, + init_method) + self._multichoice_head_key = 'multichoice_head' + + def set_input_tensor(self, input_tensor): + """See megatron.legacy.model.transformer.set_input_tensor()""" + self.language_model.set_input_tensor(input_tensor) + + def forward(self, model_input, attention_mask, tokentype_ids=None): + + # [batch, choices, sequence] --> [batch * choices, sequence] --> + # transformer --> [batch, choices] --> softmax + + # Ensure the shape is [batch-size, choices, sequence] + assert len(attention_mask.shape) == 3 + num_choices = attention_mask.shape[1] + + # Reshape and treat choice dimension the same as batch. + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) + extended_attention_mask = bert_extended_attention_mask(attention_mask) + + input_ids = model_input + # Do the same as attention_mask for input_ids, tokentype_ids + assert len(input_ids.shape) == 3 + assert len(tokentype_ids.shape) == 3 + input_ids = input_ids.view(-1, input_ids.size(-1)) + tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1)) + position_ids = bert_position_ids(input_ids) + + lm_output = self.language_model( + input_ids, + position_ids, + extended_attention_mask, + tokentype_ids=tokentype_ids + ) + if self.post_process: + _, pooled_output = lm_output + multichoice_output = self.multichoice_dropout(pooled_output) + multichoice_logits = self.multichoice_head(multichoice_output) + + # Reshape back to separate choices. + multichoice_logits = multichoice_logits.view(-1, num_choices) + + return multichoice_logits + return lm_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process: + state_dict_[self._multichoice_head_key] \ + = self.multichoice_head.state_dict(prefix=prefix, keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + self.language_model.load_state_dict( + state_dict[self._language_model_key], strict=strict) + if self.post_process: + if self._multichoice_head_key in state_dict: + self.multichoice_head.load_state_dict( + state_dict[self._multichoice_head_key], strict=strict) + else: + print_rank_last('***WARNING*** could not find {} in the checkpoint, ' + 'initializing to random'.format( + self._multichoice_head_key)) diff --git a/megatron/legacy/model/realm_model.py b/megatron/legacy/model/realm_model.py new file mode 100644 index 0000000..5b2859a --- /dev/null +++ b/megatron/legacy/model/realm_model.py @@ -0,0 +1,204 @@ +import os +import torch + +from megatron.training import get_args, print_rank_0 +from megatron.training.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name +from megatron.legacy.model import BertModel +from .module import MegatronModule +from megatron.core import mpu +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import scaled_init_method_normal +from megatron.legacy.model.bert_model import bert_extended_attention_mask, bert_position_ids + + +def general_ict_model_provider(only_query_model=False, only_block_model=False): + """Build the model.""" + args = get_args() + assert args.ict_head_size is not None, \ + "Need to specify --ict-head-size to provide an ICTBertModel" + assert mpu.get_tensor_model_parallel_world_size() == 1 and mpu.get_pipeline_model_parallel_world_size() == 1, \ + "Model parallel size > 1 not supported for ICT" + + print_rank_0('building ICTBertModel...') + + # simpler to just keep using 2 tokentypes since the LM we initialize with has 2 tokentypes + model = ICTBertModel( + ict_head_size=args.ict_head_size, + num_tokentypes=2, + parallel_output=True, + only_query_model=only_query_model, + only_block_model=only_block_model) + + return model + + +class ICTBertModel(MegatronModule): + """Bert-based module for Inverse Cloze task.""" + def __init__(self, + ict_head_size, + num_tokentypes=1, + parallel_output=True, + only_query_model=False, + only_block_model=False): + super(ICTBertModel, self).__init__() + bert_kwargs = dict( + ict_head_size=ict_head_size, + num_tokentypes=num_tokentypes, + parallel_output=parallel_output + ) + assert not (only_block_model and only_query_model) + self.use_block_model = not only_query_model + self.use_query_model = not only_block_model + + if self.use_query_model: + # this model embeds (pseudo-)queries - Embed_input in the paper + self.query_model = IREncoderBertModel(**bert_kwargs) + self._query_key = 'question_model' + + if self.use_block_model: + # this model embeds evidence blocks - Embed_doc in the paper + self.block_model = IREncoderBertModel(**bert_kwargs) + self._block_key = 'context_model' + + def forward(self, query_tokens, query_attention_mask, block_tokens, block_attention_mask): + """Run a forward pass for each of the models and return the respective embeddings.""" + query_logits = self.embed_query(query_tokens, query_attention_mask) + block_logits = self.embed_block(block_tokens, block_attention_mask) + return query_logits, block_logits + + def embed_query(self, query_tokens, query_attention_mask): + """Embed a batch of tokens using the query model""" + if self.use_query_model: + query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0) + query_ict_logits, _ = self.query_model.forward(query_tokens, query_attention_mask, query_types) + return query_ict_logits + else: + raise ValueError("Cannot embed query without query model.") + + def embed_block(self, block_tokens, block_attention_mask): + """Embed a batch of tokens using the block model""" + if self.use_block_model: + block_types = torch.cuda.LongTensor(*block_tokens.shape).fill_(0) + block_ict_logits, _ = self.block_model.forward(block_tokens, block_attention_mask, block_types) + return block_ict_logits + else: + raise ValueError("Cannot embed block without block model.") + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """Save dict with state dicts of each of the models.""" + state_dict_ = {} + if self.use_query_model: + state_dict_[self._query_key] \ + = self.query_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + + if self.use_block_model: + state_dict_[self._block_key] \ + = self.block_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Load the state dicts of each of the models""" + if self.use_query_model: + print("Loading ICT query model", flush=True) + self.query_model.load_state_dict( + state_dict[self._query_key], strict=strict) + + if self.use_block_model: + print("Loading ICT block model", flush=True) + self.block_model.load_state_dict( + state_dict[self._block_key], strict=strict) + + def init_state_dict_from_bert(self): + """Initialize the state from a pretrained BERT model on iteration zero of ICT pretraining""" + args = get_args() + tracker_filename = get_checkpoint_tracker_filename(args.bert_load) + if not os.path.isfile(tracker_filename): + raise FileNotFoundError("Could not find BERT load for ICT") + with open(tracker_filename, 'r') as f: + iteration = int(f.read().strip()) + assert iteration > 0 + + checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False) + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading checkpoint {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + + try: + state_dict = torch.load(checkpoint_name, map_location='cpu') + except BaseException: + raise ValueError("Could not load checkpoint") + + # load the LM state dict into each model + model_dict = state_dict['model']['language_model'] + self.query_model.language_model.load_state_dict(model_dict) + self.block_model.language_model.load_state_dict(model_dict) + + # give each model the same ict_head to begin with as well + query_ict_head_state_dict = self.state_dict_for_save_checkpoint()[self._query_key]['ict_head'] + self.block_model.ict_head.load_state_dict(query_ict_head_state_dict) + + +class IREncoderBertModel(MegatronModule): + """BERT-based encoder for queries or blocks used for learned information retrieval.""" + def __init__(self, ict_head_size, num_tokentypes=2, parallel_output=True): + super(IREncoderBertModel, self).__init__() + args = get_args() + + self.ict_head_size = ict_head_size + self.parallel_output = parallel_output + init_method = init_method_normal(args.init_method_std) + scaled_init_method = scaled_init_method_normal(args.init_method_std, + args.num_layers) + + self.language_model, self._language_model_key = get_language_model( + num_tokentypes=num_tokentypes, + add_pooler=True, + encoder_attn_mask_type=AttnMaskType.padding, + init_method=init_method, + scaled_init_method=scaled_init_method) + + self.ict_head = get_linear_layer(args.hidden_size, ict_head_size, init_method) + self._ict_head_key = 'ict_head' + + def forward(self, input_ids, attention_mask, tokentype_ids=None): + extended_attention_mask = bert_extended_attention_mask( + attention_mask, next(self.language_model.parameters()).dtype) + position_ids = bert_position_ids(input_ids) + + lm_output, pooled_output = self.language_model( + input_ids, + position_ids, + extended_attention_mask, + tokentype_ids=tokentype_ids) + + # Output. + ict_logits = self.ict_head(pooled_output) + return ict_logits, None + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + state_dict_[self._ict_head_key] \ + = self.ict_head.state_dict(prefix=prefix, + keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + self.language_model.load_state_dict( + state_dict[self._language_model_key], strict=strict) + self.ict_head.load_state_dict( + state_dict[self._ict_head_key], strict=strict) + + diff --git a/megatron/legacy/model/rms_norm.py b/megatron/legacy/model/rms_norm.py new file mode 100644 index 0000000..7e4424c --- /dev/null +++ b/megatron/legacy/model/rms_norm.py @@ -0,0 +1,31 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import torch +from torch import nn + +class RMSNorm(torch.nn.Module): + + def __init__(self, + dim: int, + eps: float = 1e-6, + sequence_parallel: bool = False): + """RMS Normaliation module + + Args: + dim (int): The width of input, i.e. hidden size + eps (float): epsilon to use for the norm, default to 1e-6 + sequence_parallel (bool): Set to true if sequence parallelism is being used, + this marks the weights as needing to be allreduced. + """ + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + setattr(self.weight, 'sequence_parallel', sequence_parallel) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight diff --git a/megatron/legacy/model/t5_model.py b/megatron/legacy/model/t5_model.py new file mode 100644 index 0000000..4c78922 --- /dev/null +++ b/megatron/legacy/model/t5_model.py @@ -0,0 +1,186 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""T5 model.""" + +import torch + +from megatron.training import get_args +from megatron.core import tensor_parallel +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.language_model import parallel_lm_logits, get_language_model +from megatron.legacy.model import LayerNorm +from megatron.legacy.model.utils import ( + openai_gelu, + get_linear_layer +) +from .module import MegatronModule + + +def t5_extended_attention_mask(attention_mask_list): + + def attn_mask_postprocess(attn_mask): + # [b, 1, s, s] + extended_attention_mask = attn_mask.unsqueeze(1) + return extended_attention_mask + + return [attn_mask_postprocess(attn_mask) for attn_mask in attention_mask_list] + + +def t5_position_ids(token_ids): + # Create position ids + seq_length = token_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, + device=token_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(token_ids) + + return position_ids + + +class T5LMHead(MegatronModule): + """Masked LM head for T5 + + Args: + mpu_vocab_size: model parallel size of vocabulary. + parallel_output: wether output logits being distributed or not. + """ + + def __init__(self, mpu_vocab_size, parallel_output): + super(T5LMHead, self).__init__() + + self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size)) + self.bias.model_parallel = True + self.bias.partition_dim = 0 + self.bias.stride = 1 + self.parallel_output = parallel_output + + def forward(self, hidden_states, word_embeddings_weight): + output = parallel_lm_logits(hidden_states, + word_embeddings_weight, + self.parallel_output, + bias=self.bias) + return output + + +class T5Model(MegatronModule): + """T5 Language model.""" + + def __init__(self, + config, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True, + add_encoder=True, + add_decoder=True): + super().__init__(config=config) + args = get_args() + + self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + self.add_encoder = add_encoder + self.add_decoder = add_decoder + + self.language_model, self._language_model_key = get_language_model( + config=config, + num_tokentypes=num_tokentypes, + add_pooler=False, + add_encoder=add_encoder, + add_decoder=add_decoder, + encoder_attn_mask_type=AttnMaskType.padding, + pre_process=self.pre_process, + post_process=self.post_process) + + self.initialize_word_embeddings() + + if self.post_process and self.add_decoder: + self.lm_head = T5LMHead( + self.shared_embedding_or_output_weight().size(0), + parallel_output) + self._lm_head_key = 'lm_head' + + def set_input_tensor(self, input_tensor): + """See megatron.legacy.model.transformer.set_input_tensor()""" + self.language_model.set_input_tensor(input_tensor) + + def forward(self, encoder_input_ids, decoder_input_ids, encoder_attn_mask, + decoder_attn_mask, encoder_decoder_attn_mask, + tokentype_ids=None, lm_labels=None, enc_hidden_states=None): + + # Converting the attention masks to proper parameter settings + encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask = t5_extended_attention_mask( + [encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask]) + + encoder_position_ids = t5_position_ids(encoder_input_ids) + decoder_position_ids = t5_position_ids(decoder_input_ids) + + lm_output = self.language_model(encoder_input_ids, + encoder_position_ids, + encoder_attn_mask, + decoder_input_ids, + decoder_position_ids, + decoder_attn_mask, + encoder_decoder_attn_mask, + tokentype_ids=tokentype_ids, + enc_hidden_states=enc_hidden_states) + + if self.post_process and self.add_decoder: + decoder_output, encoder_output = lm_output + # Output. [s, b, h] + lm_logits = self.lm_head(decoder_output, + self.shared_embedding_or_output_weight()) + + if lm_labels is None: + # [s b h] => [b s h] + return lm_logits.transpose(0,1).contiguous() + else: + # [b s] => [s b] + lm_labels = lm_labels.transpose(0,1).contiguous() + if self.fp16_lm_cross_entropy: + assert lm_logits.dtype == torch.half + lm_loss = tensor_parallel.vocab_parallel_cross_entropy(lm_logits, lm_labels) + else: + lm_loss = tensor_parallel.vocab_parallel_cross_entropy(lm_logits.float(), + lm_labels) + # [s b] => [b s] + lm_loss = lm_loss.transpose(0,1).contiguous() + return lm_loss + elif self.add_decoder and not self.add_encoder: + decoder_output, encoder_output = lm_output + return decoder_output + else: + encoder_output = lm_output + return encoder_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process and self.add_decoder: + state_dict_[self._lm_head_key] \ + = self.lm_head.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + # Save word_embeddings. + if self.post_process and not self.pre_process and self.add_decoder: + state_dict_[self._word_embeddings_for_head_key] \ + = self.word_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + self.language_model.load_state_dict( + state_dict[self._language_model_key], strict=strict) + if self.post_process and self.add_decoder: + self.lm_head.load_state_dict(state_dict[self._lm_head_key], + strict=strict) + # Load word embeddings. + if self.post_process and not self.pre_process and self.add_decoder: + self.word_embeddings.load_state_dict( + state_dict[self._word_embeddings_for_head_key], strict=strict) diff --git a/megatron/legacy/model/transformer.py b/megatron/legacy/model/transformer.py new file mode 100644 index 0000000..db46a72 --- /dev/null +++ b/megatron/legacy/model/transformer.py @@ -0,0 +1,1818 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Transformer.""" +from contextlib import nullcontext +import os +import math +import numpy as np +import torch +import torch.nn.functional as F +from typing import Optional + +from megatron import core +from megatron.training import get_timers, get_args +from .module import MegatronModule +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from megatron.legacy.model.enums import AttnMaskType, LayerType, AttnType +from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.legacy.model.fused_bias_gelu import bias_gelu_impl +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding, apply_rotary_pos_emb +from megatron.core.num_microbatches_calculator import get_num_microbatches +from megatron.legacy.model.utils import attention_mask_func, openai_gelu, erf_gelu, get_norm +from megatron.core.tensor_parallel import ( + gather_from_sequence_parallel_region_to_moe, + reduce_scatter_to_sequence_parallel_region_from_moe, + get_cuda_rng_tracker, + get_data_parallel_rng_tracker_name +) +from megatron.core.parallel_state import get_tensor_model_parallel_group, get_tensor_and_expert_parallel_group +from megatron.core.jit import jit_fuser + +try: + from einops import rearrange +except ImportError: + rearrange = None + +try: + from flash_attn.flash_attn_interface import flash_attn_unpadded_func +except ImportError: + try: + from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func + except ImportError: + flash_attn_unpadded_func = None + +""" We use the following notation throughout this file: + h: hidden size + n: number of attention heads + p: number of model parallel partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + l: number of layers + Transformer takes input of size [s, b, h] and returns a + tensor of the same size. We use the following arguments: + hyperparameters: transformer hyperparameters +""" + +class DropPath(MegatronModule): + """Drop paths (Stochastic Depth) per sample + (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=0.): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_state): + if self.drop_prob == 0. or not self.training: + return hidden_state + keep_prob = 1 - self.drop_prob + # work with diff dim tensors, not just 2D ConvNets + # hidden_state: [s, b, h] + shape = (1,) + (hidden_state.shape[1],) + (1,) * (hidden_state.ndim - 2) + random_tensor = keep_prob + \ + torch.rand(shape, dtype=hidden_state.dtype, device=hidden_state.device) + random_tensor.floor_() # binarize + output = hidden_state.div(keep_prob) * random_tensor + return output + +class ParallelMLP(MegatronModule): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config, is_expert=False): + super(ParallelMLP, self).__init__() + args = get_args() + + self.add_bias = config.add_bias_linear + + ffn_hidden_size = config.ffn_hidden_size + if config.gated_linear_unit: + ffn_hidden_size *= 2 + + # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + ffn_hidden_size, + config=config, + init_method=config.init_method, + bias=self.add_bias, + gather_output=False, + skip_bias_add=True, + is_expert=is_expert, + ) + + self.bias_gelu_fusion = False + self.activation_func = None + self.swiglu = args.swiglu + + if args.openai_gelu: + self.activation_func = openai_gelu + elif args.onnx_safe: + self.activation_func = erf_gelu + elif args.swiglu: + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + self.activation_func = swiglu + elif args.squared_relu: + def squared_relu(x): + return torch.pow(F.relu(x), 2) + self.activation_func = squared_relu + else: + self.bias_gelu_fusion = args.bias_gelu_fusion + self.activation_func = F.gelu + + # Project back to h. + self.dense_4h_to_h = tensor_parallel.RowParallelLinear( + config.ffn_hidden_size, + config.hidden_size, + config=config, + init_method=config.output_layer_init_method, + bias=self.add_bias, + skip_bias_add=True, + input_is_parallel=True, + is_expert=is_expert, + ) + + def forward(self, hidden_states): + + # [s, b, 4hp] + intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) + + if self.bias_gelu_fusion: + assert self.add_bias is True + assert self.activation_func == F.gelu + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + else: + if bias_parallel is not None: + intermediate_parallel = intermediate_parallel + bias_parallel + intermediate_parallel = self.activation_func(intermediate_parallel) + + # [s, b, h] + output, output_bias = self.dense_4h_to_h(intermediate_parallel) + return output, output_bias + +def sinkhorn(cost, tol=0.0001): + cost = torch.exp(cost) + d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype) + d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype) + + eps = 0.00000001 + error = 1e9 + d1_old = d1 + while error > tol: + d0 = (1/d0.size(0))*1/(torch.sum(d1*cost,1) + eps) + d1 = (1/d1.size(0))*1/(torch.sum(d0.unsqueeze(1)*cost,0)+eps) + error = torch.mean(torch.abs(d1_old-d1)) + d1_old = d1 + return d1*cost*d0.unsqueeze(1) + + +def get_router_linear_layer(config): + args = get_args() + router = torch.nn.Linear(args.hidden_size, args.num_experts, bias=False) + with get_cuda_rng_tracker().fork(get_data_parallel_rng_tracker_name()): + config.init_method(router.weight) + setattr(router.weight, 'sequence_parallel',config.sequence_parallel) + return router + + +class SwitchMLP(MegatronModule): + """ + Routes input to one of N MLP "experts" + """ + def __init__(self, config): + super(SwitchMLP, self).__init__() + args = get_args() + self.router = get_router_linear_layer(config) + self.expert_parallel_size = mpu.get_expert_model_parallel_world_size() + self.sequence_parallel = config.sequence_parallel + self.add_bias = config.add_bias_linear + + assert args.num_experts % self.expert_parallel_size == 0 + self.num_local_experts = args.num_experts // self.expert_parallel_size + local_expert_indices_offset = mpu.get_expert_model_parallel_rank() * self.num_local_experts + self.local_expert_indices = [local_expert_indices_offset + i for i in range(self.num_local_experts)] + + self.local_experts = torch.nn.ModuleList() + for i in range(self.num_local_experts): + self.local_experts.append(ParallelMLP(config, is_expert=True)) + + def gather_indices(self, local_indices): + """ Gather tensors and concatinate along the first dimension.""" + group = get_tensor_and_expert_parallel_group() + world_size = torch.distributed.get_world_size(group=group) + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return local_indices + + dim_size = list(local_indices.size()) + dim_size[0] = dim_size[0] * world_size + + # TODO pre allocate memory + output = torch.empty(dim_size, dtype=local_indices.dtype, + device=torch.cuda.current_device()) + torch.distributed._all_gather_base( + output, local_indices.contiguous(), group=group + ) + return output + + def forward(self, hidden_states): + # hidden_states: [b, s, h] + args = get_args() + s = hidden_states.size(0) + b = hidden_states.size(1) + h = hidden_states.size(2) + route = self.router(hidden_states).view(-1, args.num_experts) + + # TODO (rprenger) Right now we're just using the sinkhorn algorithm + # for load balancing. There should be an option to do no load balancing + # and the algorithm and parametets should be further tested + if self.training: + with torch.no_grad(): + sinkroute = sinkhorn(route.detach().to(dtype=torch.float32)) + _, max_ind = torch.max(sinkroute, dim=1) + route = torch.sigmoid(route) + max_prob = route[torch.arange(route.size(0)), max_ind] + else: + route = torch.sigmoid(route) + max_prob, max_ind = torch.max(route, dim=1) + + max_prob = torch.unsqueeze(max_prob, 1) + hidden_states = hidden_states.view(-1, hidden_states.size(2)) + + # TODO (rprenger) TODO this could be made easier to read + # Converting [s, b, h] to [s*b, h]. + # Each vector could be routed differently + if self.sequence_parallel or (self.expert_parallel_size > 1): + global_hidden_states = \ + gather_from_sequence_parallel_region_to_moe(hidden_states) + global_indices = self.gather_indices(max_ind) + else: + global_hidden_states = hidden_states + global_indices = max_ind + + output_total = torch.zeros_like(global_hidden_states) + if self.add_bias: + output_bias_total = torch.zeros_like(global_hidden_states) + + for expert_num, expert in enumerate(self.local_experts): + local_expert_index = self.local_expert_indices[expert_num] + local_indices = (global_indices == local_expert_index).nonzero() + hidden = global_hidden_states[local_indices, :] + output, output_bias = expert(hidden) + output_total[local_indices, :] = output + if self.add_bias: + output_bias = output_bias.expand_as(output) + output_bias_total[local_indices, :] = output_bias + + if self.sequence_parallel or (self.expert_parallel_size > 1): + output_total = \ + reduce_scatter_to_sequence_parallel_region_from_moe(output_total) + if self.add_bias: + output_bias_total = \ + reduce_scatter_to_sequence_parallel_region_from_moe(output_bias_total) + + # bias is duplicated across tensor parallelism ranks; + # reduce scatter reduces bias across tensor parallel_ranks + output_bias_total = \ + output_bias_total/mpu.get_tensor_model_parallel_world_size() + + output_total = output_total*max_prob + output_total = output_total.view(s, b, h) + if self.add_bias: + output_bias_total = output_bias_total*max_prob + output_bias_total = output_bias_total.view(s, b, h) + else: + output_bias_total = None + + return output_total, output_bias_total + + +class CoreAttention(MegatronModule): + + def __init__(self, layer_number, config, + attn_mask_type=AttnMaskType.padding): + super(CoreAttention, self).__init__() + self.fp16 = config.fp16 + self.bf16 = config.bf16 + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + self.attn_mask_type = attn_mask_type + self.sequence_parallel = config.sequence_parallel + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + world_size = mpu.get_tensor_model_parallel_world_size() + self.hidden_size_per_partition = core.utils.divide(projection_size, + world_size) + self.hidden_size_per_attention_head = core.utils.divide( + projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = core.utils.divide( + config.num_attention_heads, world_size) + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + + self.scale_mask_softmax = FusedScaleMaskSoftmax( + self.fp16, self.bf16, + self.attn_mask_type, + config.masked_softmax_fusion, + attention_mask_func, + self.attention_softmax_in_fp32, + coeff) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, + value_layer, attention_mask): + + # =================================== + # Raw attention scores. [b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.reshape(output_size[2], + output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = mpu.get_global_memory_buffer().get_tensor( + (output_size[0]*output_size[1], output_size[2], output_size[3]), + query_layer.dtype, "mpu") + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, alpha=(1.0/self.norm_factor)) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + attention_probs = self.scale_mask_softmax(attention_scores, + attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + if not self.sequence_parallel: + with tensor_parallel.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), + value_layer.size(2), + query_layer.size(0), + value_layer.size(3)) + + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), + output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class FlashSelfAttention(torch.nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. + (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, + device=None, dtype=None): + super().__init__() + assert flash_attn_unpadded_func is not None, ('Please install FlashAttention first, ' + 'e.g., with pip install flash-attn') + assert rearrange is not None, 'Please install einops first, e.g., with pip install einops' + self.causal = causal + self.softmax_scale = softmax_scale + self.dropout_p = attention_dropout + + def forward(self, q, k, v): + """Implements the multihead softmax attention. + Arguments + --------- + q, k, v: The tensor containing the query, key, and value. (B, S, H, D) + """ + + assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v))) + assert all((i.is_cuda for i in (q,k,v))) + + batch_size, seqlen_q = q.shape[0], q.shape[1] + seqlen_k = k.shape[1] + + q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] + cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, + device=q.device) + + if self.training: + # during training q,k,v always have same seqlen + assert seqlen_k == seqlen_q + + is_causal = self.causal + cu_seqlens_k = cu_seqlens_q + dropout_p = self.dropout_p + else: + # turn off FA causal mask after first inference autoregressive iteration + # only on first autoregressive step q,k,v have same seqlen + is_causal = seqlen_q == seqlen_k + cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, + device=q.device) + dropout_p = 0 + + output = flash_attn_unpadded_func( + q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k, + dropout_p, + softmax_scale=self.softmax_scale, causal=is_causal + ) + + output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) + return output + + +class ParallelAttention(MegatronModule): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config, layer_number, + attention_type=AttnType.self_attn, + attn_mask_type=AttnMaskType.padding): + super(ParallelAttention, self).__init__() + args = get_args() + self.layer_number = max(1, layer_number) + self.attention_type = attention_type + self.attn_mask_type = attn_mask_type + self.params_dtype = config.params_dtype + self.sequence_parallel = config.sequence_parallel + self.config = config + self.group_query_attention = args.group_query_attention + self.num_query_groups = args.num_query_groups + + query_projection_size = config.kv_channels * config.num_attention_heads + if self.group_query_attention: + kv_projection_size = args.kv_channels * args.num_query_groups + else: + kv_projection_size = args.kv_channels * args.num_attention_heads + + self.use_flash_attn = args.use_flash_attn \ + and attention_type == AttnType.self_attn \ + and self.attn_mask_type == AttnMaskType.causal + if self.use_flash_attn: + if flash_attn_unpadded_func is None: + raise ImportError('FlashAttention is not installed, please install with ' + 'pip install flash-attn') + assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports ' + 'self-attention for now') + assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only ' + 'supports causal mask for now') + if rearrange is None: + raise ImportError('einops is not installed, please install with pip install einops') + + # Per attention head and per partition values. + world_size = mpu.get_tensor_model_parallel_world_size() + self.hidden_size_per_attention_head = core.utils.divide( + query_projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = core.utils.divide( + config.num_attention_heads, world_size) + + if self.group_query_attention: + if args.num_query_groups % world_size != 0: + raise NotImplementedError('Currently the num_query_groups should be ' + 'a multiple of the tensor parallel size') + self.num_query_groups_per_partition = core.utils.divide( + args.num_query_groups, world_size) + else: + self.num_query_groups_per_partition = self.num_attention_heads_per_partition + + # Strided linear layer. + if attention_type == AttnType.self_attn: + self.query_key_value = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + query_projection_size + 2 * kv_projection_size, + config=config, + init_method=config.init_method, + bias=args.add_bias_linear or args.add_qkv_bias, + gather_output=False) + else: + assert attention_type == AttnType.cross_attn + + if self.group_query_attention: + raise NotImplementedError("Grouped query attention not implemented for cross-attention.") + assert query_projection_size == kv_projection_size + + self.query = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + query_projection_size, + config=config, + init_method=config.init_method, + bias=config.add_bias_linear, + gather_output=False) + + self.key_value = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + 2 * kv_projection_size, + config=config, + init_method=config.init_method, + bias=config.add_bias_linear, + gather_output=False) + + self.core_attention = CoreAttention(self.layer_number, config, + self.attn_mask_type) + self.checkpoint_core_attention = config.recompute_granularity == 'selective' + + if self.use_flash_attn: + self.core_attention_flash = FlashSelfAttention( + causal=True, attention_dropout=config.attention_dropout + ) + + # Output. + self.dense = tensor_parallel.RowParallelLinear( + query_projection_size, + config.hidden_size, + config=config, + init_method=config.output_layer_init_method, + bias=args.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True) + + def _checkpointed_attention_forward(self, query_layer, key_layer, + value_layer, attention_mask, + rotary_pos_emb=None): + """Forward method with activation checkpointing.""" + def custom_forward(*inputs): + query_layer = inputs[0] + key_layer = inputs[1] + value_layer = inputs[2] + attention_mask = inputs[3] + output_ = self.core_attention(query_layer, key_layer, + value_layer, attention_mask) + return output_ + + q_pos_emb, k_pos_emb = (None, None) if rotary_pos_emb is None \ + else rotary_pos_emb + + hidden_states = tensor_parallel.checkpoint( + custom_forward, + False, query_layer, key_layer, value_layer, attention_mask, + q_pos_emb, k_pos_emb) + + return hidden_states + + def _allocate_memory(self, inference_max_sequence_len, batch_size, num_attention_heads): + return torch.empty( + inference_max_sequence_len, + batch_size, + num_attention_heads, + self.hidden_size_per_attention_head, + dtype=self.params_dtype, + device=torch.cuda.current_device()) + + def forward(self, hidden_states, attention_mask, + encoder_output=None, inference_params=None, + rotary_pos_emb=None): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + is_first_step = False + if inference_params: + if self.layer_number not in inference_params.key_value_memory_dict: + inf_max_seq_len = inference_params.max_sequence_length + inf_max_batch_size = inference_params.max_batch_size + inference_key_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size, + self.num_query_groups_per_partition) + inference_value_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size, + self.num_query_groups_per_partition) + + inference_params.key_value_memory_dict[self.layer_number] = ( + inference_key_memory, inference_value_memory) + is_first_step = True + else: + inference_key_memory, inference_value_memory = \ + inference_params.key_value_memory_dict[self.layer_number] + + # ===================== + # Query, Key, and Value + # ===================== + if self.attention_type == AttnType.self_attn: + + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) + + # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + self.num_query_groups_per_partition, + ( + (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2) + * self.hidden_size_per_attention_head + ), + ) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query_layer, + key_layer, + value_layer) = torch.split( + mixed_x_layer, + [ + ( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition + * self.hidden_size_per_attention_head + ), + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head + ], + dim=3) + + # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] - + query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head) + else: + # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] + mixed_kv_layer, _ = self.key_value(encoder_output) + + # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] + new_tensor_shape = mixed_kv_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 2 * self.hidden_size_per_attention_head) + mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) + + # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] + (key_layer, + value_layer) = tensor_parallel.split_tensor_along_last_dim(mixed_kv_layer, 2) + + # Attention head [sq, b, h] --> [sq, b, hp] + query_layer, _ = self.query(hidden_states) + # [sq, b, hp] --> [sq, b, np, hn] + new_tensor_shape = query_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + query_layer = query_layer.view(*new_tensor_shape) + + # ================================== + # Adjust key and value for inference + # ================================== + + # duplicate the pos_emb for self attention + if rotary_pos_emb is not None: + if isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = rotary_pos_emb + else: + rotary_pos_emb = ((rotary_pos_emb,) * 2) + + if inference_params: + batch_start = inference_params.batch_size_offset + batch_end = batch_start + key_layer.size(1) + assert batch_end <= inference_key_memory.size(1) + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + key_layer.size(0) + assert sequence_end <= inference_key_memory.size(0) + # Copy key and values. + inference_key_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] = key_layer + inference_value_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] = value_layer + key_layer = inference_key_memory[ + :sequence_end, batch_start:batch_end, ...] + value_layer = inference_value_memory[ + :sequence_end, batch_start:batch_end, ...] + + + # adjust the key rotary positional embedding + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + # need to cross check this condition during inference + # if not set_inference_key_value_memory: + if not is_first_step: + # In inference, we compute one token at a time. + # Select the correct positional embedding + # (only the last token in the sequence) + q_pos_emb = q_pos_emb[sequence_end - 1 : sequence_end] + else: + # In the first forward pass of inference, + # we use the entire provided prefix. + # q_pos_emb here has the rope embeddings of the entire + # prefix + to-be-generated output so + # we slice to just the prefix. + q_pos_emb = q_pos_emb[:sequence_end, :, :, :] + k_pos_emb = k_pos_emb[:sequence_end, :, :, :] + rotary_pos_emb = (q_pos_emb, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + + # expand the key_layer and value_layer [sk, b, ng, hn] -> [sk, b, np, hn] + if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1: + key_layer = key_layer.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, + dim = 2 + ) + value_layer = value_layer.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, + dim = 2 + ) + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb,self.config) + key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb,self.config) + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. + # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + if not self.use_flash_attn: + if self.checkpoint_core_attention: + context_layer = self._checkpointed_attention_forward( + query_layer, key_layer, value_layer, attention_mask) + else: + context_layer = self.core_attention( + query_layer, key_layer, value_layer, attention_mask) + else: + q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() + for x in (query_layer, key_layer, value_layer)] + if not self.sequence_parallel: + with tensor_parallel.get_cuda_rng_tracker().fork(): + context_layer = self.core_attention_flash(q, k, v) + else: + context_layer = self.core_attention_flash(q, k, v) + context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + + # ================= + # Output. [sq, b, h] + # ================= + + output, bias = self.dense(context_layer) + + return output, bias + + +def bias_dropout_add(x, bias, residual, prob, training): + # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor + if bias is not None: + x = x + bias + out = torch.nn.functional.dropout(x, p=prob, training=training) + out = residual + out + return out + + +def get_bias_dropout_add(training): + def _bias_dropout_add(x, bias, residual, prob): + return bias_dropout_add(x, bias, residual, prob, training) + return _bias_dropout_add + + +@jit_fuser +def bias_dropout_add_fused_train(x: torch.Tensor, + bias: Optional[torch.Tensor], + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, True) + + +@jit_fuser +def bias_dropout_add_fused_inference(x: torch.Tensor, + bias: Optional[torch.Tensor], + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, False) + + +class ParallelTransformerLayer(MegatronModule): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config, + layer_number, layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + drop_path_rate=0.): + args = get_args() + + super(ParallelTransformerLayer, self).__init__() + self.layer_number = layer_number + self.layer_type = layer_type + + self.apply_residual_connection_post_norm \ + = config.apply_residual_connection_post_layernorm + + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + + # Normalize the input data. + self.input_norm = get_norm(config) + + # Self attention. + self.self_attention = ParallelAttention( + config, + layer_number, + attention_type=AttnType.self_attn, + attn_mask_type=self_attn_mask_type) + self.hidden_dropout = config.hidden_dropout + self.bias_dropout_fusion = config.bias_dropout_fusion + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None + + # Normalize the attention output + self.post_attention_norm = get_norm(config) + + # Cross attention. + if self.layer_type in (LayerType.decoder, + LayerType.retro_decoder, + LayerType.retro_decoder_with_retriever, + LayerType.retro_encoder): + self.inter_attention = ParallelAttention( + config, + layer_number, + attention_type=AttnType.cross_attn) + # Normalize the attention output. + self.post_inter_attention_norm = get_norm(config) + + # MLP + if args.num_experts is not None: + self.mlp = SwitchMLP(config) + else: + self.mlp = ParallelMLP(config) + + # Set bias+dropout+add fusion grad_enable execution handler. + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) + self.bias_dropout_add_exec_handler = \ + nullcontext if use_nvfuser else torch.enable_grad + + if args.retro_add_retriever: + self.retro_num_neighbors = args.retro_num_neighbors + self.retro_chunk_length = args.retro_chunk_length + self.retro_retrieved_length = \ + args.retro_num_retrieved_chunks * args.retro_chunk_length + + # Retriever (bi-directional transformer with cross attention) + if layer_type == LayerType.retro_decoder_with_retriever: + self.retriever = ParallelTransformer( + config=config, + model_type=ModelType.retro_encoder, + self_attn_mask_type=AttnMaskType.padding, + pre_process=True, + post_process=False, + ) + self._retriever_key = 'retriever' + else: + self.retriever = None + + def default_decoder_cross_attention(self, + encoder_output, + enc_dec_attn_mask, + norm_input, + norm_output, + bias_dropout_add_func): + '''Cross attention for a standard encoder-decoder model.''' + + # Attention. + attention_output, attention_bias = \ + self.inter_attention(norm_output, + enc_dec_attn_mask, + encoder_output=encoder_output) + + # Residual connection. + if self.apply_residual_connection_post_norm: + residual = norm_output + else: + residual = norm_input + + if attention_bias is not None: + attention_bias = attention_bias.expand_as(residual) + + # Bias-dropout-add. + with self.bias_dropout_add_exec_handler(): + norm_input = bias_dropout_add_func( + attention_output, + attention_bias, + residual, + self.hidden_dropout) + + # Normalize. + norm_output = self.post_inter_attention_norm(norm_input) + + return norm_input, norm_output + + def retro_encoder_cross_attention(self, + retriever_output, + norm_input, + norm_output, + bias_dropout_add_func): + """Cross attention for Retro encoder. + + Notation: + ns : Sequence length. + bs : Batch size. + d : Hidden size. + l : Number of chunks per sample (i.e., seq_length/chunk_length). + k : Number of neighbors. + r : Number of retrieved tokens (neighbors + continuation). + """ + + ns, bs, d = norm_output.shape # [r, bs * l * k, d] + + # Divide sequence dimension into chunks. + chunked_outputs = norm_output.reshape(self.retro_retrieved_length, + -1, + self.retro_num_neighbors, + d) + chunked_outputs_before_norm = \ + norm_input.reshape(self.retro_retrieved_length, -1, + self.retro_num_neighbors, d) # [r, bs*l, k, d] + + # Per-chunk attention. + norm_inputs = [] + norm_outputs = [] + for k in range(self.retro_num_neighbors): + + # Attention. + chunked_output = chunked_outputs[:,:,k].contiguous() + attention_output, attention_bias = \ + self.inter_attention( + chunked_output, # Q (neighbor embedding) + None, + encoder_output=retriever_output) # K, V (hidden act) + + # Residual connection. + if self.apply_residual_connection_post_norm: + residual = chunked_output + else: + residual = chunked_outputs_before_norm[:,:,k] + + # Re-enable torch grad to enable fused optimization. + with torch.enable_grad(): + norm_input = bias_dropout_add_func( + attention_output, + None if attention_bias is None else attention_bias.expand_as(residual), + residual, + self.hidden_dropout) + norm_inputs.append(norm_input) + + # Layer norm. + norm_output = self.post_inter_attention_norm(norm_input) + norm_outputs.append(norm_output) + + # Concatenate layer norms. + # norm_input : [r, k * bs * l, d] + # norm_output : [r, k * bs * l, d] + norm_input = torch.stack(norm_inputs, dim=1).reshape(ns, bs, d) + norm_output = torch.stack(norm_outputs, dim=1).reshape(ns, bs, d) + + return norm_input, norm_output + + def retro_decoder_cross_attention(self, + retriever_input, + retriever_output, + retriever_attn_mask, + norm_input, + norm_output, + inference_params, + bias_dropout_add_func): + """Cross attention for Retro decoder. + + Notation: + ns : Sequence length. + bs : Batch size. + d : Hidden size. + l : Number of chunks per sample (i.e., seq_length/chunk_length). + m : Number of tokens per chunk. + k : Number of neighbors. + r : Number of retrieved tokens (neighbors + continuation). + """ + + ns, bs, d = norm_output.shape + l = int(np.ceil(ns / self.retro_chunk_length)) + + # Retrieve neighbors. + if self.layer_type == LayerType.retro_decoder_with_retriever: + first_ns = ns % self.retro_chunk_length + if first_ns > 0: + first_chunk, rest_chunk = \ + norm_output[:first_ns], norm_output[first_ns:] + first_chunk = torch.nn.functional.pad( + first_chunk, + (0, 0, 0, 0, 0, self.retro_chunk_length - first_ns), + 'constant', + 0) + chunked_output = \ + torch.cat((first_chunk, rest_chunk), dim=0) # [l * m, bs, d] + else: + chunked_output = norm_output # [l * m, bs, d] + chunked_output = chunked_output \ + .reshape(l, self.retro_chunk_length, bs, d) \ + .permute(1, 2, 0, 3) \ + .reshape(self.retro_chunk_length, bs * l, d) \ + .contiguous() + + # Get Encoder Output + retriever_output = self.retriever( + hidden_states=retriever_input, + attention_mask=retriever_attn_mask, + retriever_output=chunked_output, + retriever_attn_mask=retriever_attn_mask, + inference_params=inference_params) # [r, k * bs * l , d] + retriever_output = retriever_output.reshape( + self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d) # [r * k, bs * l, d] + + # Chunks. + pad = (ns - 1) % self.retro_chunk_length + attending_chunks = norm_output[pad:] + padded_chunks = torch.nn.functional.pad( + attending_chunks, + (0, 0, 0, 0, 0, self.retro_chunk_length - 1), + 'constant', 0) + padded_chunked_output = padded_chunks \ + .reshape(l, self.retro_chunk_length, bs, d) \ + .permute(1, 2, 0, 3) + padded_chunked_output = padded_chunked_output.reshape( + self.retro_chunk_length, bs * l, d).contiguous() + + # Encoder output. + attention_output, attention_bias = \ + self.inter_attention(padded_chunked_output, + None, + encoder_output=retriever_output) + + # Residual connection. + if self.apply_residual_connection_post_norm: + residual = norm_output + else: + residual = norm_input + + # Re-enable torch grad to enable fused optimization. + with torch.enable_grad(): + norm_input = bias_dropout_add_func( + attention_output, + None if attention_bias is None else attention_bias.expand_as(attention_output), + torch.zeros_like(attention_output), + self.hidden_dropout) + norm_input = norm_input \ + .reshape(self.retro_chunk_length, bs, l, d) \ + .permute(2, 0, 1, 3) # [l, m, bs, d] + norm_input = norm_input.reshape(self.retro_chunk_length * l, bs, d) + norm_input = torch.nn.functional.pad( + norm_input, + (0, 0, 0, 0, pad, 0), + 'constant', 0)[:ns] # [ns, b, d] + # TODO: better redesign with inference param + args = get_args() + norm_input = args.retro_attention_gate * norm_input + residual + + # Layer norm post the decoder attention + norm_output = self.post_inter_attention_norm(norm_input) + + return retriever_output, norm_input, norm_output + + def forward(self, hidden_states, attention_mask, + encoder_output=None, enc_dec_attn_mask=None, + retriever_input=None, + retriever_output=None, + retriever_attn_mask=None, + inference_params=None, + rotary_pos_emb=None): + + # Update the params in case the retro param changes during inference + # TODO: better redesign with inference param + args = get_args() + if args.retro_add_retriever: + self.retro_num_neighbors = args.retro_num_neighbors + self.retro_chunk_length = args.retro_chunk_length + self.retro_retrieved_length = \ + args.retro_num_retrieved_chunks * args.retro_chunk_length + + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + norm_output = self.input_norm(hidden_states) + + # Self attention. + attention_output, attention_bias = \ + self.self_attention( + norm_output, + attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb) + + # Residual connection. + if self.apply_residual_connection_post_norm: + residual = norm_output + else: + residual = hidden_states + + if self.drop_path is None: + # jit scripting for a nn.module (with dropout) is not + # trigerring the fusion kernel. For now, we use two + # different nn.functional routines to account for varying + # dropout semantics during training and inference phases. + if self.bias_dropout_fusion: + if self.training: + bias_dropout_add_func = bias_dropout_add_fused_train + else: + bias_dropout_add_func = bias_dropout_add_fused_inference + else: + bias_dropout_add_func = get_bias_dropout_add(self.training) + + if attention_bias is not None: + attention_bias = attention_bias.expand_as(residual) + with self.bias_dropout_add_exec_handler(): + norm_input = bias_dropout_add_func( + attention_output, + attention_bias, + residual, + self.hidden_dropout) + else: + out = torch.nn.functional.dropout(attention_output + attention_bias, + p=self.hidden_dropout, + training=self.training) + norm_input = residual + self.drop_path(out) + + # Layer norm post the self attention. + norm_output = self.post_attention_norm(norm_input) + + # Cross attention. + if self.layer_type == LayerType.encoder: + pass + elif self.layer_type == LayerType.decoder: + norm_input, norm_output = \ + self.default_decoder_cross_attention( + encoder_output, + enc_dec_attn_mask, + norm_input, + norm_output, + bias_dropout_add_func) + elif self.layer_type == LayerType.retro_encoder: + norm_input, norm_output = \ + self.retro_encoder_cross_attention( + retriever_output, + norm_input, + norm_output, + bias_dropout_add_func) + elif self.layer_type in (LayerType.retro_decoder, + LayerType.retro_decoder_with_retriever): + retriever_output, norm_input, norm_output = \ + self.retro_decoder_cross_attention( + retriever_input, + retriever_output, + retriever_attn_mask, + norm_input, + norm_output, + inference_params, + bias_dropout_add_func) + else: + raise Exception("Unsupported layer type, '%s'." % + self.layer_type.name) + + # MLP. + mlp_output, mlp_bias = self.mlp(norm_output) + + # Second residual connection. + if self.apply_residual_connection_post_norm: + residual = norm_output + else: + residual = norm_input + + if self.drop_path is None: + if mlp_bias is not None: + mlp_bias = mlp_bias.expand_as(residual) + with self.bias_dropout_add_exec_handler(): + output = bias_dropout_add_func( + mlp_output, + mlp_bias, + residual, + self.hidden_dropout) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = core.utils.make_viewless_tensor(inp = output, + requires_grad = output.requires_grad, + keep_graph = True) + + else: + if mlp_bias is not None: + mlp_output = mlp_output + mlp_bias + out = torch.nn.functional.dropout(mlp_output, + p=self.hidden_dropout, + training=self.training) + output = residual + self.drop_path(out) + + if self.layer_type == LayerType.retro_decoder_with_retriever: + return output, retriever_output + else: + return output + + +class NoopTransformerLayer(MegatronModule): + """A single 'no-op' transformer layer. + + The sole purpose of this layer is for when a standalone embedding layer + is used (i.e., args.standalone_embedding_stage == True). In this case, + zero transformer layers are assigned when pipeline rank == 0. Additionally, + when virtual pipeline rank >= 1, zero total model parameters are created + (virtual rank 0 contains the input embedding). This results in the model's + input and output tensors being the same, which causes an error when + performing certain memory optimiations on the output tensor (e.g., + deallocating it). Thus, this layer disconnects the input from the output + via a clone. Since ranks containing a no-op layer are generally under- + utilized (both compute and memory), there's no worry of any performance + degredation. + """ + + def __init__(self, layer_number): + super().__init__() + self.layer_number = layer_number + + def forward(self, hidden_states, attention_mask, + encoder_output=None, enc_dec_attn_mask=None, + inference_params=None): + return hidden_states.clone() + + +def _get_num_layers(args, model_type, is_decoder=False): + """Compute the number of transformer layers resident on the current rank.""" + is_encoder_and_decoder_model = (model_type == ModelType.encoder_and_decoder) + if model_type == ModelType.retro_encoder: + num_layers = args.retro_encoder_layers + elif mpu.get_pipeline_model_parallel_world_size() > 1: + if is_encoder_and_decoder_model: + assert args.pipeline_model_parallel_split_rank is not None + + # When a standalone embedding stage is used, a rank is taken from + # the encoder's ranks, to be used for the encoder's embedding + # layer. This way, the rank referenced by the 'split rank' remains + # the same whether or not a standalone embedding stage is used. + num_ranks_in_encoder = ( + args.pipeline_model_parallel_split_rank - 1 + if args.standalone_embedding_stage else + args.pipeline_model_parallel_split_rank + ) + num_ranks_in_decoder = args.transformer_pipeline_model_parallel_size - num_ranks_in_encoder + assert args.encoder_num_layers % num_ranks_in_encoder == 0, \ + 'encoder_num_layers (%d) must be divisible by number of ranks given to encoder (%d)' % (args.encoder_num_layers, num_ranks_in_encoder) + assert args.decoder_num_layers % num_ranks_in_decoder == 0, \ + 'decoder_num_layers (%d) must be divisible by number of ranks given to decoder (%d)' % (args.decoder_num_layers, num_ranks_in_decoder) + if mpu.is_pipeline_stage_before_split(): + num_layers = ( + 0 + if args.standalone_embedding_stage + and mpu.get_pipeline_model_parallel_rank() == 0 else + args.encoder_num_layers // num_ranks_in_encoder + ) + else: + num_layers = args.decoder_num_layers // num_ranks_in_decoder + else: + assert args.num_layers == args.encoder_num_layers + assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \ + 'num_layers must be divisible by transformer_pipeline_model_parallel_size' + + # When a standalone embedding stage is used, all transformer layers + # are divided among pipeline rank >= 1, while on pipeline rank 0, + # ranks either contain the input embedding layer (virtual pp rank 0), + # or no layers at all (virtual pp rank >= 1). + num_layers = ( + 0 + if args.standalone_embedding_stage + and mpu.get_pipeline_model_parallel_rank() == 0 else + args.num_layers // args.transformer_pipeline_model_parallel_size + ) + else: + if not is_decoder: + num_layers = args.encoder_num_layers + else: + num_layers = args.decoder_num_layers + return num_layers + + +def _get_layer_type(model_type, default_layer_type, retro_layer_numbers, + layer_number): + args = get_args() + if args.retro_add_retriever and layer_number in retro_layer_numbers: + if model_type == ModelType.retro_decoder: + return LayerType.retro_decoder_with_retriever \ + if layer_number == retro_layer_numbers[0] \ + else LayerType.retro_decoder + elif model_type == ModelType.retro_encoder: + return LayerType.retro_encoder + else: + raise Exception("Unsupported model type, '%s'." % model_type) + else: + return default_layer_type + + +class ParallelTransformer(MegatronModule): + """Transformer class.""" + + def __init__(self, config, + model_type, layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + post_norm=True, + pre_process=True, + post_process=True, + drop_path_rate=0.0): + super(ParallelTransformer, self).__init__() + args = get_args() + + self.layer_type = layer_type + self.model_type = model_type + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + self.post_norm = post_norm + self.pre_process = pre_process + self.post_process = post_process + self.input_tensor = None + self.drop_path_rate = drop_path_rate + self.transformer_impl = args.transformer_impl + self.retro_add_retriever = args.retro_add_retriever + + # Store activation checkpoiting flag. + self.recompute_granularity = config.recompute_granularity + self.recompute_method = config.recompute_method + self.recompute_num_layers = config.recompute_num_layers + self.distribute_saved_activations = \ + config.distribute_saved_activations and not config.sequence_parallel + + self.sequence_parallel = config.sequence_parallel + + # Transformer Engine Init. + self.transformer_engine_v_0_10 = False + self.transformer_engine_v_0_11 = False + self.transformer_engine_v_0_8 = False + if self.transformer_impl == 'transformer_engine': + global transformer_engine + import transformer_engine + from importlib.metadata import version + from pkg_resources import packaging + + te_version = packaging.version.Version(version("transformer-engine")) + if te_version >= packaging.version.Version("0.8.0"): + self.transformer_engine_v_0_8 = True + if te_version >= packaging.version.Version("0.10.0"): + self.transformer_engine_v_0_10 = True + if te_version >= packaging.version.Version("0.11.0"): + self.transformer_engine_v_0_11 = True + + del version, packaging + + assert not args.squared_relu, "TransformerEngine does not support squared relu activation." + + self.use_fp8 = args.fp8 is not None + self.fp8_recipe = None + self.fp8_group = None + if self.use_fp8: + assert args.transformer_impl == 'transformer_engine', \ + 'transformer-engine required for fp8 training and inference' + self.fp8_group = mpu.get_amax_reduction_group() + if args.fp8 == "e4m3": + fp8_format = transformer_engine.common.recipe.Format.E4M3 + elif args.fp8 == "hybrid": + fp8_format = transformer_engine.common.recipe.Format.HYBRID + else: + raise ValueError("The DelayedScaling recipe only supports E4M3 and HYBRID formats.") + self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling( + margin=args.fp8_margin, + interval=args.fp8_interval, + fp8_format=fp8_format, + amax_history_len=args.fp8_amax_history_len, + amax_compute_algo=args.fp8_amax_compute_algo, + override_linear_precision=(False, False, not args.fp8_wgrad), + ) + + self.num_microbatches_in_previous_step = -1 + self.microbatch_count = 0 + self.checkpoint_core_attention = config.recompute_granularity == 'selective' + + # Number of layers. + self.num_layers = _get_num_layers(args, model_type, + layer_type==LayerType.decoder) + + self.drop_path_rates = [ + rate.item() for rate in + torch.linspace(0, self.drop_path_rate, config.num_layers)] + + self.retro_layer_numbers = None + if model_type == ModelType.retro_decoder: + retro_layer_start = 6 if config.num_layers <= 15 else 9 + self.retro_layer_numbers = \ + np.arange(retro_layer_start, args.num_layers + 1, 3).tolist() + if model_type == ModelType.retro_encoder: + self.retro_layer_numbers = [1] + + # Transformer layers. + if args.retro_add_retriever: + assert self.recompute_granularity != 'full', \ + "Full recompute not supported for Retro." + assert args.transformer_impl == 'local', \ + "Transformer engine does not support Retro layers." + def build_layer(layer_number): + if args.transformer_impl == 'local': + current_layer_type = _get_layer_type( + model_type, layer_type, self.retro_layer_numbers, + layer_number) + return ParallelTransformerLayer( + config, + layer_number, + layer_type=current_layer_type, + self_attn_mask_type=self_attn_mask_type, + drop_path_rate=self.drop_path_rates[layer_number - 1]) + else: + # This argument is only available from TE v0.10 onwards. + extra_transformer_engine_kwargs = {} + if self.transformer_engine_v_0_8: + extra_transformer_engine_kwargs["bias"] = args.add_bias_linear + if self.transformer_engine_v_0_10: + extra_transformer_engine_kwargs["activation"] = "swiglu" if args.swiglu else "gelu" + if self.transformer_engine_v_0_11: + extra_transformer_engine_kwargs["normalization"] = args.normalization + assert config.attention_softmax_in_fp32, "TransformerEngine only supports softmax compute in FP32." + assert ( + (bool(int(os.getenv("NVTE_APPLY_QK_LAYER_SCALING", "0"))) and args.fp16) == config.apply_query_key_layer_scaling + ), ("Unsupported config for apply_query_key_layer_scaling in TransformerEngine. If --apply-query-key-layer-scaling is " + "provided, set env-var NVTE_APPLY_QK_LAYER_SCALING=1 and you must be using fp16.") + return transformer_engine.pytorch.TransformerLayer( + config.hidden_size, + config.ffn_hidden_size, + config.num_attention_heads, + layernorm_epsilon=config.layernorm_epsilon, + hidden_dropout=config.hidden_dropout, + attention_dropout=config.attention_dropout, + init_method=config.init_method, + output_layer_init_method=config.output_layer_init_method, + layer_number=layer_number, + kv_channels=config.kv_channels, + self_attn_mask_type=self_attn_mask_type.name, + tp_group=mpu.get_tensor_model_parallel_group() if mpu.is_initialized() else None, + tp_size=mpu.get_tensor_model_parallel_world_size(), + get_rng_state_tracker=get_cuda_rng_tracker + if get_cuda_rng_tracker().is_initialized() + else None, + fuse_wgrad_accumulation=config.gradient_accumulation_fusion, + seq_length=args.seq_length, + micro_batch_size=args.micro_batch_size, + sequence_parallel=config.sequence_parallel, + params_dtype=config.params_dtype, + apply_residual_connection_post_layernorm=config.apply_residual_connection_post_layernorm, + output_layernorm=False, + layer_type="encoder", + drop_path_rate=self.drop_path_rates[layer_number - 1], + set_parallel_mode=True, + fuse_qkv_params=True, + **extra_transformer_engine_kwargs) + + if config.virtual_pipeline_model_parallel_size is not None: + assert config.num_layers % config.virtual_pipeline_model_parallel_size == 0, \ + 'num_layers_per_stage must be divisible by ' \ + 'virtual_pipeline_model_parallel_size' + assert args.model_type != ModelType.encoder_and_decoder + # Number of layers in each model chunk is the number of layers in the stage, + # divided by the number of model chunks in a stage. + self.num_layers = self.num_layers // config.virtual_pipeline_model_parallel_size + # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0] [2] [4] [6] + # Stage 1: [1] [3] [5] [7] + # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0, 1] [4, 5] + # Stage 1: [2, 3] [6, 7] + offset = mpu.get_virtual_pipeline_model_parallel_rank() * ( + config.num_layers // config.virtual_pipeline_model_parallel_size) + \ + (mpu.get_pipeline_model_parallel_rank() * self.num_layers) + else: + # Each stage gets a contiguous set of layers. + if args.model_type == ModelType.encoder_and_decoder and \ + mpu.get_pipeline_model_parallel_world_size() > 1: + pipeline_rank = mpu.get_pipeline_model_parallel_rank() + if layer_type == LayerType.encoder: + offset = pipeline_rank * self.num_layers + else: + num_ranks_in_enc = args.pipeline_model_parallel_split_rank + offset = (pipeline_rank - num_ranks_in_enc) * self.num_layers + else: + offset = mpu.get_pipeline_model_parallel_rank() * self.num_layers + + if self.num_layers == 0: + # When a standalone embedding stage is used (e.g., + # args.standalone_embedding_stage == True), virtual pipeline ranks + # on pipeline rank 0 will have zero transformer layers assigned to + # them. This results in the model's input and output tensors to be + # the same, which will cause failure for certain output tensor + # optimizations (e.g., pipeline output deallocation). To remedy + # this, we assign a 'no-op' layer on these ranks, which will + # disconnect the input tensor from the output tensor. + self.num_layers = 1 + self.layers = torch.nn.ModuleList([ NoopTransformerLayer(1) ]) + else: + self.layers = torch.nn.ModuleList( + [build_layer(i + 1 + offset) for i in range(self.num_layers)]) + + # Update dropout rate for Retro encoder. + if model_type == ModelType.retro_encoder: + for layer in self.layers: + if layer.self_attention.use_flash_attn: + layer.self_attention.core_attention_flash.dropout_p = \ + torch.nn.Dropout(args.retro_encoder_attention_dropout) + else: + layer.self_attention.core_attention.attention_dropout.p =\ + args.retro_encoder_attention_dropout + layer.hidden_dropout = args.retro_encoder_hidden_dropout + + if self.post_process and self.post_norm: + # Final layer norm before output. + self.final_norm = get_norm(config) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def _checkpointed_forward(self, hidden_states, attention_mask, + encoder_output, enc_dec_attn_mask, + rotary_pos_emb, is_first_microbatch): + """Forward method with activation checkpointing.""" + def custom(start, end): + def custom_forward(*args, **kwargs): + x_, *args = args + for index in range(start, end): + layer = self._get_layer(index) + x_ = layer(x_, *args, **kwargs) + return x_ + return custom_forward + + te_forward_kwargs = {} + if self.transformer_impl == 'transformer_engine': + te_forward_kwargs['is_first_microbatch'] = is_first_microbatch + if self.transformer_engine_v_0_10: + te_forward_kwargs['rotary_pos_emb'] = rotary_pos_emb + + if self.recompute_method == 'uniform': + # Uniformly divide the total number of Transformer layers and + # checkpoint the input activation of each divided chunk. + # A method to further reduce memory usage reducing checkpoints. + l = 0 + while l < self.num_layers: + if self.transformer_impl == 'transformer_engine': + hidden_states = transformer_engine.pytorch.checkpoint( + custom(l, l + self.recompute_num_layers), + self.distribute_saved_activations, + tensor_parallel.get_cuda_rng_tracker, + mpu.get_tensor_model_parallel_group(), + hidden_states, attention_mask, encoder_output, + enc_dec_attn_mask, **te_forward_kwargs) + else: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + self.recompute_num_layers), + self.distribute_saved_activations, + hidden_states, attention_mask, + encoder_output, enc_dec_attn_mask, + None, None, None, None, rotary_pos_emb) + + l += self.recompute_num_layers + + elif self.recompute_method == 'block': + # Checkpoint the input activation of only a set number of individual + # Transformer layers and skip the rest. + # A method fully use the device memory removing redundant re-computation. + for l in range(self.num_layers): + if l < self.recompute_num_layers: + if self.transformer_impl == 'transformer_engine': + hidden_states = transformer_engine.pytorch.checkpoint( + custom(l, l + 1), + self.distribute_saved_activations, + tensor_parallel.get_cuda_rng_tracker, + mpu.get_tensor_model_parallel_group(), + hidden_states, attention_mask, encoder_output, + enc_dec_attn_mask, **te_forward_kwargs) + else: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + 1), + self.distribute_saved_activations, + hidden_states, attention_mask, + encoder_output, enc_dec_attn_mask, + None, None, None, None, rotary_pos_emb) + else: + if self.transformer_impl == 'transformer_engine': + hidden_states = custom(l, l + 1)( + hidden_states, attention_mask, encoder_output, + enc_dec_attn_mask, **te_forward_kwargs) + else: + hidden_states = custom(l, l + 1)( + hidden_states, attention_mask, + encoder_output, enc_dec_attn_mask, + None, None, None, None, rotary_pos_emb) + else: + raise ValueError("Invalid activation recompute method.") + + return hidden_states + + def set_input_tensor(self, input_tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward(self, hidden_states, attention_mask, + encoder_output=None, enc_dec_attn_mask=None, + retriever_input=None, + retriever_output=None, + retriever_attn_mask=None, + inference_params=None, + rotary_pos_emb=None): + # hidden_states: [s, b, h] + + # Checks. + if inference_params: + assert self.recompute_granularity is None, \ + 'inference does not work with activation checkpointing' + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = core.utils.make_viewless_tensor( + hidden_states, + requires_grad=True, + keep_graph=True, + ) + + # RNG context. + if self.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + # Forward layers. + with rng_context: + # The fp8_autocast context manager is a no-op when enabled=True + # The if...else serves to short circuit name resolution for fp8_autocast + with transformer_engine.pytorch.fp8_autocast( + enabled=self.use_fp8, + fp8_recipe=self.fp8_recipe, + fp8_group=self.fp8_group + ) if self.use_fp8 else nullcontext(): + # Determine if the current iteration is first microbatch + if self.num_microbatches_in_previous_step != get_num_microbatches(): + self.microbatch_count = 0 # Reset count on new batch size rampup interval + self.num_microbatches_in_previous_step = get_num_microbatches() + is_first_microbatch = self.microbatch_count % get_num_microbatches() == 0 + + # Forward pass. + if self.recompute_granularity == 'full': + hidden_states = self._checkpointed_forward(hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + rotary_pos_emb, + is_first_microbatch) + else: + forward_kwargs = { + 'encoder_output': encoder_output, + 'enc_dec_attn_mask': enc_dec_attn_mask, + 'inference_params': inference_params, + } + + if self.transformer_impl == 'transformer_engine': + forward_kwargs['is_first_microbatch'] = is_first_microbatch + forward_kwargs['checkpoint_core_attention'] = self.checkpoint_core_attention + if self.transformer_engine_v_0_10: + forward_kwargs['rotary_pos_emb'] = rotary_pos_emb + else: + forward_kwargs['rotary_pos_emb'] = rotary_pos_emb + forward_kwargs['retriever_input'] = retriever_input + forward_kwargs['retriever_output'] = retriever_output + forward_kwargs['retriever_attn_mask'] = retriever_attn_mask + + for index in range(self.num_layers): + layer = self._get_layer(index) + + hidden_states = layer( + hidden_states, + attention_mask, + **forward_kwargs) + + # First Retro decoder layer returns both hidden_states + # and retriever_output. Make retriever_output available + # to subsequence Retro layers. + if isinstance(hidden_states, tuple): + assert len(hidden_states) == 2 + hidden_states, retriever_output = hidden_states + forward_kwargs["retriever_output"] = retriever_output + + # Skip counter update for eval and activation checkpointing + if torch.is_grad_enabled() and self.training: + self.microbatch_count += 1 + + # Final layer norm. + if self.post_process and self.post_norm: + hidden_states = self.final_norm(hidden_states) + + return hidden_states + + def load_state_dict(self, state_dict, strict=True): + """Customize load.""" + + # Handle renaming layernorm -> norm in component names + state_dict_ = {} + for key in state_dict.keys(): + # Bypass TransformerEngine module parameters. + if "layernorm_qkv" in key or "layernorm_mlp" in key: + state_dict_[key] = state_dict[key] + continue + newkey = key.replace("layernorm", "norm") + state_dict_[newkey] = state_dict[key] + + super().load_state_dict(state_dict_, strict) diff --git a/megatron/legacy/model/utils.py b/megatron/legacy/model/utils.py new file mode 100644 index 0000000..5762000 --- /dev/null +++ b/megatron/legacy/model/utils.py @@ -0,0 +1,79 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for models.""" + +import math + +import torch + +from megatron.training import get_args +from megatron.legacy.model import LayerNorm, RMSNorm +from megatron.core.jit import jit_fuser + +def init_method_normal(sigma): + """Init method based on N(0, sigma).""" + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + + +def get_linear_layer(rows, columns, init_method): + """Simple linear layer with weight initialization.""" + layer = torch.nn.Linear(rows, columns) + if get_args().perform_initialization: + init_method(layer.weight) + with torch.no_grad(): + layer.bias.zero_() + return layer + + +@jit_fuser +def gelu_impl(x): + """OpenAI's gelu implementation.""" + return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * + + (1.0 + 0.044715 * x * x))) +def openai_gelu(x): + return gelu_impl(x) + + +#This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter +@jit_fuser +def erf_gelu(x): + return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) + + +def get_norm(config): + args = get_args() + if args.normalization == "LayerNorm": + return LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=not config.persist_layer_norm, + sequence_parallel=config.sequence_parallel, + apply_layernorm_1p=args.apply_layernorm_1p) + elif args.normalization == "RMSNorm": + if args.apply_layernorm_1p: + raise NotImplementedError('RMSNorm does not currently support the layernorm_1p formulation.') + + return RMSNorm(dim=config.hidden_size, + eps=config.layernorm_epsilon, + sequence_parallel=config.sequence_parallel) + else: + raise Exception(f"unsupported norm type '{args.normalization}'.") diff --git a/megatron/legacy/model/vision/classification.py b/megatron/legacy/model/vision/classification.py new file mode 100644 index 0000000..f9419c7 --- /dev/null +++ b/megatron/legacy/model/vision/classification.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Vision Transformer(VIT) model.""" + +import torch +from torch.nn.init import trunc_normal_ +from megatron.training import get_args +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.vision.vit_backbone import VitBackbone, VitMlpHead +from megatron.legacy.model.vision.mit_backbone import mit_b3_avg +from megatron.legacy.model.module import MegatronModule + +class VitClassificationModel(MegatronModule): + """Vision Transformer Model.""" + + def __init__(self, config, num_classes, finetune=False, + pre_process=True, post_process=True): + super(VitClassificationModel, self).__init__() + args = get_args() + self.config = config + + self.hidden_size = args.hidden_size + self.num_classes = num_classes + self.finetune = finetune + self.pre_process = pre_process + self.post_process = post_process + self.backbone = VitBackbone( + config=config, + pre_process=self.pre_process, + post_process=self.post_process, + single_token_output=True + ) + + if self.post_process: + if not self.finetune: + self.head = VitMlpHead(config, self.hidden_size, self.num_classes) + else: + self.head = get_linear_layer( + self.hidden_size, + self.num_classes, + torch.nn.init.zeros_ + ) + + def set_input_tensor(self, input_tensor): + """See megatron.legacy.model.transformer.set_input_tensor()""" + self.backbone.set_input_tensor(input_tensor) + + def forward(self, input): + hidden_states = self.backbone(input) + + if self.post_process: + hidden_states = self.head(hidden_states) + + return hidden_states + + +class MitClassificationModel(MegatronModule): + """Mix vision Transformer Model.""" + + def __init__(self, num_classes, + pre_process=True, post_process=True): + super(MitClassificationModel, self).__init__() + args = get_args() + + self.hidden_size = args.hidden_size + self.num_classes = num_classes + + self.backbone = mit_b3_avg() + self.head = torch.nn.Linear(512, num_classes) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, torch.nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, torch.nn.Linear) and m.bias is not None: + torch.nn.init.constant_(m.bias, 0) + + def set_input_tensor(self, input_tensor): + """See megatron.legacy.model.transformer.set_input_tensor()""" + pass + + def forward(self, input): + hidden_states = self.backbone(input) + hidden_states = self.head(hidden_states) + + return hidden_states diff --git a/megatron/legacy/model/vision/dino.py b/megatron/legacy/model/vision/dino.py new file mode 100644 index 0000000..20ca210 --- /dev/null +++ b/megatron/legacy/model/vision/dino.py @@ -0,0 +1,291 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the Apache license found in the +# LICENSE file in the root directory of this source tree. + +# copied from https://github.com/facebookresearch/dino/blob/main/main_dino.py +# reworked/refactored some parts to make it run in Megatron. +import math +import apex +import einops +import torch +import numpy as np +import torch.nn.functional as F +from torch.nn.init import trunc_normal_ +from megatron.training import get_args, print_rank_0 +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.vision.vit_backbone import VitBackbone +from megatron.legacy.model.module import MegatronModule +from megatron.legacy.model.vision.mit_backbone import mit_b5_avg +from megatron.legacy.model.vision.esvit_swin_backbone import get_swin + + +class DINOLoss(torch.nn.Module): + def __init__(self, out_dim, ncrops, warmup_teacher_temp, teacher_temp, + warmup_teacher_temp_epochs, nepochs, student_temp=0.1, + center_momentum=0.9): + super().__init__() + self.student_temp = student_temp + self.center_momentum = center_momentum + self.ncrops = ncrops + self.register_buffer("center", torch.zeros(1, out_dim)) + # we apply a warm up for the teacher temperature because + # a too high temperature makes the training instable at the beginning + self.teacher_temp_schedule = np.concatenate(( + np.linspace(warmup_teacher_temp, + teacher_temp, warmup_teacher_temp_epochs), + np.ones(nepochs - warmup_teacher_temp_epochs) * teacher_temp + )) + self.teacher_temp = teacher_temp + + def forward(self, student_output, teacher_output, iteration): + """ + Cross-entropy between softmax outputs of the teacher + and student network. + """ + args = get_args() + student_out = student_output / self.student_temp + student_out = student_out.chunk(self.ncrops) + + epoch = iteration // args.iter_per_epoch + + # teacher centering and sharpening + temp = self.teacher_temp_schedule[epoch] + teacher_out = F.softmax((teacher_output - self.center) / temp, dim=-1) + + teacher_out = teacher_out.detach().chunk(2) + + total_loss = 0 + n_loss_terms = 0 + for iq, q in enumerate(teacher_out): + for v in range(len(student_out)): + if v == iq: + # we skip cases where student and teacher operate on the same view + continue + loss = torch.sum(-q * F.log_softmax(student_out[v], dim=-1), dim=-1) + total_loss += loss.mean() + n_loss_terms += 1 + total_loss /= n_loss_terms + self.update_center(teacher_output) + return total_loss + + @torch.no_grad() + def update_center(self, teacher_output): + """ + Update center used for teacher output. + """ + batch_center = torch.sum(teacher_output, dim=0, keepdim=True) + torch.distributed.all_reduce(batch_center) + batch_center = batch_center / (len(teacher_output) * torch.distributed.get_world_size()) + self.center = self.center * self.center_momentum + batch_center * (1 - self.center_momentum) + +class DINOHead(torch.nn.Module): + def __init__(self, in_dim, out_dim, norm_last_layer=True, nlayers=3): + super().__init__() + args = get_args() + hidden_dim = args.dino_head_hidden_size + bottleneck_dim = args.dino_bottleneck_size + nlayers = max(nlayers, 1) + if nlayers == 1: + self.mlp = torch.nn.Linear(in_dim, bottleneck_dim) + else: + layers = [torch.nn.Linear(in_dim, hidden_dim)] + layers.append(torch.nn.GELU()) + for _ in range(nlayers - 2): + layers.append(torch.nn.Linear(hidden_dim, hidden_dim)) + layers.append(torch.nn.GELU()) + layers.append(torch.nn.Linear(hidden_dim, bottleneck_dim)) + self.mlp = torch.nn.Sequential(*layers) + self.apply(self._init_weights) + self.last_layer = torch.nn.utils.weight_norm(torch.nn.Linear(bottleneck_dim, out_dim, bias=False)) + self.last_layer.weight_g.data.fill_(1) + if norm_last_layer: + self.last_layer.weight_g.requires_grad = False + + def _init_weights(self, m): + if isinstance(m, torch.nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, torch.nn.Linear) and m.bias is not None: + torch.nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.mlp(x) + x = torch.nn.functional.normalize(x, dim=-1, p=2) + x = self.last_layer(x) + return x + + +class MultiCropWrapper(MegatronModule): + + """ + Perform forward pass separately on each resolution input. + The inputs corresponding to a single resolution are clubbed and single + forward is run on the same resolution inputs. Hence we do several + forward passes = number of different resolutions used. We then + concatenate all the output features and run the head forward on these + concatenated features. + """ + def __init__(self, backbone, head): + super(MultiCropWrapper, self).__init__() + # disable layers dedicated to ImageNet labels classification + #backbone.fc, backbone.head = torch.nn.Identity(), torch.nn.Identity() + self.backbone = backbone + self.head = head + + def forward(self, x): + # convert to list + if not isinstance(x, list): + x = [x] + idx_crops = torch.cumsum(torch.unique_consecutive( + torch.tensor([inp.shape[-1] for inp in x]), + return_counts=True, + )[1], 0) + + start_idx = 0 + for end_idx in idx_crops: + _out = self.backbone(torch.cat(x[start_idx: end_idx])) + if start_idx == 0: + output = _out + else: + output = torch.cat((output, _out)) + start_idx = end_idx + # Run the head forward on the concatenated features. + if self.training: + return self.head(output) + else: + return output + + +def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, + warmup_epochs=0, start_warmup_value=0): + warmup_schedule = np.array([]) + warmup_iters = warmup_epochs * niter_per_ep + if warmup_epochs > 0: + warmup_schedule = \ + np.linspace(start_warmup_value, base_value, warmup_iters) + + iters = np.arange(epochs * niter_per_ep - warmup_iters) + schedule = final_value + 0.5 * (base_value - final_value) \ + * (1 + np.cos(np.pi * iters / len(iters))) + + schedule = np.concatenate((warmup_schedule, schedule)) + assert len(schedule) == epochs * niter_per_ep + return schedule + + +def get_student_backbone_and_num_features(config, pre_process=True, post_process=True): + args = get_args() + + if args.vision_backbone_type == 'vit': + student = VitBackbone(config, + pre_process=pre_process, + post_process=post_process, + drop_path_rate=0.1, + single_token_output=True) + num_features = args.hidden_size + elif args.vision_backbone_type == 'mit': + student = mit_b5_avg(drop_path_rate=0.1) + num_features = 512 + elif args.vision_backbone_type == 'swin': + student = get_swin() + num_features = student.num_features + else: + raise Exception('{} vision backbone is not supported.'.format( + args.vision_backbone_type)) + + return student, num_features + +def get_teacher_backbone_and_num_features(config, pre_process=True, post_process=True): + args = get_args() + + if args.vision_backbone_type == 'vit': + teacher = VitBackbone(config, + pre_process=pre_process, + post_process=post_process, + single_token_output=True) + num_features = args.hidden_size + elif args.vision_backbone_type == 'mit': + teacher = mit_b5_avg(drop_path_rate=0.0) + num_features = 512 + elif args.vision_backbone_type == 'swin': + teacher = get_swin(is_teacher=True) + num_features = teacher.num_features + else: + raise Exception('{} vision backbone is not supported.'.format( + args.vision_backbone_type)) + return teacher, num_features + + +class DINOPretrainModel(MegatronModule): + def __init__(self, config, pre_process=True, post_process=True): + super(DINOPretrainModel, self).__init__() + args = get_args() + self.config = config + self.out_dim = 65536 + + self.dino_loss = DINOLoss( + self.out_dim, + args.dino_local_crops_number + 2, + args.dino_warmup_teacher_temp, + args.dino_teacher_temp, + args.dino_warmup_teacher_temp_epochs, + 300, + ) + + self.pre_process = pre_process + self.post_process = post_process + self.momentum_teacher = 0.996 + + student_backbone, num_features = \ + get_student_backbone_and_num_features(config, pre_process, post_process) + + self.student = MultiCropWrapper( + student_backbone, + DINOHead(num_features, self.out_dim, + norm_last_layer=args.dino_norm_last_layer) + ) + + self.momentum_schedule = cosine_scheduler( + self.momentum_teacher, 1, + args.train_iters // args.iter_per_epoch, + args.iter_per_epoch + ) + + teacher_backbone, num_features = \ + get_teacher_backbone_and_num_features(config, pre_process, post_process) + self.teacher = MultiCropWrapper( + teacher_backbone, + DINOHead(num_features, self.out_dim) + ) + self.teacher.load_state_dict(self.student.state_dict()) + + for p in self.teacher.parameters(): + if hasattr(p, "requires_grad") and p.requires_grad is not None: + p.requires_grad = False + + def set_input_tensor(self, tensor): + pass + + def forward(self, input): + student_output = None + if self.training: + student_output = self.student(input) + teacher_output = self.teacher(input[:2]) + else: + teacher_output = self.teacher(input) + return student_output, teacher_output + + def cancel_gradients_last_layer(self, iteration): + args = get_args() + epoch = iteration // args.iter_per_epoch + if epoch < args.dino_freeze_last_layer: + for n, p in self.student.named_parameters(): + if "last_layer" in n: + p.grad = None + + def update_momentum(self, iteration): + with torch.no_grad(): + m = self.momentum_schedule[iteration] + for param_q, param_k in zip(self.student.parameters(), self.teacher.parameters()): + param_k.data.mul_(m).add_((1 - m) * param_q.detach().data) + diff --git a/megatron/legacy/model/vision/esvit_swin_backbone.py b/megatron/legacy/model/vision/esvit_swin_backbone.py new file mode 100644 index 0000000..8793204 --- /dev/null +++ b/megatron/legacy/model/vision/esvit_swin_backbone.py @@ -0,0 +1,849 @@ +# Copyright (c) 2021 Microsoft +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# Modified by Chunyuan Li (chunyl@microsoft.com) +# Swin Transformer +# -------------------------------------------------------- + +import os +import logging +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial +import torch.distributed as dist +from torch.nn.init import trunc_normal_ +from megatron.legacy.model.transformer import DropPath +from megatron.training import get_args +from megatron.legacy.model import LayerNorm +import numpy as np +from math import sqrt + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, + out_features=None, act_layer=nn.GELU, drop=0.): + super(Mlp, self).__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + r"""Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): + + super(WindowAttention, self).__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2 Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0).type(attn.type()) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn_out = attn + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x, attn_out + + def extra_repr(self) -> str: + return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + return flops + + @staticmethod + def compute_macs(module, input, output): + B, N, C = input[0].shape + + module.__flops__ += module.flops(N) * B + + +class SwinTransformerBlock(nn.Module): + r"""Swin Transformer Block. + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, window_size=(self.window_size, self.window_size), num_heads=num_heads, + qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + self.H = input_resolution[0] + self.W = input_resolution[1] + + self.attn_mask_dict = {} + + + def create_attn_mask(self, H, W): + # calculate attention mask for SW-MSA + + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1)) # 1 Hp Wp 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + + return attn_mask + + + def forward(self, x): + B, L, C = x.shape + H = int(sqrt(L)) + W = H + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + + if H in self.attn_mask_dict.keys(): + attn_mask = self.attn_mask_dict[H] + else: + self.attn_mask_dict[H] = self.create_attn_mask(self.H, self.W).to(x.device) + attn_mask = self.attn_mask_dict[H] + + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows, attn = self.attn(x_windows, attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x, attn + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ + f"window_size={self.window_size}, shift_size={self.shift_size} mlp_ratio={self.mlp_ratio}" + + def flops(self): + flops = 0 + H, W = self.input_resolution + # norm1 + flops += self.dim * H * W + # W-MSA/SW-MSA + nW = H * W / self.window_size / self.window_size + flops += nW * self.attn.flops(self.window_size * self.window_size) + # mlp + flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio + # norm2 + flops += self.dim * H * W + return flops + + +class PatchMerging(nn.Module): + r"""Patch Merging Layer. + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + B, L, C = x.shape + H = int(sqrt(L)) + W = H + + x = x.view(B, H, W, C) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + + def extra_repr(self) -> str: + return f"input_resolution={self.input_resolution}, dim={self.dim}" + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.dim + flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + return flops + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + """ + + def __init__(self, dim, input_resolution, depth, num_heads, window_size, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, downsample=None): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + + self.blocks = nn.ModuleList([ + SwinTransformerBlock(dim=dim, input_resolution=input_resolution, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) + for i in range(depth)]) + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + x, _ = blk(x) + if self.downsample is not None: + x = self.downsample(x) + return x + + def forward_with_features(self, x): + fea = [] + for blk in self.blocks: + x, _ = blk(x) + fea.append(x) + if self.downsample is not None: + x = self.downsample(x) + return x, fea + + def forward_with_attention(self, x): + attns = [] + for blk in self.blocks: + x, attn = blk(x) + attns.append(attn) + if self.downsample is not None: + x = self.downsample(x) + return x, attns + + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None): + super().__init__() + img_size = (img_size, img_size) + patch_size = (patch_size, patch_size) + patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + B, C, H, W = x.shape + + x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x + + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + +class SwinTransformer(nn.Module): + r""" Swin Transformer + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + Args: + img_size (int | tuple(int)): Input image size. + patch_size (int | tuple(int)): Patch size. + in_chans (int): Number of input channels. + num_classes (int): Number of classes for classification head. + embed_dim (int): Embedding dimension. + depths (tuple(int)): Depth of Swin Transformer layers. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. + drop_path_rate (float): Stochastic depth rate. + norm_layer (nn.Module): normalization layer. + ape (bool): If True, add absolute position embedding to the patch embedding. + patch_norm (bool): If True, add normalization after patch embedding. + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], + window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, + norm_layer=nn.LayerNorm, ape=False, patch_norm=True, **kwargs): + super().__init__() + + self.num_classes = num_classes + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1d(1) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'absolute_pos_embed'} + + @torch.jit.ignore + def no_weight_decay_keywords(self): + # todo: to be implemented + return {'relative_position_bias_table'} + + def forward(self, x): + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x_region = self.norm(x) # B L C + x = self.avgpool(x_region.transpose(1, 2)) # B C 1 + x = torch.flatten(x, 1) + + return x + + + def forward_feature_maps(self, x): + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x_grid = self.norm(x) # B L C + x = self.avgpool(x_grid.transpose(1, 2)) # B C 1 + x = torch.flatten(x, 1) + + return x, x_grid + + + def forward_selfattention(self, x, n=1): + # n=1 return the last layer attn map; otherwise return attn maps in all layers + + + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + if n==1: + return self.forward_last_selfattention(x) + else: + return self.forward_all_selfattention(x) + + def forward_last_selfattention(self, x): + + for i, layer in enumerate(self.layers): + if i < len(self.layers) - 1: + x = layer(x) + else: + x, attns = layer.forward_with_attention(x) + return attns[-1] + + def forward_all_selfattention(self, x): + attn_out = [] + + for layer in self.layers: + x, attns = layer.forward_with_attention(x) + attn_out += attns + + return attn_out + + + def forward_return_n_last_blocks(self, x, n=1, return_patch_avgpool=False, depth=[]): + + num_blks = sum(depth) + start_idx = num_blks - n + + sum_cur = 0 + for i, d in enumerate(depth): + sum_cur_new = sum_cur + d + if start_idx >= sum_cur and start_idx < sum_cur_new: + start_stage = i + start_blk = start_idx - sum_cur + sum_cur = sum_cur_new + + + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + # we will return the averaged token features from the `n` last blocks + # note: there is no [CLS] token in Swin Transformer + output = [] + s = 0 + for i, layer in enumerate(self.layers): + x, fea = layer.forward_with_features(x) + + if i >= start_stage: + for x_ in fea[start_blk:]: + + if i == len(self.layers)-1: # use the norm in the last stage + x_ = self.norm(x_) + + x_avg = torch.flatten(self.avgpool(x_.transpose(1, 2)), 1) # B C + # print(f'Stage {i}, x_avg {x_avg.shape}') + output.append(x_avg) + + start_blk = 0 + + return torch.cat(output, dim=-1) + + + + def flops(self): + flops = 0 + flops += self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() + if dist.get_rank() == 0: + print(f"GFLOPs layer_{i}: {layer.flops() / 1e9}") + flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) + flops += self.num_features * self.num_classes + return flops + + def init_weights(self, pretrained='', pretrained_layers=[], verbose=True): + if os.path.isfile(pretrained): + pretrained_dict = torch.load(pretrained, map_location='cpu') + logging.info(f'=> loading pretrained model {pretrained}') + model_dict = self.state_dict() + pretrained_dict = { + k: v for k, v in pretrained_dict.items() + if k in model_dict.keys() + } + need_init_state_dict = {} + for k, v in pretrained_dict.items(): + need_init = ( + k.split('.')[0] in pretrained_layers + or pretrained_layers[0] is '*' + or 'relative_position_index' not in k + or 'attn_mask' not in k + ) + + if need_init: + if verbose: + logging.info(f'=> init {k} from {pretrained}') + + if 'relative_position_bias_table' in k and v.size() != model_dict[k].size(): + relative_position_bias_table_pretrained = v + relative_position_bias_table_current = model_dict[k] + L1, nH1 = relative_position_bias_table_pretrained.size() + L2, nH2 = relative_position_bias_table_current.size() + if nH1 != nH2: + logging.info(f"Error in loading {k}, passing") + else: + if L1 != L2: + logging.info( + '=> load_pretrained: resized variant: {} to {}' + .format((L1, nH1), (L2, nH2)) + ) + S1 = int(L1 ** 0.5) + S2 = int(L2 ** 0.5) + relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate( + relative_position_bias_table_pretrained.permute(1, 0).view(1, nH1, S1, S1), + size=(S2, S2), + mode='bicubic') + v = relative_position_bias_table_pretrained_resized.view(nH2, L2).permute(1, 0) + + if 'absolute_pos_embed' in k and v.size() != model_dict[k].size(): + absolute_pos_embed_pretrained = v + absolute_pos_embed_current = model_dict[k] + _, L1, C1 = absolute_pos_embed_pretrained.size() + _, L2, C2 = absolute_pos_embed_current.size() + if C1 != C1: + logging.info(f"Error in loading {k}, passing") + else: + if L1 != L2: + logging.info( + '=> load_pretrained: resized variant: {} to {}' + .format((1, L1, C1), (1, L2, C2)) + ) + S1 = int(L1 ** 0.5) + S2 = int(L2 ** 0.5) + absolute_pos_embed_pretrained = absolute_pos_embed_pretrained.reshape(-1, S1, S1, C1) + absolute_pos_embed_pretrained = absolute_pos_embed_pretrained.permute(0, 3, 1, 2) + absolute_pos_embed_pretrained_resized = torch.nn.functional.interpolate( + absolute_pos_embed_pretrained, size=(S2, S2), mode='bicubic') + v = absolute_pos_embed_pretrained_resized.permute(0, 2, 3, 1).flatten(1, 2) + + need_init_state_dict[k] = v + self.load_state_dict(need_init_state_dict, strict=False) + + def freeze_pretrained_layers(self, frozen_layers=[]): + for name, module in self.named_modules(): + if ( + name.split('.')[0] in frozen_layers + or '.'.join(name.split('.')[0:2]) in frozen_layers + or (len(frozen_layers) > 0 and frozen_layers[0] is '*') + ): + for _name, param in module.named_parameters(): + param.requires_grad = False + logging.info( + '=> set param {} requires grad to False' + .format(name) + ) + for name, param in self.named_parameters(): + if ( + name.split('.')[0] in frozen_layers + or (len(frozen_layers) > 0 and frozen_layers[0] is '*') + and param.requires_grad is True + ): + param.requires_grad = False + logging.info( + '=> set param {} requires grad to False' + .format(name) + ) + return self + + +def get_swin(is_teacher=False): + args = get_args() + + if args.swin_backbone_type == "tiny": + embed_dim = 96 + depths = [2, 2, 6, 2] + num_heads = [3, 6, 12, 24] + drop_path_rate = 0.1 + elif args.swin_backbone_type == 'h3': + embed_dim = 384 + depths = [2, 2, 18, 2] + num_heads = [6, 12, 24, 48] + drop_path_rate = 0.2 + else: + embed_dim = 128 + depths = [2, 2, 18, 2] + num_heads = [4, 8, 16, 32] + drop_path_rate = 0.2 + + swin = SwinTransformer( + img_size=224, + in_chans=3, + num_classes=1000, + patch_size=4, + embed_dim=embed_dim, + depths=depths, + num_heads=num_heads, + window_size=7, + mlp_ratio=4, + qkv_bias=True, + drop_rate=0, + attn_drop_rate=0, + drop_path_rate=(0.0 if is_teacher else drop_path_rate), + norm_layer=partial(LayerNorm, eps=1e-6), + ape=False, + patch_norm=True, + ) + + return swin + diff --git a/megatron/legacy/model/vision/inpainting.py b/megatron/legacy/model/vision/inpainting.py new file mode 100644 index 0000000..f71f5e3 --- /dev/null +++ b/megatron/legacy/model/vision/inpainting.py @@ -0,0 +1,152 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + +import math +import apex +import einops +import torch +import torch.nn.functional as F +from megatron.training import get_args, print_rank_0 +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.vision.vit_backbone import VitBackbone +from megatron.legacy.model.module import MegatronModule +from megatron.legacy.model.vision.mit_backbone import mit_b3 +from megatron.legacy.model.vision.utils import resize + + +class VitInpaintingModel(MegatronModule): + + def __init__(self, config, pre_process=True, post_process=True): + super(VitInpaintingModel, self).__init__() + args = get_args() + + self.config = config + self.pre_process = pre_process + self.post_process = post_process + self.hidden_size = config.hidden_size + self.backbone = VitBackbone( + config=config, + pre_process=self.pre_process, + post_process=self.post_process, + class_token=False, + ) + self.patch_dim = args.patch_dim + self.img_h = args.img_h + self.img_w = args.img_w + self.seq_length = args.seq_length + # full mask + + if self.post_process: + self.linear_decoder = get_linear_layer( + self.hidden_size, + self.backbone.flatten_dim, + torch.nn.init.zeros_ + ) + + def set_input_tensor(self, input_tensor): + self.backbone.set_input_tensor(input_tensor) + + def forward(self, input): + + hidden_states = self.backbone(input) + + if not self.post_process: + return hidden_states + decoded_output = self.linear_decoder(hidden_states) + output = einops.rearrange( + decoded_output, + "b (h w) (p1 p2 c) -> b c (h p1) (w p2)", + p1=self.patch_dim, + p2=self.patch_dim, + h=self.img_h//self.patch_dim, + w=self.img_w//self.patch_dim, + ) + + return output + + +class MLP(torch.nn.Module): + """ + Linear Embedding + """ + def __init__(self, input_dim=2048, embed_dim=768): + super().__init__() + self.proj = torch.nn.Linear(input_dim, embed_dim) + + def forward(self, x): + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + +class MitInpaintingModel(MegatronModule): + """Mix vision Transformer Model.""" + + def __init__(self, pre_process=True, post_process=True): + super(MitInpaintingModel, self).__init__() + self.pre_process = pre_process + self.post_process = post_process + + args = get_args() + self.patch_dim = args.patch_dim + self.img_h = args.img_h + self.img_w = args.img_w + self.flatten_dim = self.patch_dim * self.patch_dim * 3 + self.backbone = mit_b3() + + self.in_channels = [64, 128, 320, 512] + self.embedding_dim = 768 + + c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = self.in_channels + + self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=self.embedding_dim) + self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=self.embedding_dim) + self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=self.embedding_dim) + self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=self.embedding_dim) + + self.conv_fuse = torch.nn.Conv2d(self.embedding_dim*4, self.embedding_dim, 1, 1, bias=False) + self.norm = apex.parallel.SyncBatchNorm(self.embedding_dim) + self.dropout = torch.nn.Dropout2d(0.1) + + self.linear_pred = torch.nn.Conv2d(self.embedding_dim, self.flatten_dim, kernel_size=1) + + def set_input_tensor(self, input_tensor): + """See megatron.legacy.model.transformer.set_input_tensor()""" + pass + + def forward(self, input): + c1, c2, c3, c4 = self.backbone(input) + + n, _, h, w = c4.shape + _c4 = self.linear_c4(c4).permute(0, 2, 1).reshape(n, -1, c4.shape[2], c4.shape[3]) + _c4 = resize(_c4, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c3 = self.linear_c3(c3).permute(0, 2, 1).reshape(n, -1, c3.shape[2], c3.shape[3]) + _c3 = resize(_c3, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c2 = self.linear_c2(c2).permute(0, 2, 1).reshape(n, -1, c2.shape[2], c2.shape[3]) + _c2 = resize(_c2, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c1 = self.linear_c1(c1).permute(0, 2, 1).reshape(n, -1, c1.shape[2], c1.shape[3]) + + _c = torch.cat([_c4, _c3, _c2, _c1], dim=1) + _c = self.conv_fuse(_c) + + x = self.norm(_c) + x = F.relu(x, inplace=True) + x = self.dropout(x) + + x = self.linear_pred(x) + + output = einops.rearrange( + x, + "b (c p1 p2) h w -> b c (h p1) (w p2)", + p1=self.patch_dim, + p2=self.patch_dim, + h=self.img_h//self.patch_dim, + w=self.img_w//self.patch_dim, + ) + + return output diff --git a/megatron/legacy/model/vision/knn_monitor.py b/megatron/legacy/model/vision/knn_monitor.py new file mode 100644 index 0000000..ad796d1 --- /dev/null +++ b/megatron/legacy/model/vision/knn_monitor.py @@ -0,0 +1,129 @@ +import torch.nn.functional as F +import torch +from megatron.training import print_rank_0, get_args +from megatron.core import mpu +from megatron.legacy.data.vit_dataset import ClassificationTransform +from megatron.legacy.data.image_folder import ImageFolder + +_FEATURE_BANK = None + + +def build_data_loader(dataset, drop_last=True, shuffle=False): + """Data loader. Note that batch-size is the local (per GPU) batch-size.""" + # Sampler. + args = get_args() + micro_batch_size = 16 + num_workers = args.num_workers + world_size = mpu.get_data_parallel_world_size() + rank = mpu.get_data_parallel_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, + drop_last=drop_last, shuffle=shuffle + ) + + # Data loader. Note that batch size is the per GPU batch size. + data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=micro_batch_size, + sampler=sampler, + shuffle=False, + num_workers=num_workers, + drop_last=not drop_last, + pin_memory=True, + ) + return data_loader + + +def compute_feature_bank(model): + args = get_args() + global _FEATURE_BANK + feature_bank = [] + feature_label = [] + + train_ds = ImageFolder( + root=args.data_path[0], + transform=ClassificationTransform((args.img_h, args.img_w), train=False), + data_per_class_fraction=1.0 + ) + classes = len(train_ds.classes) + dataloader = build_data_loader(train_ds) + + for m in model: + m.eval() + + with torch.no_grad(): + for i, batch in enumerate(dataloader): + images = batch[0].cuda().contiguous() + labels = batch[1].cuda().contiguous() + student_feature, teacher_feature = model[0](images) + feature = F.normalize(teacher_feature.float(), dim=1) + feature_bank.append(feature) + feature_label.append(labels) + + for m in model: + m.train() + + # [N', D] + feature_bank = torch.cat(feature_bank, dim=0).contiguous() + feature_label = torch.cat(feature_label, dim=0).contiguous() + + feature_banks = [torch.zeros_like(feature_bank) + for i in range(mpu.get_data_parallel_world_size())] + torch.distributed.all_gather(feature_banks, + feature_bank, + group=mpu.get_data_parallel_group()) + + assert torch.all(torch.eq(feature_banks[mpu.get_data_parallel_rank()], + feature_bank)) + + feature_labels = [torch.zeros_like(feature_label) + for i in range(mpu.get_data_parallel_world_size())] + torch.distributed.all_gather(feature_labels, + feature_label, + group=mpu.get_data_parallel_group()) + + # [D, N] + feature_banks = torch.cat(feature_banks, dim=0).t().contiguous() + # [N] + feature_labels = torch.cat(feature_labels, dim=0).contiguous() + print_rank_0("feature_banks size is {}".format(feature_banks.size())) + print_rank_0("feature labels size is {}".format(feature_labels.size())) + + _FEATURE_BANK = (feature_banks, feature_labels, classes) + + +def get_feature_bank(): + global _FEATURE_BANK + assert _FEATURE_BANK is not None + return _FEATURE_BANK + + +# knn monitor as in InstDisc https://arxiv.org/abs/1805.01978 +# implementation follows http://github.com/zhirongw/lemniscate.pytorch and +# https://github.com/leftthomas/SimCLR +def knn_predict(feature, feature_bank, feature_labels, classes, knn_k, knn_t): + # compute cos similarity between each feature vector and feature bank ---> [B, N] + sim_matrix = torch.mm(feature, feature_bank) + # [B, K] + sim_weight, sim_indices = sim_matrix.topk(k=knn_k, dim=-1) + # [B, K] + sim_labels = torch.gather(feature_labels.expand(feature.size(0), -1), + dim=-1, + index=sim_indices) + sim_weight = (sim_weight / knn_t).exp() + + # counts for each class + one_hot_label = torch.zeros(feature.size(0) * knn_k, + classes, + device=sim_labels.device) + # [B*K, C] + one_hot_label = one_hot_label.scatter(dim=-1, + index=sim_labels.view(-1, 1), + value=1.0) + # weighted score ---> [B, C] + pred_scores = torch.sum( + one_hot_label.view(feature.size(0), -1, classes) * sim_weight.unsqueeze(dim=-1), + dim=1) + + pred_labels = pred_scores.argsort(dim=-1, descending=True) + return pred_labels diff --git a/megatron/legacy/model/vision/mit_backbone.py b/megatron/legacy/model/vision/mit_backbone.py new file mode 100644 index 0000000..3ca2303 --- /dev/null +++ b/megatron/legacy/model/vision/mit_backbone.py @@ -0,0 +1,415 @@ +# Copyright (c) 2023, NVIDIA Corporation. All rights reserved. + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial +from torch.nn.init import trunc_normal_ +from megatron.legacy.model.transformer import DropPath +from megatron.legacy.model import LayerNorm + + +class Mlp(nn.Module): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + sr_ratio=1): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." + + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.q = nn.Linear(dim, dim, bias=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = LayerNorm(dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + if self.sr_ratio > 1: + x_ = x.permute(0, 2, 1).reshape(B, C, H, W) + x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) + x_ = self.norm(x_) + kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + else: + kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + k, v = kv[0], kv[1] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=LayerNorm, sr_ratio=1): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + + return x + + +class OverlapPatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768): + super().__init__() + img_size = (img_size, img_size) + patch_size = (patch_size, patch_size) + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + self.norm = LayerNorm(embed_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + + return x, H, W + + +class MixVisionTransformer(nn.Module): + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0., + attn_drop_rate=0., drop_path_rate=0., norm_layer=LayerNorm, + depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], output_avg=False): + super().__init__() + self.num_classes = num_classes + self.depths = depths + self.output_avg = output_avg + + # patch_embed + self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_chans=in_chans, + embed_dim=embed_dims[0]) + self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2, in_chans=embed_dims[0], + embed_dim=embed_dims[1]) + self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2, in_chans=embed_dims[1], + embed_dim=embed_dims[2]) + self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2, in_chans=embed_dims[2], + embed_dim=embed_dims[3]) + + # transformer encoder + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + cur = 0 + self.block1 = nn.ModuleList([Block( + dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, + sr_ratio=sr_ratios[0]) + for i in range(depths[0])]) + self.norm1 = norm_layer(embed_dims[0]) + + cur += depths[0] + self.block2 = nn.ModuleList([Block( + dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, + sr_ratio=sr_ratios[1]) + for i in range(depths[1])]) + self.norm2 = norm_layer(embed_dims[1]) + + cur += depths[1] + self.block3 = nn.ModuleList([Block( + dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, + sr_ratio=sr_ratios[2]) + for i in range(depths[2])]) + self.norm3 = norm_layer(embed_dims[2]) + + cur += depths[2] + self.block4 = nn.ModuleList([Block( + dim=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, + sr_ratio=sr_ratios[3]) + for i in range(depths[3])]) + self.norm4 = norm_layer(embed_dims[3]) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def reset_drop_path(self, drop_path_rate): + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))] + cur = 0 + for i in range(self.depths[0]): + self.block1[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[0] + for i in range(self.depths[1]): + self.block2[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[1] + for i in range(self.depths[2]): + self.block3[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[2] + for i in range(self.depths[3]): + self.block4[i].drop_path.drop_prob = dpr[cur + i] + + def freeze_patch_emb(self): + self.patch_embed1.requires_grad = False + + def forward_features(self, x): + B = x.shape[0] + outs = [] + + # stage 1 + x, H, W = self.patch_embed1(x) + for i, blk in enumerate(self.block1): + x = blk(x, H, W) + x = self.norm1(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + # stage 2 + x, H, W = self.patch_embed2(x) + for i, blk in enumerate(self.block2): + x = blk(x, H, W) + x = self.norm2(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + # stage 3 + x, H, W = self.patch_embed3(x) + for i, blk in enumerate(self.block3): + x = blk(x, H, W) + x = self.norm3(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + # stage 4 + x, H, W = self.patch_embed4(x) + for i, blk in enumerate(self.block4): + x = blk(x, H, W) + x = self.norm4(x) + if not self.output_avg: + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + return outs + + def forward(self, x): + x = self.forward_features(x) + + if self.output_avg: + x = x[3].mean(dim=1) + + return x + + +class DWConv(nn.Module): + def __init__(self, dim=768): + super(DWConv, self).__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + x = x.transpose(1, 2).view(B, C, H, W) + x = self.dwconv(x) + x = x.flatten(2).transpose(1, 2) + + return x + +class mit_b0(MixVisionTransformer): + def __init__(self, **kwargs): + super(mit_b0, self).__init__( + patch_size=4, embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=0.1) + + +class mit_b1(MixVisionTransformer): + def __init__(self, **kwargs): + super(mit_b1, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=0.1) + + +class mit_b2(MixVisionTransformer): + def __init__(self, **kwargs): + super(mit_b2, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=0.1) + + +class mit_b3(MixVisionTransformer): + def __init__(self, **kwargs): + super(mit_b3, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=0.1) + +class mit_b3_avg(MixVisionTransformer): + def __init__(self, drop_path_rate=0.1, **kwargs): + super(mit_b3_avg, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=drop_path_rate, output_avg=True) + +class mit_b4(MixVisionTransformer): + def __init__(self, **kwargs): + super(mit_b4, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=0.1) + +class mit_b5(MixVisionTransformer): + def __init__(self, **kwargs): + super(mit_b5, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=0.1) + +class mit_b5_avg(MixVisionTransformer): + def __init__(self, drop_path_rate=0.1, **kwargs): + super(mit_b5_avg, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=drop_path_rate, output_avg=True) + diff --git a/megatron/legacy/model/vision/swin_backbone.py b/megatron/legacy/model/vision/swin_backbone.py new file mode 100644 index 0000000..231802c --- /dev/null +++ b/megatron/legacy/model/vision/swin_backbone.py @@ -0,0 +1,625 @@ +# Copyright (c) 2021 Microsoft +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# Swin Transformer +# -------------------------------------------------------- + +import torch +import torch.nn as nn +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +from math import sqrt + +from megatron.training import get_args +from functools import partial + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, + out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self) -> str: + return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + return flops + + +class SwinTransformerBlock(nn.Module): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, + qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + self.H = input_resolution[0] + self.W = input_resolution[1] + + self.attn_mask_dict = {} + + def create_attn_mask(self, H, W): + # calculate attention mask for SW-MSA + + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1)) # 1 Hp Wp 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + + return attn_mask + + + def forward(self, x): + B, L, C = x.shape + H = int(sqrt(L)) + W = H + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_x = x + + # partition windows + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ + f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" + + def flops(self): + flops = 0 + H, W = self.input_resolution + # norm1 + flops += self.dim * H * W + # W-MSA/SW-MSA + nW = H * W / self.window_size / self.window_size + flops += nW * self.attn.flops(self.window_size * self.window_size) + # mlp + flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio + # norm2 + flops += self.dim * H * W + return flops + + +class PatchMerging(nn.Module): + r""" Patch Merging Layer. + + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." + + x = x.view(B, H, W, C) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + def extra_repr(self) -> str: + return f"input_resolution={self.input_resolution}, dim={self.dim}" + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.dim + flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + return flops + + +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__(self, dim, input_resolution, depth, num_heads, window_size, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock(dim=dim, input_resolution=input_resolution, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) + for i in range(depth)]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + x_b4_ds = x + if self.downsample is not None: + x = self.downsample(x) + return x_b4_ds, x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Module): + r""" Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + + +class SwinTransformer(nn.Module): + r""" Swin Transformer + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + img_size (int | tuple(int)): Input image size. Default 224 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. Default: 3 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], + window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.3, + norm_layer=partial(nn.LayerNorm, eps=1e-6), ape=False, patch_norm=True, + use_checkpoint=False, output_avg=False, **kwargs): + super().__init__() + + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + self.img_size = to_2tuple(img_size) + self.patch_size = to_2tuple(patch_size) + self.output_avg = output_avg + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'absolute_pos_embed'} + + @torch.jit.ignore + def no_weight_decay_keywords(self): + return {'relative_position_bias_table'} + + def forward(self, x): + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + h = self.img_size[0] // self.patch_size[0] + w = self.img_size[1] // self.patch_size[1] + outs = [] + + for i, layer in enumerate(self.layers): + px, x = layer(x) + b, n, c = px.shape + + if i != len(self.layers) - 1 or not self.output_avg: + px = px.permute(0, 2, 1).contiguous() + px = px.reshape(b, c, h, w) + # is this a fair assumption ?? i think it's baked into the architecture + h, w = h//2, w//2 + outs.append(px) + + if self.output_avg: + return outs[-1].mean(dim=1) + + return outs + + def flops(self): + flops = 0 + flops += self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() + flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) + flops += self.num_features * self.num_classes + return flops + + +def get_swin(drop_path_rate=0.3, output_avg=False): + args = get_args() + + window_size = 7 + embed_dim = 128 + depths = [2, 2, 18, 2] + num_heads = [4, 8, 16, 32] + swin = SwinTransformer( + img_size=(args.img_h, args.img_w,), + in_chans=3, + patch_size=args.patch_dim, + embed_dim=embed_dim, + depths=depths, + num_heads=num_heads, + window_size=window_size, + drop_path_rate=drop_path_rate, + output_avg=output_avg, + ) + + return swin + diff --git a/megatron/legacy/model/vision/utils.py b/megatron/legacy/model/vision/utils.py new file mode 100644 index 0000000..b406891 --- /dev/null +++ b/megatron/legacy/model/vision/utils.py @@ -0,0 +1,27 @@ +import warnings +import torch +import torch.nn.functional as F + + +def resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > output_h: + if ((output_h > 1 and output_w > 1 and input_h > 1 + and input_w > 1) and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1)): + warnings.warn( + f'When align_corners={align_corners}, ' + 'the output would more aligned if ' + f'input size {(input_h, input_w)} is `x+1` and ' + f'out size {(output_h, output_w)} is `nx+1`') + if isinstance(size, torch.Size): + size = tuple(int(x) for x in size) + return F.interpolate(input, size, scale_factor, mode, align_corners) diff --git a/megatron/legacy/model/vision/vit_backbone.py b/megatron/legacy/model/vision/vit_backbone.py new file mode 100644 index 0000000..b46f6f7 --- /dev/null +++ b/megatron/legacy/model/vision/vit_backbone.py @@ -0,0 +1,248 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Vision Transformer(VIT) model.""" + +import math +import einops +import torch +import apex +import torch.nn.functional as F +from megatron.training import get_args +from megatron.legacy.model.transformer import ParallelTransformer +from megatron.legacy.model.utils import ( + get_linear_layer, + init_method_normal, + scaled_init_method_normal, +) +from megatron.legacy.model.module import MegatronModule + +CLASS_TOKEN_LENGTH = 8 + +class VitMlpHead(MegatronModule): + """Pooler layer. + + Pool hidden states of a specific token (for example start of the + sequence) and add a linear transformation followed by a tanh. + + Args: + hidden_size: hidden size + init_method: weight initialization method for the linear layer. + bias is set to zero. + """ + + def __init__(self, config, hidden_size, num_classes): + super(VitMlpHead, self).__init__() + self.config = config + self.dense_in = torch.nn.Linear(hidden_size, hidden_size) + self.relu = torch.nn.ReLU() + self.dense_out = torch.nn.Linear(hidden_size, num_classes) + torch.nn.init.constant_(self.dense_out.bias, -10) + + def forward(self, hidden_states): + # hidden_states: [b, 1, h] + # sequence_index: index of the token to pool. + dense_in_result = self.dense_in(hidden_states) + tanh_result = torch.tanh(dense_in_result) + dense_out_result = self.dense_out(tanh_result) + return dense_out_result + + +def isPerfectSquare(x): + if(x >= 0): + sr = math.sqrt(x) + return (int(sr) * int(sr) == x) + return False + + +def twod_interpolate_position_embeddings_hook( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, +): + + args = get_args() + num_patches_per_dim_h = args.img_h // args.patch_dim + num_patches_per_dim_w = args.img_w // args.patch_dim + num_patches = num_patches_per_dim_h * num_patches_per_dim_w + hidden_size = args.hidden_size + + key = prefix + "weight" + + assert key in state_dict + if key in state_dict: + input_param = state_dict[key] + + input_seq_len = input_param.shape[0] + assert(isPerfectSquare(input_seq_len) or isPerfectSquare(input_seq_len - CLASS_TOKEN_LENGTH)) + input_has_class_token = not isPerfectSquare(input_seq_len) + num_tok_input = input_seq_len - CLASS_TOKEN_LENGTH if input_has_class_token else input_seq_len + num_tok_output = num_patches + output_has_class_token = args.class_token_present + + # update input_param and load it to state_dict[key] + if input_has_class_token: + input_param_tok = input_param[:CLASS_TOKEN_LENGTH, :] + input_param_grid = input_param[CLASS_TOKEN_LENGTH:, :] + else: + input_param_tok = torch.zeros(CLASS_TOKEN_LENGTH, hidden_size) + input_param_grid = input_param + + assert input_param.shape[1] == hidden_size + + if num_tok_input != num_tok_output: + + gs_input = int(math.sqrt(num_tok_input)) + gs_new = (num_patches_per_dim_h, num_patches_per_dim_w) + + input_param_grid = input_param_grid.transpose(0, 1).contiguous() + input_param_grid = input_param_grid.reshape( + (1, -1, gs_input, gs_input) + ) + input_param_grid = input_param_grid.float() + scale_factor = (gs_new[0] / gs_input, gs_new[1] / gs_input) + + input_param_grid = F.interpolate( + input_param_grid, scale_factor=scale_factor, mode="bilinear" + ) + + input_param_grid = input_param_grid.half() + input_param_grid = input_param_grid.reshape((-1, num_tok_output)) + input_param_grid = input_param_grid.transpose(0, 1).contiguous() + + assert input_param_grid.shape[1] == hidden_size + + input_param = input_param_grid + assert ( + input_param.shape[0] == num_tok_output + and input_param.shape[1] == hidden_size + ) + + if output_has_class_token: + input_param = torch.cat((input_param_tok, input_param), dim=0) + + state_dict[key] = input_param + + +class VitBackbone(MegatronModule): + """Vision Transformer Model.""" + + def __init__(self, + config, + pre_process=True, + post_process=True, + class_token=True, + single_token_output=False, + post_layer_norm=True, + drop_path_rate=0.0): + super(VitBackbone, self).__init__(share_embeddings_and_output_weights=False) + args = get_args() + self.config = config + + self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy + + self.pre_process = pre_process + self.post_process = post_process + self.class_token = class_token + self.post_layer_norm = post_layer_norm + self.hidden_size = args.hidden_size + self.patch_dim = args.patch_dim + self.img_h = args.img_h + self.img_w = args.img_w + self.micro_batch_size = args.micro_batch_size + self.single_token_output = single_token_output + self.drop_path_rate = drop_path_rate + + assert self.img_h % self.patch_dim == 0 + assert self.img_w % self.patch_dim == 0 + self.num_patches_per_dim_h = self.img_h // self.patch_dim + self.num_patches_per_dim_w = self.img_w // self.patch_dim + self.num_patches = self.num_patches_per_dim_h * self.num_patches_per_dim_w + self.seq_length = self.num_patches + (CLASS_TOKEN_LENGTH if self.class_token else 0) + self.flatten_dim = self.patch_dim * self.patch_dim * args.num_channels + self.input_tensor = None + self.position_ids = None + + if self.pre_process: + # cls_token + if self.class_token: + self.cls_token = torch.nn.Parameter( + torch.randn(1, CLASS_TOKEN_LENGTH, self.hidden_size) + ) + torch.nn.init.zeros_(self.cls_token) + self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda() + + # Linear encoder + self.linear_encoder = torch.nn.Linear( + self.flatten_dim, self.hidden_size + ) + + # embedding + self.position_embeddings = torch.nn.Embedding( + self.seq_length, self.hidden_size + ) + init_method_normal(args.init_method_std)( + self.position_embeddings.weight + ) + + args.class_token_present = self.class_token + self.position_embeddings._register_load_state_dict_pre_hook( + twod_interpolate_position_embeddings_hook + ) + + self.embedding_dropout = torch.nn.Dropout(args.hidden_dropout) + + # Transformer + self.transformer = ParallelTransformer( + config, + model_type=args.model_type, + pre_process=self.pre_process, + post_process=self.post_process, + post_layer_norm=self.post_layer_norm, + drop_path_rate=self.drop_path_rate + ) + + def set_input_tensor(self, input_tensor): + """See megatron.legacy.model.transformer.set_input_tensor()""" + self.transformer.set_input_tensor(input_tensor) + + def forward(self, input): + + if self.pre_process: + rearranged_input = einops.rearrange( + input, + "b c (h p1) (w p2) -> b (h w) (p1 p2 c)", + p1=self.patch_dim, + p2=self.patch_dim, + ) + + assert rearranged_input.dtype == torch.half + encoder_output = self.linear_encoder(rearranged_input) + + concatenated_tokens = encoder_output + if self.class_token: + cls_tokens = self.cls_token.expand(encoder_output.shape[0], -1, -1) + concatenated_tokens = torch.cat((cls_tokens, encoder_output), dim=1) + + token_embeddings = concatenated_tokens + \ + self.position_embeddings(self.position_ids[:, :concatenated_tokens.shape[1]]) + # [b, s, h] => [s, b, h] + token_embeddings = token_embeddings.transpose(0, 1).contiguous() + hidden_states = self.embedding_dropout(token_embeddings) + else: + hidden_states = input + + hidden_states = self.transformer(hidden_states, None) + + if self.post_process: + # [s b h] => [b s h] + if self.single_token_output: + hidden_states = hidden_states[0] + else: + hidden_states = hidden_states.transpose(0, 1).contiguous() + + return hidden_states + diff --git a/megatron/legacy/mpu/tests/__init__.py b/megatron/legacy/mpu/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/megatron/legacy/mpu/tests/commons.py b/megatron/legacy/mpu/tests/commons.py new file mode 100644 index 0000000..611daf0 --- /dev/null +++ b/megatron/legacy/mpu/tests/commons.py @@ -0,0 +1,70 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import argparse +import os +import random +import numpy +import torch + +import mpu + + +class IdentityLayer(torch.nn.Module): + def __init__(self, size, scale=1.0): + super(IdentityLayer, self).__init__() + self.weight = torch.nn.Parameter(scale * torch.randn(size)) + + def forward(self): + return self.weight + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + random.seed(seed) + numpy.random.seed(seed) + torch.manual_seed(seed) + mpu.model_parallel_cuda_manual_seed(seed) + + +def initialize_distributed(backend='nccl'): + """Initialize torch.distributed.""" + # Get local rank in case it is provided. + parser = argparse.ArgumentParser() + parser.add_argument('--local_rank', type=int, default=None, + help='local rank passed from distributed launcher') + args = parser.parse_args() + local_rank = args.local_rank + + # Get rank and world size. + rank = int(os.getenv('RANK', '0')) + world_size = int(os.getenv("WORLD_SIZE", '1')) + + print('> initializing torch.distributed with local rank: {}, ' + 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) + + # Set the device id. + device = rank % torch.cuda.device_count() + if local_rank is not None: + device = local_rank + torch.cuda.set_device(device) + + # Call the init process. + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + torch.distributed.init_process_group( + backend=backend, + world_size=world_size, + rank=rank, + init_method=init_method) + + +def print_separator(message): + torch.distributed.barrier() + filler_len = (78 - len(message)) // 2 + filler = '-' * filler_len + string = '\n' + filler + ' {} '.format(message) + filler + if torch.distributed.get_rank() == 0: + print(string, flush=True) + torch.distributed.barrier() diff --git a/megatron/legacy/mpu/tests/test_cross_entropy.py b/megatron/legacy/mpu/tests/test_cross_entropy.py new file mode 100644 index 0000000..00ae422 --- /dev/null +++ b/megatron/legacy/mpu/tests/test_cross_entropy.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from commons import set_random_seed +from commons import IdentityLayer +from commons import print_separator +from commons import initialize_distributed +from mpu.cross_entropy import vocab_parallel_cross_entropy +import mpu +import torch.nn.functional as F +import torch +import random +import sys +sys.path.append("../..") + + +def torch_cross_entropy(batch_size, seq_length, vocab_size, + logits_scale, seed): + set_random_seed(seed) + identity = IdentityLayer((batch_size, seq_length, vocab_size), + scale=logits_scale).cuda() + logits = identity() + target = torch.cuda.LongTensor( + size=(batch_size, seq_length)).random_(0, vocab_size) + loss = F.cross_entropy(logits.view(-1, logits.size()[-1]), + target.view(-1), + reduction='none').view_as(target).mean() + loss.backward() + return loss, identity.weight.grad + + +def mpu_cross_entropy(batch_size, seq_length, vocab_size, + logits_scale, seed): + set_random_seed(seed) + identity = IdentityLayer((batch_size, seq_length, vocab_size), + scale=logits_scale).cuda() + logits = identity() + logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits) + target = torch.cuda.LongTensor( + size=(batch_size, seq_length)).random_(0, vocab_size) + loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() + loss.backward() + return loss, identity.weight.grad + + +def test_cross_entropy(tensor_model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing cross entropy with model parallel size {} ...'. + format(tensor_model_parallel_size)) + + mpu.initialize_model_parallel(tensor_model_parallel_size) + tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() + + batch_size = 13 + seq_length = 17 + vocab_size_per_partition = 11 + logits_scale = 1000.0 + vocab_size = vocab_size_per_partition * tensor_model_parallel_size + seed = 1234 + + loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, + vocab_size, logits_scale, + seed) + loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, + vocab_size, logits_scale, + seed) + + error = loss_torch.sub_(loss_mpu).abs().max() + print(' max error in loss on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = grad_torch.sub_(grad_mpu).abs().max() + print(' max error in grad on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_tensor_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + tensor_model_parallel_size = 1 + while tensor_model_parallel_size <= world_size: + print_separator('test cross entropy') + test_cross_entropy(tensor_model_parallel_size) + tensor_model_parallel_size *= 2 diff --git a/megatron/legacy/mpu/tests/test_data.py b/megatron/legacy/mpu/tests/test_data.py new file mode 100644 index 0000000..c30bf4b --- /dev/null +++ b/megatron/legacy/mpu/tests/test_data.py @@ -0,0 +1,75 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from commons import print_separator +from commons import initialize_distributed +from mpu import data as data_utils +import mpu +import torch +import functools +import operator +import sys +sys.path.append("../..") + + +def test_broadcast_data(tensor_model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing broadcast_data with model parallel size {} ...'. + format(tensor_model_parallel_size)) + + mpu.initialize_model_parallel(tensor_model_parallel_size) + torch.manual_seed(1234 + mpu.get_data_parallel_rank()) + tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() + + key_size_t = {'key1': [7, 11], + 'key2': [8, 2, 1], + 'key3': [13], + 'key4': [5, 1, 2], + 'key5': [5, 12]} + keys = list(key_size_t.keys()) + + data = {} + data_t = {} + for key in key_size_t: + data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) + data_t[key] = data[key].clone() + data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) + data_t['keyX'] = data['keyX'].clone() + if mpu.get_tensor_model_parallel_rank() != 0: + data = None + + data_utils._check_data_types(keys, data_t, torch.int64) + key_size, key_numel, \ + total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) + for key in keys: + assert key_size[key] == key_size_t[key] + total_numel_t = 0 + for key in keys: + target_size = functools.reduce(operator.mul, key_size_t[key], 1) + assert key_numel[key] == target_size + total_numel_t += target_size + assert total_numel == total_numel_t + + data_b = data_utils.broadcast_data(keys, data, torch.int64) + for key in keys: + tensor = data_t[key].cuda() + assert data_b[key].sub(tensor).abs().max() == 0 + + # Reset groups + mpu.destroy_tensor_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + tensor_model_parallel_size = 1 + while tensor_model_parallel_size <= world_size: + print_separator('test test broadcast data') + test_broadcast_data(tensor_model_parallel_size) + tensor_model_parallel_size *= 2 diff --git a/megatron/legacy/mpu/tests/test_initialize.py b/megatron/legacy/mpu/tests/test_initialize.py new file mode 100644 index 0000000..e5d2be3 --- /dev/null +++ b/megatron/legacy/mpu/tests/test_initialize.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from commons import print_separator +from commons import initialize_distributed +import mpu +import torch +import sys +sys.path.append("../..") + + +def test_initialize_model_parallel(tensor_model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing initialize_model_parallel with size {} ...'.format( + tensor_model_parallel_size)) + tensor_model_parallel_size_ = min(tensor_model_parallel_size, + torch.distributed.get_world_size()) + assert not mpu.model_parallel_is_initialized() + mpu.initialize_model_parallel(tensor_model_parallel_size_) + assert mpu.model_parallel_is_initialized() + + # Checks. + def check(group, world_size, rank): + assert world_size == torch.distributed.get_world_size(group=group) + assert rank == torch.distributed.get_rank(group=group) + + # Model parallel. + world_size = tensor_model_parallel_size_ + rank = torch.distributed.get_rank() % tensor_model_parallel_size_ + assert world_size == mpu.get_tensor_model_parallel_world_size() + assert rank == mpu.get_tensor_model_parallel_rank() + check(mpu.get_tensor_model_parallel_group(), world_size, rank) + + # Data parallel. + world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_ + rank = torch.distributed.get_rank() // tensor_model_parallel_size + assert world_size == mpu.get_data_parallel_world_size() + assert rank == mpu.get_data_parallel_rank() + check(mpu.get_data_parallel_group(), world_size, rank) + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_): + + if torch.distributed.get_rank() == 0: + print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format( + tensor_model_parallel_size_)) + tensor_model_parallel_size = min(tensor_model_parallel_size_, + torch.distributed.get_world_size()) + assert not mpu.model_parallel_is_initialized() + mpu.initialize_model_parallel(tensor_model_parallel_size) + assert mpu.model_parallel_is_initialized() + + # Checks + src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank() + assert mpu.get_tensor_model_parallel_src_rank() == src_rank + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + tensor_model_parallel_size = 1 + while tensor_model_parallel_size <= world_size: + print_separator('test initialize model parallel') + test_initialize_model_parallel(tensor_model_parallel_size) + print_separator('test model parallel source rank') + test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size) + tensor_model_parallel_size *= 2 diff --git a/megatron/legacy/mpu/tests/test_layers.py b/megatron/legacy/mpu/tests/test_layers.py new file mode 100644 index 0000000..73ad4b9 --- /dev/null +++ b/megatron/legacy/mpu/tests/test_layers.py @@ -0,0 +1,517 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from mpu import layers +from commons import set_random_seed +from commons import print_separator +from commons import initialize_distributed +import mpu +from torch.nn.parameter import Parameter +import torch.nn.init as init +import torch +import random +import sys +sys.path.append("../..") + + +def test_parallel_embedding(tensor_model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing parallel embedding with model parallel size {} ...'. + format(tensor_model_parallel_size)) + + mpu.initialize_model_parallel(tensor_model_parallel_size) + tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() + + batch_size = 17 + seq_length = 23 + vocab_size = 48 + hidden_size = 16 + seed = 1236 + + set_random_seed(123) + input_data = torch.LongTensor( + size=(batch_size, seq_length)).random_(0, vocab_size).cuda() + loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda() + + set_random_seed(seed) + embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda() + + output = embedding_original(input_data) + loss_original = torch.mul(output, loss_weight).sum() + loss_original.backward() + + set_random_seed(seed) + embedding_parallel = layers.ParallelEmbedding( + vocab_size, hidden_size, init_method=init.normal_).cuda() + output = embedding_parallel(input_data) + loss_parallel = torch.mul(output, loss_weight).sum() + loss_parallel.backward() + + set_random_seed(seed) + embedding_vocab_parallel = layers.VocabParallelEmbedding( + vocab_size, hidden_size, init_method=init.normal_).cuda() + output = embedding_vocab_parallel(input_data) + loss_vocab_parallel = torch.mul(output, loss_weight).sum() + loss_vocab_parallel.backward() + + torch.distributed.barrier() + error = loss_parallel.sub(loss_original).abs() + print(' error in loss (parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + torch.distributed.barrier() + error = loss_vocab_parallel.sub(loss_original).abs() + print(' error in loss (vocab parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + weight_grad_orig = torch.split(embedding_original.weight.grad, + hidden_size // tensor_model_parallel_size, + 1)[mpu.get_tensor_model_parallel_rank()] + error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max() + print(' error in grad (parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + weight_grad_orig = torch.split(embedding_original.weight.grad, + vocab_size // tensor_model_parallel_size, + 0)[mpu.get_tensor_model_parallel_rank()] + error = embedding_vocab_parallel.weight.grad.sub( + weight_grad_orig).abs().max() + print(' error in grad (vocab parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_initialize_affine_weight(tensor_model_parallel_size): + + mpu.initialize_model_parallel(tensor_model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing initialize_affine_weight with model parallel ' + 'size: {}'.format(tensor_model_parallel_size)) + tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() + + seed = 12345 + input_size_coeff = 13 + input_size = input_size_coeff * tensor_model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * tensor_model_parallel_size + + # --------------- + # Column parallel + # --------------- + weight = torch.empty(output_size_coeff, input_size) + set_random_seed(seed) + layers._initialize_affine_weight(weight, output_size, input_size, + + output_size_coeff, 0, + torch.nn.init.normal_) + # Target. + set_random_seed(seed) + master_weight = torch.empty(output_size, input_size) + torch.nn.init.normal_(master_weight) + rank = mpu.get_tensor_model_parallel_rank() + my_weight = torch.split(master_weight, output_size_coeff, + dim=0)[rank].contiguous().clone() + + # Compare. + error = weight.sub(my_weight).abs().max() + torch.distributed.barrier() + print(' column parallel max error (should be zero) on global rank ' + '{}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # ------------ + # Row parallel + # ------------ + weight = torch.empty(output_size, input_size_coeff) + set_random_seed(seed) + mpu.layers._initialize_affine_weight(weight, output_size, input_size, + input_size_coeff, 1, + torch.nn.init.normal_) + # Target. + set_random_seed(seed) + master_weight = torch.empty(output_size, input_size) + torch.nn.init.normal_(master_weight) + rank = mpu.get_tensor_model_parallel_rank() + my_weight = torch.split(master_weight, input_size_coeff, + dim=1)[rank].contiguous().clone() + + # Compare. + error = weight.sub(my_weight).abs().max() + torch.distributed.barrier() + print(' row parallel max error (should be zero) on global rank ' + '{}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +class IdentityLayer2D(torch.nn.Module): + def __init__(self, m, n): + super(IdentityLayer2D, self).__init__() + self.weight = Parameter(torch.Tensor(m, n)) + torch.nn.init.xavier_normal_(self.weight) + + def forward(self): + return self.weight + + +def test_column_parallel_linear(tensor_model_parallel_size): + + mpu.initialize_model_parallel(tensor_model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing ColumnParallelLinear with model parallel ' + 'size: {}'.format(tensor_model_parallel_size)) + tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + input_size_coeff = 13 + input_size = input_size_coeff * tensor_model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * tensor_model_parallel_size + batch_size = 7 + + # Network + identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + linear_layer = mpu.ColumnParallelLinear( + input_size, output_size, keep_master_weight_for_test=True).cuda() + loss_weight = torch.randn([batch_size, output_size]).cuda() + # Forward + input_ = identity_layer() + output = linear_layer(input_) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + # Values. + dLdY = loss_weight + X = identity_layer.weight + A = linear_layer.master_weight.cuda() + dLdA = torch.matmul(dLdY.t(), X) + dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdX = torch.matmul(dLdY, A) + + rank = mpu.get_tensor_model_parallel_rank() + my_dLdA = torch.split(dLdA, output_size_coeff, + dim=0)[rank].contiguous().clone() + error = my_dLdA.sub(linear_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdA on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + my_dLdb = torch.split(dLdb, output_size_coeff, + dim=0)[rank].contiguous().clone() + error = my_dLdb.sub(linear_layer.bias.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdb on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdX.sub(identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdX on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +def test_row_parallel_linear(tensor_model_parallel_size): + + mpu.initialize_model_parallel(tensor_model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing RowParallelLinear with model parallel ' + 'size: {}'.format(tensor_model_parallel_size)) + tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + input_size_coeff = 13 + input_size = input_size_coeff * tensor_model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * tensor_model_parallel_size + batch_size = 7 + + # Network + identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + linear_layer = mpu.RowParallelLinear( + input_size, output_size, keep_master_weight_for_test=True).cuda() + loss_weight = torch.randn([batch_size, output_size]).cuda() + # Forward + input_ = identity_layer() + output = linear_layer(input_) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + # Values. + dLdY = loss_weight + X = identity_layer.weight + A = linear_layer.master_weight.cuda() + dLdA = torch.matmul(dLdY.t(), X) + dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdX = torch.matmul(dLdY, A) + + rank = mpu.get_tensor_model_parallel_rank() + my_dLdA = torch.split(dLdA, input_size_coeff, + dim=1)[rank].contiguous().clone() + error = my_dLdA.sub(linear_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdA on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdb.sub(linear_layer.bias.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdb on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdX.sub(identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdX on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +class IdentityLayer3D(torch.nn.Module): + def __init__(self, m, n, k): + super(IdentityLayer3D, self).__init__() + self.weight = Parameter(torch.Tensor(m, n, k)) + torch.nn.init.xavier_normal_(self.weight) + + def forward(self): + return self.weight + + +def parallel_self_attention(tensor_model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, dropout_prob, batch_size, + sequence_length): + mpu.initialize_model_parallel(tensor_model_parallel_size) + tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + + num_att_heads = num_att_heads_per_partition * \ + torch.distributed.get_world_size() + hidden_size = hidden_size_per_att_head * num_att_heads + + # Network + identity_layer = IdentityLayer3D(batch_size, sequence_length, + hidden_size).cuda() + attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads, + dropout_prob).cuda() + loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda() + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + # Forward + input_ = identity_layer() + output = attention_layer(input_, attention_mask) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + rank = mpu.get_tensor_model_parallel_rank() + mpu.destroy_model_parallel() + return rank, hidden_size, tensor_model_parallel_size, loss, \ + attention_layer, identity_layer + + +def test_parallel_self_attention(tensor_model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing ParallelSelfAttention with model parallel ' + 'size: {}'.format(tensor_model_parallel_size)) + + num_att_heads_per_partition = 3 + hidden_size_per_att_head = 7 + dropout_prob = 0.0 # has to be zero + batch_size = 5 + sequence_length = 13 + + rank_1, hideen_size_1, tensor_model_parallel_size_1, loss_1, \ + attention_layer_1, identity_layer_1 = parallel_self_attention( + 1, num_att_heads_per_partition, + hidden_size_per_att_head, dropout_prob, batch_size, sequence_length) + + rank, hidden_size, tensor_model_parallel_size, loss, \ + attention_layer, identity_layer = parallel_self_attention( + tensor_model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, dropout_prob, batch_size, sequence_length) + assert hideen_size_1 == hidden_size + + error = loss_1.sub(loss).abs().max() + torch.distributed.barrier() + print(' loss error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + my_lin_grad_list = torch.split( + attention_layer_1.query_key_value.weight.grad, + hidden_size // tensor_model_parallel_size, 0)[rank::tensor_model_parallel_size] + my_lin_grad = torch.cat(my_lin_grad_list, dim=0) + error = my_lin_grad.sub( + attention_layer.query_key_value.weight.grad).abs().max() + torch.distributed.barrier() + print(' weight gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + error = identity_layer_1.weight.grad.sub( + identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' input gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +def parallel_transformer(tensor_model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length): + + mpu.initialize_model_parallel(tensor_model_parallel_size) + tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + + num_att_heads = num_att_heads_per_partition * \ + torch.distributed.get_world_size() + hidden_size = hidden_size_per_att_head * num_att_heads + intermediate_size = 4 * hidden_size + + # Network + identity_layer = IdentityLayer3D(batch_size, sequence_length, + hidden_size).cuda() + transformer_layer = mpu.BertParallelTransformerLayer( + hidden_size, intermediate_size, num_att_heads, 0.0, 0.0, + torch.nn.functional.relu, 1.0e-5).cuda() + + loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda() + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + # Forward + input_ = identity_layer() + output = transformer_layer(input_, attention_mask) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + rank = mpu.get_tensor_model_parallel_rank() + mpu.destroy_model_parallel() + return rank, hidden_size, tensor_model_parallel_size, loss, \ + transformer_layer, identity_layer + + +def test_parallel_transformer_layer(tensor_model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing ParallelTransformerLayer with model parallel ' + 'size: {}'.format(tensor_model_parallel_size)) + + num_att_heads_per_partition = 3 + hidden_size_per_att_head = 7 + batch_size = 5 + sequence_length = 13 + + rank_1, hidden_size_1, tensor_model_parallel_size_1, loss_1, \ + transformer_layer_1, identity_layer_1 = parallel_transformer( + 1, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length) + + rank, hidden_size, tensor_model_parallel_size, loss, \ + transformer_layer, identity_layer = parallel_transformer( + tensor_model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length) + + error = loss_1.sub(loss).abs().max() + torch.distributed.barrier() + print(' loss error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-5, 'error: {}'.format(error) + + error = identity_layer_1.weight.grad.sub( + identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' input gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-5, 'error: {}'.format(error) + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +if __name__ == '__main__': + + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + print_separator('test initialize affine weight') + tensor_model_parallel_size = 1 + while tensor_model_parallel_size <= world_size: + test_initialize_affine_weight(tensor_model_parallel_size) + tensor_model_parallel_size *= 2 + + tensor_model_parallel_size = 1 + while tensor_model_parallel_size <= world_size: + print_separator('test parallel embedding') + test_parallel_embedding(tensor_model_parallel_size) + tensor_model_parallel_size *= 2 + + print_separator('test column-parallel linear') + tensor_model_parallel_size = 1 + while tensor_model_parallel_size <= world_size: + test_column_parallel_linear(tensor_model_parallel_size) + tensor_model_parallel_size *= 2 + + print_separator('test row-parallel linear') + tensor_model_parallel_size = 1 + while tensor_model_parallel_size <= world_size: + test_row_parallel_linear(tensor_model_parallel_size) + tensor_model_parallel_size *= 2 + + print_separator('test parallel self-attention') + tensor_model_parallel_size = 1 + while tensor_model_parallel_size <= world_size: + test_parallel_self_attention(tensor_model_parallel_size) + tensor_model_parallel_size *= 2 + + print_separator('test parallel transformer') + tensor_model_parallel_size = 1 + while tensor_model_parallel_size <= world_size: + test_parallel_transformer_layer(tensor_model_parallel_size) + tensor_model_parallel_size *= 2 diff --git a/megatron/legacy/mpu/tests/test_random.py b/megatron/legacy/mpu/tests/test_random.py new file mode 100644 index 0000000..2609277 --- /dev/null +++ b/megatron/legacy/mpu/tests/test_random.py @@ -0,0 +1,191 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from commons import print_separator +from commons import initialize_distributed +import mpu +import torch +import sys +sys.path.append("../..") + + +def test_set_cuda_rng_state(tensor_model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing set_rng_state with size {} ...'. + format(tensor_model_parallel_size)) + + mpu.initialize_model_parallel(tensor_model_parallel_size) + tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() + + size = 123 + seed = 1234 + torch.cuda.manual_seed(1234) + tensor = torch.tensor(size, dtype=torch.float, device='cuda') + + # Get the state + rng_state = torch.cuda.get_rng_state() + rng_state_copy = rng_state.clone() + + # Do some stuff. + for _ in range(5): + torch.randn(size, out=tensor) + result_1 = tensor.clone() + + assert rng_state.sub(rng_state_copy).max() == 0 + assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 + + # State should be different. + new_rng_state = torch.cuda.get_rng_state() + max_diff = new_rng_state.sub(rng_state).max() + print(' max diff in rng state (should be non-zero) on global rank {}: {}'. + format(torch.distributed.get_rank(), max_diff)) + assert max_diff > 0 + + # Reset the rng state and do the same stuff. + mpu.random._set_cuda_rng_state(rng_state) + for _ in range(5): + torch.randn(size, out=tensor) + mpu.random._set_cuda_rng_state(rng_state) + for _ in range(5): + torch.randn(size, out=tensor) + result_2 = tensor.clone() + + # Results should be the same + error = result_2.sub(result_1).abs().max() + print(' max error in generated tensors (should be zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Input state should have remained intact. + error = rng_state.sub(rng_state_copy).max() + print(' max error in rng state (should be zero) on global rank {}: {}'. + format(torch.distributed.get_rank(), error)) + assert error == 0 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_cuda_rng_tracker(tensor_model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing cuda rng tracker with size {} ...'. + format(tensor_model_parallel_size)) + + mpu.initialize_model_parallel(tensor_model_parallel_size) + tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() + + seed_1 = 1234 + seed_2 = 4321 + size = [12, 21] + tensor = torch.tensor(size, dtype=torch.float, device='cuda') + + # Set to seed_1 and generate two tensors. + torch.cuda.manual_seed(seed_1) + torch.randn(size, out=tensor) + target_11 = tensor.clone() + torch.randn(size, out=tensor) + target_12 = tensor.clone() + + # Set to seed_2 and generate two tensors. + torch.cuda.manual_seed(seed_2) + torch.randn(size, out=tensor) + target_21 = tensor.clone() + torch.randn(size, out=tensor) + target_22 = tensor.clone() + + # Now if we interleave seed_1 and seed_2, + # we should still get the same tensors + torch.cuda.manual_seed(seed_1) + mpu.get_cuda_rng_tracker().add('test', seed_2) + + torch.randn(size, out=tensor) + result_11 = tensor.clone() + + with mpu.get_cuda_rng_tracker().fork('test'): + torch.randn(size, out=tensor) + result_21 = tensor.clone() + + torch.randn(size, out=tensor) + result_12 = tensor.clone() + + with mpu.get_cuda_rng_tracker().fork('test'): + torch.randn(size, out=tensor) + result_22 = tensor.clone() + + diff = result_11.sub(result_21).abs().max() + diff = min(diff, result_12.sub(result_22).abs().max()) + print(' max diff in generated tensors (should be non-zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), diff)) + assert diff > 1.0e-6 + error = max(result_11.sub(target_11).abs().max(), + result_12.sub(target_12).abs().max()) + error = max(error, result_21.sub(target_21).abs().max()) + error = max(error, result_22.sub(target_22).abs().max()) + print(' max error in generated tensors (should be zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset the tracker + mpu.get_cuda_rng_tracker().reset() + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_model_parallel_cuda_manual_seed(tensor_model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing model parallel cuda manual seed with size {} ...'. + format(tensor_model_parallel_size)) + + mpu.initialize_model_parallel(tensor_model_parallel_size) + tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() + + mpu.model_parallel_cuda_manual_seed(12345) + assert torch.cuda.initial_seed() == 12345 + with mpu.get_cuda_rng_tracker().fork(): + assert torch.cuda.initial_seed() == (12345 + 2718 + + mpu.get_tensor_model_parallel_rank()) + + # Reset the tracker + mpu.get_cuda_rng_tracker().reset() + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + tensor_model_parallel_size = 1 + while tensor_model_parallel_size <= world_size: + print_separator('test set rng state') + test_set_cuda_rng_state(tensor_model_parallel_size) + tensor_model_parallel_size *= 2 + + tensor_model_parallel_size = 1 + while tensor_model_parallel_size <= world_size: + print_separator('test cuda rng tracker') + test_cuda_rng_tracker(tensor_model_parallel_size) + tensor_model_parallel_size *= 2 + + tensor_model_parallel_size = 1 + while tensor_model_parallel_size <= world_size: + print_separator('test model parallel cuda manual seed') + test_model_parallel_cuda_manual_seed(tensor_model_parallel_size) + tensor_model_parallel_size *= 2 diff --git a/megatron/training/__init__.py b/megatron/training/__init__.py new file mode 100644 index 0000000..46cf5b5 --- /dev/null +++ b/megatron/training/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import torch + +from .global_vars import get_args +from .global_vars import get_signal_handler +from .global_vars import get_tokenizer +from .global_vars import get_tensorboard_writer +from .global_vars import get_wandb_writer +from .global_vars import get_one_logger +from .global_vars import get_adlr_autoresume +from .global_vars import get_timers +from .initialize import initialize_megatron +from .training import pretrain, get_model, get_train_valid_test_num_samples + +from .utils import (print_rank_0, + is_last_rank, + print_rank_last) diff --git a/megatron/training/activations.py b/megatron/training/activations.py new file mode 100644 index 0000000..e3f9a40 --- /dev/null +++ b/megatron/training/activations.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import torch +import torch.nn.functional as F + +try: + jit_fuser = torch.compile +except: + jit_fuser = torch.jit.script + + +@jit_fuser +def squared_relu(x: torch.Tensor) -> torch.Tensor: + return torch.pow(F.relu(x), 2) + + +@jit_fuser +def quick_gelu(x: torch.Tensor) -> torch.Tensor: + return x * torch.sigmoid(1.702 * x) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py new file mode 100644 index 0000000..89ed8c1 --- /dev/null +++ b/megatron/training/arguments.py @@ -0,0 +1,1793 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Megatron arguments.""" + +import argparse +import dataclasses +import json +import logging +import os +import torch +import types + +import torch.nn.functional as F + +from megatron.core.dist_checkpointing.validation import StrictHandling +from megatron.core.models.retro.utils import ( + get_config_path as get_retro_config_path, + get_gpt_data_dir as get_retro_data_dir, +) +from megatron.core.transformer import TransformerConfig +from megatron.training.activations import squared_relu + + +def parse_args(extra_args_provider=None, ignore_unknown_args=False): + """Parse all arguments.""" + parser = argparse.ArgumentParser(description='Megatron-LM Arguments', + allow_abbrev=False) + + # Standard arguments. + parser = _add_network_size_args(parser) + parser = _add_regularization_args(parser) + parser = _add_training_args(parser) + parser = _add_initialization_args(parser) + parser = _add_learning_rate_args(parser) + parser = _add_checkpointing_args(parser) + parser = _add_mixed_precision_args(parser) + parser = _add_distributed_args(parser) + parser = _add_validation_args(parser) + parser = _add_data_args(parser) + parser = _add_autoresume_args(parser) + parser = _add_biencoder_args(parser) + parser = _add_vision_args(parser) + parser = _add_moe_args(parser) + parser = _add_logging_args(parser) + parser = _add_straggler_detector_args(parser) + parser = _add_inference_args(parser) + parser = _add_transformer_engine_args(parser) + parser = _add_retro_args(parser) + parser = _add_experimental_args(parser) + parser = _add_one_logger_args(parser) + + # Custom arguments. + if extra_args_provider is not None: + parser = extra_args_provider(parser) + + # Parse. + if ignore_unknown_args: + args, _ = parser.parse_known_args() + else: + args = parser.parse_args() + + # Experimental yaml + if args.yaml_cfg is not None: + from .yaml_arguments import load_yaml + assert args.yaml_cfg and not args.use_legacy_models, \ + "Yaml config is not supported with legacy models." + args = load_yaml(args.yaml_cfg) + + + # Args from environment + args.rank = int(os.getenv('RANK', '0')) + args.world_size = int(os.getenv("WORLD_SIZE", '1')) + + return args + + +def load_retro_config(retro_project_dir): + '''Load Retro's config.json.''' + + # Retro config path. + retro_config_path = get_retro_config_path(retro_project_dir) + assert os.path.exists(retro_config_path), \ + "Retro project dir missing config.json." + + # Load retro config. + with open(retro_config_path) as f: + retro_config = types.SimpleNamespace(**json.load(f)) + + return retro_config + + +def load_retro_args(args): + """Load predefined args from Retro config (if applicable). + + When using Retro (or GPT for comparison purposes), data arguments are + overridden by the saved config.json within the Retro project directory. This + is to ensure that the data used for pretraining is consistent with the data + that was preprocessed using the Retro preprocessing pipeline (see + `tools/retro/preprocess_data.py`). + """ + + # Return if no project directory is specified. + if args.retro_project_dir is None: + return + + # Load retro config. + retro_config = load_retro_config(args.retro_project_dir) + + # Retro data path is relative to project dir (via hard or soft links). + data_dir = get_retro_data_dir(args.retro_project_dir) + data_path = list(retro_config.retro_gpt_data_path) + if len(data_path) % 2 == 0: + for i in range(len(data_path) - 1, -1, -2): + data_path[i] = os.path.join(data_dir, data_path[i]) + else: + assert len(data_path) == 1 + data_path[0] = os.path.join(data_dir, data_path[0]) + + # Update args. + args.data_cache_path = retro_config.retro_gpt_data_cache_path + args.data_path = data_path if args.data_path is None else args.data_path + args.eval_interval = retro_config.retro_gpt_eval_interval + args.eval_iters = retro_config.retro_gpt_eval_iters + args.global_batch_size = retro_config.retro_gpt_global_batch_size + args.max_position_embeddings = retro_config.retro_gpt_seq_length + args.merge_file = os.path.join( + args.retro_project_dir, + retro_config.retro_gpt_merge_file, + ) if retro_config.retro_gpt_merge_file is not None else None + args.seed = retro_config.retro_gpt_seed + args.seq_length = retro_config.retro_gpt_seq_length + args.tokenizer_model = os.path.join( + args.retro_project_dir, + retro_config.retro_gpt_tokenizer_model, + ) if retro_config.retro_gpt_tokenizer_model is not None else None + args.tokenizer_type = retro_config.retro_gpt_tokenizer_type + args.train_samples = retro_config.retro_gpt_train_samples + args.vocab_file = os.path.join( + args.retro_project_dir, + retro_config.retro_gpt_vocab_file, + ) if retro_config.retro_gpt_vocab_file is not None else None + + # Retro-specific args. + args.retro_block_size = retro_config.retro_block_size + args.retro_chunk_length = retro_config.retro_gpt_chunk_length + args.retro_neighbor_dirs = retro_config.retro_neighbor_dirs + args.retro_split_preprocessing = retro_config.retro_gpt_split + args.retro_bert_tokenizer_type = retro_config.retro_bert_tokenizer_type + args.retro_bert_vocab_file = retro_config.retro_bert_vocab_file + + +def validate_args(args, defaults={}): + + # Load saved args from Retro (if applicable). + load_retro_args(args) + + # Tensor model parallel size. + args.tensor_model_parallel_size = min( + args.tensor_model_parallel_size, args.world_size) + assert args.world_size % args.tensor_model_parallel_size == 0, 'world size'\ + ' ({}) is not divisible by tensor model parallel size ({})'.format( + args.world_size, args.tensor_model_parallel_size) + + # Pipeline model parallel size. + args.pipeline_model_parallel_size = min( + args.pipeline_model_parallel_size, + (args.world_size // args.tensor_model_parallel_size)) + args.transformer_pipeline_model_parallel_size = ( + args.pipeline_model_parallel_size - 1 + if args.standalone_embedding_stage else + args.pipeline_model_parallel_size + ) + + # Checks. + model_parallel_size = args.pipeline_model_parallel_size * \ + args.tensor_model_parallel_size + assert args.world_size % (model_parallel_size * args.context_parallel_size) == 0, \ + 'world size ({}) is not divisible by tensor parallel size ({}) times ' \ + 'pipeline parallel size ({}) times context parallel size ({})'.format( + args.world_size, args.tensor_model_parallel_size, + args.pipeline_model_parallel_size, args.context_parallel_size) + args.data_parallel_size = args.world_size // (model_parallel_size * args.context_parallel_size) + if args.rank == 0: + print('using world size: {}, data-parallel size: {}, ' + 'context-parallel size: {} ' + 'tensor-model-parallel size: {}, ' + 'pipeline-model-parallel size: {} '.format( + args.world_size, args.data_parallel_size, + args.context_parallel_size, + args.tensor_model_parallel_size, + args.pipeline_model_parallel_size), flush=True) + if args.pipeline_model_parallel_size > 1: + if args.pipeline_model_parallel_split_rank is not None: + assert args.pipeline_model_parallel_split_rank < \ + args.pipeline_model_parallel_size, 'split rank needs'\ + ' to be less than pipeline model parallel size ({})'.format( + args.pipeline_model_parallel_size) + + if args.tp_comm_overlap: + assert args.sequence_parallel == True, 'Tensor parallel communication/GEMM overlap can happen only when sequence parallelism is enabled' + + # Deprecated arguments + assert args.batch_size is None, '--batch-size argument is no longer ' \ + 'valid, use --micro-batch-size instead' + del args.batch_size + assert args.warmup is None, '--warmup argument is no longer valid, use ' \ + '--lr-warmup-fraction instead' + del args.warmup + assert args.model_parallel_size is None, '--model-parallel-size is no ' \ + 'longer valid, use --tensor-model-parallel-size instead' + del args.model_parallel_size + + if args.checkpoint_activations: + if args.rank == 0: + print('--checkpoint-activations is no longer valid, use --recompute-activations, ' + 'or, for more control, --recompute-granularity and --recompute-method.') + exit() + del args.checkpoint_activations + + if args.recompute_activations: + args.recompute_granularity = 'selective' + del args.recompute_activations + + # Set input defaults. + for key in defaults: + # For default to be valid, it should not be provided in the + # arguments that are passed to the program. We check this by + # ensuring the arg is set to None. + if getattr(args, key, None) is not None: + if args.rank == 0: + print('WARNING: overriding default arguments for {key}:{v} \ + with {key}:{v2}'.format(key=key, v=defaults[key], + v2=getattr(args, key)), + flush=True) + else: + setattr(args, key, defaults[key]) + + if args.data_path is not None and args.split is None: + legacy_default_split_value = '969, 30, 1' + if args.rank == 0: + print('WARNING: Please specify --split when using --data-path. Using legacy default value ' + f'of "{legacy_default_split_value}"') + args.split = legacy_default_split_value + + # Batch size. + assert args.micro_batch_size is not None + assert args.micro_batch_size > 0 + if args.global_batch_size is None: + args.global_batch_size = args.micro_batch_size * args.data_parallel_size + if args.rank == 0: + print('setting global batch size to {}'.format( + args.global_batch_size), flush=True) + assert args.global_batch_size > 0 + if args.num_layers_per_virtual_pipeline_stage is not None: + if args.overlap_p2p_comm: + assert args.pipeline_model_parallel_size > 1, \ + 'when interleaved schedule is used, pipeline-model-parallel size '\ + 'should be greater than 1' + else: + assert args.pipeline_model_parallel_size > 2, \ + 'when interleaved schedule is used and p2p communication overlap is disabled, '\ + 'pipeline-model-parallel size should be greater than 2 to avoid having multiple '\ + 'p2p sends and recvs between same 2 ranks per communication batch' + assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \ + 'number of layers should be divisible by the pipeline parallel size' + num_layers_per_pipeline_stage = args.num_layers // args.transformer_pipeline_model_parallel_size + assert num_layers_per_pipeline_stage % args.num_layers_per_virtual_pipeline_stage == 0, \ + 'number of layers per pipeline stage must be divisible number of layers per virtual pipeline stage' + args.virtual_pipeline_model_parallel_size = num_layers_per_pipeline_stage // \ + args.num_layers_per_virtual_pipeline_stage + else: + args.virtual_pipeline_model_parallel_size = None + # Overlap P2P communication is disabled if not using the interleaved schedule. + args.overlap_p2p_comm = False + if args.rank == 0: + print('WARNING: Setting args.overlap_p2p_comm to False since non-interleaved ' + 'schedule does not support overlapping p2p communication') + + if args.overlap_param_gather: + assert args.use_distributed_optimizer, \ + '--overlap-param-gather only supported with distributed optimizer' + assert args.overlap_grad_reduce, \ + '--overlap-grad-reduce should be turned on when using --overlap-param-gather' + assert not args.use_legacy_models, \ + '--overlap-param-gather only supported with MCore models' + + # Parameters dtype. + args.params_dtype = torch.float + if args.fp16: + assert not args.bf16 + args.params_dtype = torch.half + # Turn off checking for NaNs in loss and grads if using dynamic loss scaling, + # where NaNs in grads / loss are signal to the loss scaler. + if not args.loss_scale: + args.check_for_nan_in_loss_and_grad = False + if args.rank == 0: + print('WARNING: Setting args.check_for_nan_in_loss_and_grad to False since ' + 'dynamic loss scaling is being used') + if args.bf16: + assert not args.fp16 + args.params_dtype = torch.bfloat16 + # bfloat16 requires gradient accumulation and all-reduce to + # be done in fp32. + if not args.accumulate_allreduce_grads_in_fp32: + args.accumulate_allreduce_grads_in_fp32 = True + if args.rank == 0: + print('accumulate and all-reduce gradients in fp32 for ' + 'bfloat16 data type.', flush=True) + + if args.rank == 0: + print('using {} for parameters ...'.format(args.params_dtype), + flush=True) + + if args.dataloader_type is None: + args.dataloader_type = 'single' + + # data + assert args.num_dataset_builder_threads > 0 + + # Consumed tokens. + args.consumed_train_samples = 0 + args.consumed_valid_samples = 0 + + # Support for variable sequence lengths across batches/microbatches. + # set it if the dataloader supports generation of variable sequence lengths + # across batches/microbatches. Due to additional communication overhead + # during pipeline parallelism, it should not be set if sequence length + # is constant during training. + args.variable_seq_lengths = False + + # Iteration-based training. + if args.train_iters: + # If we use iteration-based training, make sure the + # sample-based options are off. + assert args.train_samples is None, \ + 'expected iteration-based training' + assert args.lr_decay_samples is None, \ + 'expected iteration-based learning rate decay' + assert args.lr_warmup_samples == 0, \ + 'expected iteration-based learning rate warmup' + assert args.rampup_batch_size is None, \ + 'expected no batch-size rampup for iteration-based training' + if args.lr_warmup_fraction is not None: + assert args.lr_warmup_iters == 0, \ + 'can only specify one of lr-warmup-fraction and lr-warmup-iters' + + # Sample-based training. + if args.train_samples: + # If we use sample-based training, make sure the + # iteration-based options are off. + assert args.train_iters is None, \ + 'expected sample-based training' + assert args.lr_decay_iters is None, \ + 'expected sample-based learning rate decay' + assert args.lr_warmup_iters == 0, \ + 'expected sample-based learnig rate warmup' + if args.lr_warmup_fraction is not None: + assert args.lr_warmup_samples == 0, \ + 'can only specify one of lr-warmup-fraction ' \ + 'and lr-warmup-samples' + + if args.num_layers is not None: + assert args.encoder_num_layers is None, \ + 'cannot have both num-layers and encoder-num-layers specified' + args.encoder_num_layers = args.num_layers + else: + assert args.encoder_num_layers is not None, \ + 'either num-layers or encoder-num-layers should be specified' + args.num_layers = args.encoder_num_layers + + # Check required arguments. + required_args = ['num_layers', 'hidden_size', 'num_attention_heads', + 'max_position_embeddings'] + for req_arg in required_args: + _check_arg_is_not_none(args, req_arg) + + # Checks. + if args.ffn_hidden_size is None: + if args.swiglu: + # reduce the dimnesion for MLP since projections happens on + # two linear layers. this keeps the number of paramters in + # the same ballpark as the counterpart with 4*h size + # we keep it a multiple of 64, which means the actual tensor size + # will be a multiple of 64 / tp_size + args.ffn_hidden_size = int((4 * args.hidden_size * 2 / 3) / 64) * 64 + else: + args.ffn_hidden_size = 4 * args.hidden_size + + if args.kv_channels is None: + assert args.hidden_size % args.num_attention_heads == 0 + args.kv_channels = args.hidden_size // args.num_attention_heads + + if args.seq_length is not None and args.context_parallel_size > 1: + assert args.seq_length % (args.context_parallel_size * 2) == 0, \ + 'seq-length should be a multiple of 2 * context-parallel-size ' \ + 'if context-parallel-size > 1.' + + if args.seq_length is not None: + assert args.encoder_seq_length is None + args.encoder_seq_length = args.seq_length + else: + assert args.encoder_seq_length is not None + args.seq_length = args.encoder_seq_length + + if args.seq_length is not None: + assert args.max_position_embeddings >= args.seq_length + if args.decoder_seq_length is not None: + assert args.max_position_embeddings >= args.decoder_seq_length + if args.lr is not None: + assert args.min_lr <= args.lr + if args.save is not None: + assert args.save_interval is not None + # Mixed precision checks. + if args.fp16_lm_cross_entropy: + assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.' + if args.fp32_residual_connection: + assert args.fp16 or args.bf16, \ + 'residual connection in fp32 only supported when using fp16 or bf16.' + + if args.moe_grouped_gemm: + assert args.bf16, 'Currently GroupedGEMM for MoE only supports bf16 dtype.' + dc = torch.cuda.get_device_capability() + assert dc[0] >= 8, "Unsupported compute capability for GroupedGEMM kernels." + + if args.weight_decay_incr_style == 'constant': + assert args.start_weight_decay is None + assert args.end_weight_decay is None + args.start_weight_decay = args.weight_decay + args.end_weight_decay = args.weight_decay + else: + assert args.start_weight_decay is not None + assert args.end_weight_decay is not None + + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + # Persistent fused layer norm. + if TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 11): + args.no_persist_layer_norm = True + if args.rank == 0: + print('Persistent fused layer norm kernel is supported from ' + 'pytorch v1.11 (nvidia pytorch container paired with v1.11). ' + 'Defaulting to no_persist_layer_norm=True') + + # Activation recomputing. + if args.distribute_saved_activations: + assert args.tensor_model_parallel_size > 1, 'can distribute ' \ + 'recomputed activations only across tensor model ' \ + 'parallel groups' + assert args.recompute_granularity == 'full', \ + 'distributed recompute activations is only '\ + 'application to full recompute granularity' + assert args.recompute_method is not None, \ + 'for distributed recompute activations to work you '\ + 'need to use a recompute method ' + assert (TORCH_MAJOR, TORCH_MINOR) >= (1, 10), \ + 'distributed recompute activations are supported for pytorch ' \ + 'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \ + 'pytorch version is v%s.%s.' % (TORCH_MAJOR, TORCH_MINOR) + + if args.recompute_granularity == 'selective': + assert args.recompute_method is None, \ + 'recompute method is not yet supported for ' \ + 'selective recomputing granularity' + + # disable sequence parallelism when tp=1 + # to avoid change in numerics when + # sequence_parallelism is enabled. + if args.tensor_model_parallel_size == 1: + args.sequence_parallel = False + + # disable async_tensor_model_parallel_allreduce when + # model parallel memory optimization is enabled + if args.sequence_parallel: + args.async_tensor_model_parallel_allreduce = False + + if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": + if args.sequence_parallel: + raise RuntimeError( + "Using sequence parallelism requires setting the environment variable " + "CUDA_DEVICE_MAX_CONNECTIONS to 1") + if args.async_tensor_model_parallel_allreduce: + raise RuntimeError( + "Using async gradient all reduce requires setting the environment " + "variable CUDA_DEVICE_MAX_CONNECTIONS to 1") + + # Disable bias gelu fusion if we are disabling bias altogether + if not args.add_bias_linear: + args.bias_gelu_fusion = False + + # Retro checks. + if args.retro_add_retriever: + + # Train samples should be auto-loaded. + assert args.train_samples is not None, \ + "args.train_samples should be auto-loaded from the retro config." + + # Sequence parallelism unsupported. + assert not args.sequence_parallel, \ + "retro currently does not support sequence parallelism." + + # Pipeline parallelism unsupported. + assert args.pipeline_model_parallel_size == 1, \ + "retro currently does not support pipeline parallelism." + + if args.decoupled_lr is not None or args.decoupled_min_lr is not None: + assert not args.use_legacy_models, \ + '--decoupled-lr and --decoupled-min-lr is not supported in legacy models.' + assert not args.use_dist_ckpt, "Distributed checkpointing does not work with decoupled LR yet." + + # Legacy RoPE arguments + if args.use_rotary_position_embeddings: + args.position_embedding_type = 'rope' + if args.rotary_interleaved and args.apply_rope_fusion: + raise RuntimeError('--rotary-interleaved does not work with rope_fusion.') + if args.rotary_interleaved and args.use_legacy_models: + raise RuntimeError('--rotary-interleaved is not supported in legacy models.') + + # Would just need to add 'NoPE' as a position_embedding_type to support this, but for now + # don't allow it to keep things simple + if not args.add_position_embedding and args.position_embedding_type != 'rope': + raise RuntimeError('--no-position-embedding is deprecated, use --position-embedding-type') + + # MoE Spec check + if args.num_experts == 0: + args.num_experts = None + if args.num_experts is not None: + assert args.spec is None, "Model Spec must be None when using MoEs" + + # Context parallel + if args.context_parallel_size > 1: + assert not args.use_legacy_models, "Context parallelism is not supported in legacy models." + + # Expert parallelism check + if args.expert_model_parallel_size > 1: + assert args.num_experts is not None, "num_experts must be non None to use expert model parallelism" + assert args.num_experts % args.expert_model_parallel_size == 0, \ + "Number of experts should be a multiple of expert model parallel_size." + assert not args.fp16, \ + "Expert parallelism is not supported with fp16 training." + + # Distributed checkpointing checks + if args.use_dist_ckpt and args.use_legacy_models: + raise RuntimeError('--use-dist-ckpt is not supported in legacy models.') + + # Data blend checks + assert args.mock_data + \ + bool(args.data_path) + \ + any([args.train_data_path, args.valid_data_path, args.test_data_path]) \ + <= 1, "A single data source must be provided in training mode, else None" + + if args.use_tp_pp_dp_mapping: + assert args.context_parallel_size * args.expert_model_parallel_size <= 1, \ + "context_parallel and expert_model_parallel can't be used with tp-pp-dp mapping." + + # Deterministic mode + if args.deterministic_mode: + assert not args.use_flash_attn, 'Flash attention can not be used in deterministic mode.' + + all_reduce_choices = ["Tree", "Ring", "CollnetDirect", "CollnetChain", "^NVLS"] + assert os.getenv("NCCL_ALGO", -1) != -1 and os.getenv("NCCL_ALGO") in all_reduce_choices, \ + f"NCCL_ALGO must be one of {all_reduce_choices}." + + # Update the printed args to reflect that `apply_query_key_layer_scaling` also controls `attention_softmax_in_fp32` + if args.apply_query_key_layer_scaling: + args.attention_softmax_in_fp32 = True + + # Checkpointing + if args.ckpt_fully_parallel_save_deprecated and args.rank == 0: + print('--ckpt-fully-parallel-save flag is deprecated and has no effect.' + ' Use --no-ckpt-fully-parallel-save to disable parallel save.') + if ( + args.use_dist_ckpt + and not args.ckpt_fully_parallel_save + and args.use_distributed_optimizer + and args.rank == 0 + ): + print('Warning: With non-parallel ckpt save and DistributedOptimizer,' + ' it will be impossible to resume training with different parallelism.' + ' Consider removing flag --no-ckpt-fully-parallel-save.') + + # Print arguments. + _print_args("arguments", args) + + return args + + +def _print_args(title, args): + """Print arguments.""" + if args.rank == 0: + print(f'------------------------ {title} ------------------------', + flush=True) + str_list = [] + for arg in vars(args): + dots = '.' * (48 - len(arg)) + str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg))) + for arg in sorted(str_list, key=lambda x: x.lower()): + print(arg, flush=True) + print(f'-------------------- end of {title} ---------------------', + flush=True) + + +def _check_arg_is_not_none(args, arg): + assert getattr(args, arg) is not None, '{} argument is None'.format(arg) + + +def core_transformer_config_from_args(args, config_class=None): + + # Config class. + config_class = config_class or TransformerConfig + + # Translate args to core transformer configuration + kw_args = {} + for f in dataclasses.fields(config_class): + if hasattr(args, f.name): + kw_args[f.name] = getattr(args, f.name) + kw_args['persist_layer_norm'] = not args.no_persist_layer_norm + kw_args['layernorm_zero_centered_gamma'] = args.apply_layernorm_1p + kw_args['layernorm_epsilon'] = args.norm_epsilon + kw_args['deallocate_pipeline_outputs'] = True + kw_args['pipeline_dtype'] = args.params_dtype + kw_args['batch_p2p_comm'] = not args.overlap_p2p_comm + kw_args['num_moe_experts'] = args.num_experts + kw_args['rotary_interleaved'] = args.rotary_interleaved + if args.swiglu: + kw_args['activation_func'] = F.silu + kw_args['gated_linear_unit'] = True + kw_args['bias_activation_fusion'] = args.bias_swiglu_fusion + else: + kw_args['bias_activation_fusion'] = args.bias_gelu_fusion + if args.squared_relu: + assert not args.swiglu + kw_args['activation_func'] = squared_relu + if args.init_method_xavier_uniform: + kw_args['init_method'] = torch.nn.init.xavier_uniform_ + kw_args['scaled_init_method'] = torch.nn.init.xavier_uniform_ + if args.group_query_attention: + kw_args['num_query_groups'] = args.num_query_groups + else: + kw_args['num_query_groups'] = None + + # Return config. + return config_class(**kw_args) + + +def _add_transformer_engine_args(parser): + group = parser.add_argument_group(title='Transformer-Engine') + + group.add_argument('--fp8-format', default=None, + choices=['e4m3', 'hybrid'], + help='Which fp8 format scheme to use for FP8 tensors in the forward and backward pass', + dest='fp8') + group.add_argument('--fp8-margin', type=int, default=0, + help='Scaling margin for fp8', + dest='fp8_margin') + group.add_argument('--fp8-interval', type=int, default=1, + help='Scaling update interval for fp8', + dest='fp8_interval') + group.add_argument('--fp8-amax-history-len', type=int, default=1, + help='Number of steps for which amax history is recorded per tensor', + dest='fp8_amax_history_len') + group.add_argument('--fp8-amax-compute-algo', default='most_recent', + choices=['most_recent', 'max'], + help='Algorithm for computing amax from history', + dest='fp8_amax_compute_algo') + group.add_argument('--no-fp8-wgrad', action='store_false', + help='Execute wgrad in higher precision even for FP8 runs', + dest='fp8_wgrad') + group.add_argument('--transformer-impl', default='transformer_engine', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') + + return parser + +def _add_inference_args(parser): + group = parser.add_argument_group(title='inference') + + group.add_argument('--inference-batch-times-seqlen-threshold', + type=int, default=512, + help='During inference, if batch-size times ' + 'sequence-length is smaller than this threshold ' + 'then we will not use pipelining, otherwise we will.') + group.add_argument('--max-tokens-to-oom', + type=int, default=12000, + help='Maximum number of tokens during inference' + 'tokens here is # in prompt + # to generate' + 'Allows us to throw an error before OOM crashes server') + group.add_argument('--output-bert-embeddings', action='store_true', + help='Output Bert embeddings (via mean pooling) from ' + 'model, rather than its binary head output or entire ' + 'hidden batch.') + group.add_argument('--bert-embedder-type', default="megatron", + choices=["megatron", "huggingface"], + help='Select either Megatron or Huggingface as the ' + 'Bert embedder.') + + return parser + + +def _add_retro_args(parser): + group = parser.add_argument_group(title='retro') + + group.add_argument('--retro-project-dir', default=None, + help='Retro project directory, which contains the ' + 'preprocessed data for pretraining. This directory ' + 'is built during preprocessing (see ' + 'tools/retro/README.md), and contains subdirectories ' + 'for the chunk database and pretraining neighbors.') + group.add_argument('--retro-add-retriever', + action='store_true', default=False, + help='Add a retriever to the transformer, for use in ' + 'pretraining a Retro model.') + group.add_argument('--retro-cyclic-train-iters', type=int, default=None, + help='Set number of training iterations for cyclic ' + 'Retro training.') + group.add_argument('--retro-encoder-layers', type=int, default=2, + help='Number of layers to use for the retrieval ' + 'encoder.') + group.add_argument('--retro-encoder-hidden-dropout', + type=float, default=0.1, help='Hidden dropout for ' + 'retrieval encoder.') + group.add_argument('--retro-encoder-attention-dropout', + type=float, default=0.1, help='Attention dropout for ' + 'retrieval encoder.') + group.add_argument("--retro-num-neighbors", type=int, default=2, + help='Number of neighbors to retrieve during ' + 'pretraining.') + group.add_argument("--retro-num-retrieved-chunks", type=int, default=2, + help='Number of chunks to retrieve from the retrieval ' + 'database.') + group.add_argument("--retro-attention-gate", type=float, default=1, + help="Gated cross attention.") + group.add_argument("--retro-no-verify-neighbor-count", action="store_false", + dest="retro_verify_neighbor_count", + help="Skip verifying that len(GPT dataset) == len(saved " + "neighbors).") + + # Enforce argument naming convention. + for action in group._group_actions: + prefix = action.dest.split("_")[0] + assert prefix == "retro", \ + "Retro args must be prefixed with '--retro-*', for consistent " \ + "styling. Please fix '%s'." % ", ".join(action.option_strings) + + return parser + + +def _add_network_size_args(parser): + group = parser.add_argument_group(title='network size') + + group.add_argument('--num-layers', type=int, default=None, + help='Number of transformer layers.') + group.add_argument('--encoder-num-layers', type=int, default=None, + help='Number of encoder transformer layers.') + group.add_argument('--decoder-num-layers', type=int, default=None, + help='Number of decoder transformer layers.') + group.add_argument('--hidden-size', type=int, default=None, + help='Tansformer hidden size.') + group.add_argument('--ffn-hidden-size', type=int, default=None, + help='Transformer Feed-Forward Network hidden size. ' + 'This is set to 4*hidden-size if not provided') + group.add_argument('--num-attention-heads', type=int, default=None, + help='Number of transformer attention heads.') + group.add_argument('--kv-channels', type=int, default=None, + help='Projection weights dimension in multi-head ' + 'attention. This is set to ' + ' args.hidden_size // args.num_attention_heads ' + 'if not provided.') + group.add_argument('--group-query-attention', action='store_true', + help='Use group-query attention.') + group.add_argument('--num-query-groups', type=int, default=1) + + group.add_argument('--max-position-embeddings', type=int, default=None, + help='Maximum number of position embeddings to use. ' + 'This is the size of position embedding.') + group.add_argument('--position-embedding-type', type=str, default='learned_absolute', + choices=['learned_absolute', 'rope', 'none'], + help='Position embedding type.') + group.add_argument('--use-rotary-position-embeddings', action='store_true', + help='Use rotary positional embeddings or not. ' + 'Deprecated: use --position-embedding-type') + group.add_argument('--rotary-base', type=int, default=10000, + help='Base to use for rotary positional embeddings, default 10000') + group.add_argument('--rotary-percent', type=float, default=1.0, + help='Percent of rotary dimension to use, default 100%%') + group.add_argument('--rotary-interleaved', action='store_true', + help='Use interleaved rotary embedding.') + group.add_argument('--rotary-seq-len-interpolation-factor', type=int, default=None, + help='Sequence length interpolation factor for rotary embeddings.') + group.add_argument('--no-position-embedding', + action='store_false', + help='Disable position embedding. Deprecated: use --position-embedding-type', + dest='add_position_embedding') + group.add_argument('--make-vocab-size-divisible-by', type=int, default=128, + help='Pad the vocab size to be divisible by this value.' + 'This is added for computational efficieny reasons.') + group.add_argument('--normalization', default='LayerNorm', + choices=['LayerNorm', 'RMSNorm'], + help='Which normalization technique to use.') + group.add_argument('--norm-epsilon', type=float, default=1e-5, + help='Epsilon for layer norm and RMS norm.') + group.add_argument('--apply-layernorm-1p', action='store_true', + help='Adjust LayerNorm weights such that they are centered ' + 'around zero. This improves numerical stability.') + group.add_argument('--apply-residual-connection-post-layernorm', + action='store_true', + help='If set, use original BERT residula connection ' + 'ordering.') + group.add_argument('--openai-gelu', action='store_true', + help='Use OpenAIs GeLU implementation. This option' + 'should not be used unless for backward compatibility' + 'reasons.') + group.add_argument('--squared-relu', action='store_true', + help='Use squared relu activation instead of default gelu') + group.add_argument('--swiglu', action='store_true', + help='Use gated linear units and SiLU activation instead of default gelu') + group.add_argument('--onnx-safe', type=bool, required=False, + help='Use workarounds for known problems with ' + 'Torch ONNX exporter') + group.add_argument('--bert-no-binary-head', action='store_false', + help='Disable BERT binary head.', + dest='bert_binary_head') + group.add_argument('--untie-embeddings-and-output-weights', action='store_true', + help='Untie embeddings and output weights.'), + return parser + +def _add_straggler_detector_args(parser): + group = parser.add_argument_group(title='straggler') + group.add_argument('--log-straggler', action='store_true', + help='If set, tracks and logs straggler per GPU.') + group.add_argument('--disable-straggler-on-startup', action='store_true', + help='If set, StragglerDetector is disabled on startup.') + group.add_argument('--straggler-ctrlr-port', type=int, default=65535, + help='Port number to toggle StragglerDetector on/off at runtime') + group.add_argument('--straggler-minmax-count', type=int, default=1, + help='Number of ranks to report with high/low estimated throughput') + return parser + +def _add_one_logger_args(parser): + group = parser.add_argument_group(title='one logger') + group.add_argument('--no-one-logger', action='store_false', + help='If set, disable using one_logger to track E2E metrics' + 'Note that one_logger is an internal tool and not ' + 'available externally. For installation, please go to ' + 'https://confluence.nvidia.com/display/MLWFO/Package+Repositories' + 'for more details', + dest='enable_one_logger') + group.add_argument('--one-logger-project', type=str, default='megatron-lm', + help='The one-logger project name. Will ignore if ' + '--no-one-logger is set') + group.add_argument('--one-logger-run-name', type=str, default=None, + help='The one-logger run name displayed. Will ignore if ' + '--no-one-logger is set') + group.add_argument('--one-logger-async', action='store_true', + help='If set, forces one_logger to use async mode.') + group.add_argument('--app-tag-run-name', type=str, default=None, + help='Jobs belonging to same training run, suppose to ' + 'have the same name. It will be used to track progress of ' + 'a training done over multiple different jobs') + group.add_argument('--app-tag-run-version', type=str, default='0.0.0', + help='The version of the training of which current job is ' + 'part of. It will be used to track the changes in the ' + 'application side which might change the performance ' + 'baseline') + return parser + +def _add_logging_args(parser): + group = parser.add_argument_group(title='logging') + + group.add_argument('--log-params-norm', action='store_true', + help='If set, calculate and log parameters norm.') + group.add_argument('--log-num-zeros-in-grad', action='store_true', + help='If set, calculate and log the number of zeros in gradient.') + group.add_argument('--log-throughput', action='store_true', + help='If set, calculate and log throughput per GPU.') + group.add_argument('--log-progress', action='store_true', + help='If set, log progress (in terms of number of processed tokens and ' + 'number of floating-point operations) to progress.txt file in checkpoint ' + 'directory.') + group.add_argument('--timing-log-level', type=int, + default=0, choices=range(0,3), + help='Granularity level to measure and report timing. ' + ' 0: report only iteration time and make sure timing ' + ' does not introduce extra overhead.' + ' 1: report timing for operations that are executed ' + ' very limited times (basically once) during ' + ' each iteration (such as gradient all-reduce) ' + ' 2: report timing for operations that migh be ' + ' executed numerous times during each iteration. ' + 'Note that setting the level to 1 or 2 might ' + 'cause increase in iteration time.') + group.add_argument('--no-barrier-with-level-1-timing', action='store_false', + help='If not set, use barrier with level 1 time ' + 'measurements. Note that this is up to the user ' + 'to make sure calling barrier with their timers ' + 'will not result in hangs. This can happen if for ' + 'example the user adds a level 1 timer that is not ' + 'called by all ranks.', + dest='barrier_with_L1_time') + group.add_argument('--timing-log-option', type=str, default='minmax', + choices=['max', 'minmax', 'all'], + help='Options for logging timing:' + ' max: report the max timing across all ranks' + ' minmax: report min and max timings across all ranks' + ' all: report timings of all ranks.') + group.add_argument('--tensorboard-log-interval', type=int, default=1, + help='Report to tensorboard interval.') + group.add_argument('--tensorboard-queue-size', type=int, default=1000, + help='Size of the tensorboard queue for pending events ' + 'and summaries before one of the ‘add’ calls forces a ' + 'flush to disk.') + group.add_argument('--log-timers-to-tensorboard', action='store_true', + help='If set, write timers to tensorboard.') + group.add_argument('--log-batch-size-to-tensorboard', action='store_true', + help='If set, write batch-size to tensorboard.') + group.add_argument('--no-log-learnig-rate-to-tensorboard', + action='store_false', + help='Disable learning rate logging to tensorboard.', + dest='log_learning_rate_to_tensorboard') + group.add_argument('--no-log-loss-scale-to-tensorboard', + action='store_false', + help='Disable loss-scale logging to tensorboard.', + dest='log_loss_scale_to_tensorboard') + group.add_argument('--log-validation-ppl-to-tensorboard', + action='store_true', + help='If set, write validation perplexity to ' + 'tensorboard.') + group.add_argument('--log-memory-to-tensorboard', + action='store_true', + help='Enable memory logging to tensorboard.') + group.add_argument('--log-world-size-to-tensorboard', + action='store_true', + help='Enable world size logging to tensorboard.') + group.add_argument('--wandb-project', type=str, default='', + help='The wandb project name. Ignore wandb by default.') + group.add_argument('--wandb-exp-name', type=str, default='', + help='The wandb experiment name.') + group.add_argument('--wandb-save-dir', type=str, default='', + help='Path to save the wandb results locally.') + group.add_argument('--logging-level', type=int, default=None, + help='Set default logging level') + return parser + + +def _add_regularization_args(parser): + group = parser.add_argument_group(title='regularization') + + group.add_argument('--attention-dropout', type=float, default=0.1, + help='Post attention dropout probability.') + group.add_argument('--hidden-dropout', type=float, default=0.1, + help='Dropout probability for hidden state transformer.') + group.add_argument('--weight-decay', type=float, default=0.01, + help='Weight decay coefficient for L2 regularization.') + group.add_argument('--start-weight-decay', type=float, + help='Initial weight decay coefficient for L2 regularization.') + group.add_argument('--end-weight-decay', type=float, + help='End of run weight decay coefficient for L2 regularization.') + group.add_argument('--weight-decay-incr-style', type=str, default='constant', + choices=['constant', 'linear', 'cosine'], + help='Weight decay increment function.') + group.add_argument('--clip-grad', type=float, default=1.0, + help='Gradient clipping based on global L2 norm.') + group.add_argument('--adam-beta1', type=float, default=0.9, + help='First coefficient for computing running averages ' + 'of gradient and its square') + group.add_argument('--adam-beta2', type=float, default=0.999, + help='Second coefficient for computing running averages ' + 'of gradient and its square') + group.add_argument('--adam-eps', type=float, default=1e-08, + help='Term added to the denominator to improve' + 'numerical stability') + group.add_argument('--sgd-momentum', type=float, default=0.9, + help='Momentum factor for sgd') + return parser + + +def _add_training_args(parser): + group = parser.add_argument_group(title='training') + + group.add_argument('--micro-batch-size', type=int, default=None, + help='Batch size per model instance (local batch size). ' + 'Global batch size is local batch size times data ' + 'parallel size times number of micro batches.') + group.add_argument('--batch-size', type=int, default=None, + help='Old batch size parameter, do not use. ' + 'Use --micro-batch-size instead') + group.add_argument('--global-batch-size', type=int, default=None, + help='Training batch size. If set, it should be a ' + 'multiple of micro-batch-size times data-parallel-size. ' + 'If this value is None, then ' + 'use micro-batch-size * data-parallel-size as the ' + 'global batch size. This choice will result in 1 for ' + 'number of micro-batches.') + group.add_argument('--rampup-batch-size', nargs='*', default=None, + help='Batch size ramp up with the following values:' + ' --rampup-batch-size ' + ' ' + ' ' + 'For example:' + ' --rampup-batch-size 16 8 300000 \ ' + ' --global-batch-size 1024' + 'will start with global batch size 16 and over ' + ' (1024 - 16) / 8 = 126 intervals will increase' + 'the batch size linearly to 1024. In each interval' + 'we will use approximately 300000 / 126 = 2380 samples.') + group.add_argument('--recompute-activations', action='store_true', + help='recompute activation to allow for training ' + 'with larger models, sequences, and batch sizes.') + group.add_argument('--recompute-granularity', type=str, default=None, + choices=['full', 'selective'], + help='Checkpoint activations to allow for training ' + 'with larger models, sequences, and batch sizes. ' + 'It is supported at two granularities 1) full: ' + 'whole transformer layer is recomputed, ' + '2) selective: core attention part of the transformer ' + 'layer is recomputed.') + group.add_argument('--no-check-for-nan-in-loss-and-grad', action='store_false', + help='Check for NaNs in loss and grad', + dest='check_for_nan_in_loss_and_grad') + group.add_argument('--distribute-saved-activations', + action='store_true', + help='If set, distribute recomputed activations ' + 'across model parallel group.') + group.add_argument('--recompute-method', type=str, default=None, + choices=['uniform', 'block'], + help='1) uniform: uniformly divide the total number of ' + 'Transformer layers and recompute the input activation of ' + 'each divided chunk at specified granularity, ' + '2) recompute the input activations of only a set number of ' + 'individual Transformer layers per pipeline stage and do the ' + 'rest without any recomputing at specified granularity' + 'default) do not apply activations recompute to any layers') + group.add_argument('--recompute-num-layers', type=int, default=None, + help='1) uniform: the number of Transformer layers in each ' + 'uniformly divided recompute unit, ' + '2) block: the number of individual Transformer layers ' + 'to recompute within each pipeline stage.') + group.add_argument('--no-clone-scatter-output-in-embedding', action='store_false', + help='If not set, clone the output of the scatter in embedding layer to GC original tensor.', + dest='clone_scatter_output_in_embedding') + group.add_argument('--profile', action='store_true', + help='Enable nsys profiling. When using this option, nsys ' + 'options should be specified in commandline. An example ' + 'nsys commandline is `nsys profile -s none -t nvtx,cuda ' + '-o --force-overwrite true ' + '--capture-range=cudaProfilerApi ' + '--capture-range-end=stop`.') + group.add_argument('--profile-step-start', type=int, default=10, + help='Global step to start profiling.') + group.add_argument('--profile-step-end', type=int, default=12, + help='Global step to stop profiling.') + group.add_argument('--profile-ranks', nargs='+', type=int, default=[0], + help='Global ranks to profile.') + group.add_argument('--tp-comm-overlap', action='store_true', help='Enables the ' + ' overlap of Tensor parallel communication and GEMM kernels.') + group.add_argument('--tp-comm-overlap-cfg', type=str, default=None, + help='Config file when tp_comm_overlap is enabled.') + group.add_argument('--disable-tp-comm-overlap-ag', action='store_false', + help=('Disables the All-Gather overlap with GEMM by ' + 'pipelining the GEMM and All-Gather.'), + dest='tp_comm_overlap_ag') + group.add_argument('--disable-tp-comm-overlap-rs', action='store_false', + help=('Disables the Reduce-Scatter overlap with GEMM by ' + 'pipelining the GEMM and Reduce-Scatter.'), + dest='tp_comm_overlap_rs') + group.add_argument('--tp-comm-overlap-rs-dgrad', action='store_true', + help = 'Enables the Reduce-Scatter overlap with dgrad GEMM.', + dest='tp_comm_overlap_rs_dgrad') + group.add_argument('--disable-tp-comm-bulk-dgrad', action='store_false', + help='Disables the All-Gather overlap with bprop activation gradient GEMM.', + dest='tp_comm_bulk_dgrad') + group.add_argument('--disable-tp-comm-bulk-wgrad', action='store_false', + help='Disables the Reduce-Scatter overlap with bprop weight gradient GEMM.', + dest='tp_comm_bulk_wgrad') + group.add_argument('--use-cpu-initialization', action='store_true', + default=None, + help='If set, initialize weights on the CPU. This eliminates init differences based on tensor parallelism.') + group.add_argument('--empty-unused-memory-level', default=0, type=int, + choices=[0, 1, 2], + help='Call torch.cuda.empty_cache() each iteration ' + '(training and eval), to reduce fragmentation.' + '0=off, 1=moderate, 2=aggressive.') + group.add_argument('--deterministic-mode', action='store_true', + help='Choose code that has deterministic execution. This usually ' + 'means slower execution, but is good for debugging and testing.') + group.add_argument('--check-weight-hash-across-dp-replicas-interval', type=int, default=None, + help='Interval to check weight hashes are same across DP replicas. If not specified, weight hashes not checked.') + group.add_argument('--calculate-per-token-loss', action='store_true', + help=('Scale cross entropy loss by the number of non-padded tokens in the ' + 'global batch, versus the default behavior of assuming all tokens are non-padded.')) + + # deprecated + group.add_argument('--checkpoint-activations', action='store_true', + help='Checkpoint activation to allow for training ' + 'with larger models, sequences, and batch sizes.') + group.add_argument('--train-iters', type=int, default=None, + help='Total number of iterations to train over all ' + 'training runs. Note that either train-iters or ' + 'train-samples should be provided.') + group.add_argument('--train-samples', type=int, default=None, + help='Total number of samples to train over all ' + 'training runs. Note that either train-iters or ' + 'train-samples should be provided.') + group.add_argument('--log-interval', type=int, default=100, + help='Report loss and timing interval.') + group.add_argument('--exit-interval', type=int, default=None, + help='Exit the program after the iteration is divisible ' + 'by this value.') + group.add_argument('--exit-duration-in-mins', type=int, default=None, + help='Exit the program after this many minutes.') + group.add_argument('--exit-signal-handler', action='store_true', + help='Dynamically save the checkpoint and shutdown the ' + 'training if SIGTERM is received') + group.add_argument('--tensorboard-dir', type=str, default=None, + help='Write TensorBoard logs to this directory.') + group.add_argument('--no-masked-softmax-fusion', + action='store_false', + help='Disable fusion of query_key_value scaling, ' + 'masking, and softmax.', + dest='masked_softmax_fusion') + group.add_argument('--no-bias-gelu-fusion', action='store_false', + help='Disable bias and gelu fusion.', + dest='bias_gelu_fusion') + group.add_argument('--no-bias-swiglu-fusion', action='store_false', + help='Disable bias and swiglu fusion, the fusion is ' + 'available only when using megatron-core.', + dest='bias_swiglu_fusion') + group.add_argument('--no-bias-dropout-fusion', action='store_false', + help='Disable bias and dropout fusion.', + dest='bias_dropout_fusion') + group.add_argument('--no-rope-fusion', action='store_false', + help='Disable rope fusion, the fusion is available ' + 'only when using megatron-core.', + dest='apply_rope_fusion') + group.add_argument('--cross-entropy-loss-fusion', action='store_true', + help='Enabled fusion of cross entropy loss calculation.', + dest='cross_entropy_loss_fusion') + group.add_argument('--use-flash-attn', action='store_true', + help='use FlashAttention implementation of attention. ' + 'https://arxiv.org/abs/2205.14135') + group.add_argument('--disable-bias-linear', action='store_false', + help='Disable bias in the linear layers', + dest='add_bias_linear') + group.add_argument('--add-qkv-bias', action='store_true', + help='Enable bias only in the QKV linear layers', + dest='add_qkv_bias') + group.add_argument('--optimizer', type=str, default='adam', + choices=['adam', 'sgd'], + help='Optimizer function') + group.add_argument('--dataloader-type', type=str, default=None, + choices=['single', 'cyclic', 'external'], + help='Single pass vs multiple pass data loader') + group.add_argument('--no-async-tensor-model-parallel-allreduce', + action='store_false', + help='DEPRECATED. This flag is ignored.', + dest='async_tensor_model_parallel_allreduce') + group.add_argument('--no-persist-layer-norm', action='store_true', + help='Disable using persistent fused layer norm kernel. ' + 'This kernel supports only a set of hidden sizes. Please ' + 'check persist_ln_hidden_sizes if your hidden ' + 'size is supported.') + group.add_argument('--sequence-parallel', action='store_true', + help='Enable sequence parallel optimization.') + group.add_argument('--no-gradient-accumulation-fusion', + action='store_false', + help='Disable fusing gradient accumulation to weight ' + 'gradient computation of linear layers', + dest='gradient_accumulation_fusion') + group.add_argument('--use-mcore-models', action='store_true', + dest='deprecated_use_mcore_models', + help='DEPRECATED. Use the implementation from megatron core.' + 'Now ignored and mcore models are the default, use ' + '--use-legacy-models to not use core models.') + group.add_argument('--use-legacy-models', action='store_true', + help='Use the legacy Megatron models, not Megatron-Core models.') + group.add_argument('--manual-gc', action='store_true', + help='Disable the threshold-based default garbage ' + 'collector and trigger the garbage collection manually. ' + 'Manual garbage collection helps to align the timing of ' + 'the collection across ranks which mitigates the impact ' + 'of CPU-associated jitters. When the manual gc is enabled, ' + 'garbage collection is performed only at the start and the ' + 'end of the validation routine by default.') + group.add_argument('--manual-gc-interval', type=int, default=0, + help='Training step interval to trigger manual garbage ' + 'collection. When the value is set to 0, garbage ' + 'collection is not triggered between training steps.') + group.add_argument('--no-manual-gc-eval', action='store_false', + help='When using manual garbage collection, disable ' + 'garbage collection at the start and the end of each ' + 'evaluation run.', dest='manual_gc_eval') + group.add_argument('--disable-tp-comm-split-ag', action='store_false', + help='Disables the All-Gather overlap with fprop GEMM.', + dest='tp_comm_split_ag') + group.add_argument('--disable-tp-comm-split-rs', action='store_false', + help='Disables the Reduce-Scatter overlap with fprop GEMM.', + dest='tp_comm_split_rs') + + return parser + + +def _add_initialization_args(parser): + group = parser.add_argument_group(title='initialization') + + group.add_argument('--seed', type=int, default=1234, + help='Random seed used for python, numpy, ' + 'pytorch, and cuda.') + group.add_argument('--data-parallel-random-init', action='store_true', + help='Enable random initialization of params ' + 'across data parallel ranks') + group.add_argument('--init-method-std', type=float, default=0.02, + help='Standard deviation of the zero mean normal ' + 'distribution used for weight initialization.') + group.add_argument('--init-method-xavier-uniform', action='store_true', + help='Enable Xavier uniform parameter initialization') + + return parser + + +def _add_learning_rate_args(parser): + group = parser.add_argument_group(title='learning rate') + + group.add_argument('--lr', type=float, default=None, + help='Initial learning rate. Depending on decay style ' + 'and initial warmup, the learning rate at each ' + 'iteration would be different.') + group.add_argument('--lr-decay-style', type=str, default='linear', + choices=['constant', 'linear', 'cosine', 'inverse-square-root', 'WSD'], + help='Learning rate decay function.') + group.add_argument('--lr-wsd-decay-style', type=str, default='exponential', + choices=['exponential', 'linear', 'cosine'], + help='Decay style for the annealing phase of WSD'), + group.add_argument('--lr-decay-iters', type=int, default=None, + help='number of iterations to decay learning rate over,' + ' If None defaults to `--train-iters`') + group.add_argument('--lr-decay-samples', type=int, default=None, + help='number of samples to decay learning rate over,' + ' If None defaults to `--train-samples`') + group.add_argument('--lr-wsd-decay-samples', type=int, default=None, + help='number of samples for the annealing phase in the wsd schedule') + group.add_argument('--lr-wsd-decay-iters', type=int, default=None, + help='number of iterations for the annealing phase in the wsd schedule') + group.add_argument('--lr-warmup-fraction', type=float, default=None, + help='fraction of lr-warmup-(iters/samples) to use ' + 'for warmup (as a float)') + group.add_argument('--lr-warmup-iters', type=int, default=0, + help='number of iterations to linearly warmup ' + 'learning rate over.') + group.add_argument('--lr-warmup-samples', type=int, default=0, + help='number of samples to linearly warmup ' + 'learning rate over.') + group.add_argument('--lr-warmup-init', type=float, default=0.0, + help='Initial value for learning rate warmup. The ' + 'scheduler starts warmup from this value.') + group.add_argument('--warmup', type=int, default=None, + help='Old lr warmup argument, do not use. Use one of the' + '--lr-warmup-* arguments above') + group.add_argument('--min-lr', type=float, default=0.0, + help='Minimum value for learning rate. The scheduler' + 'clip values below this threshold.') + group.add_argument('--override-opt_param-scheduler', action='store_true', + help='Reset the values of the scheduler (learning rate,' + 'warmup iterations, minimum learning rate, maximum ' + 'number of iterations, and decay style from input ' + 'arguments and ignore values from checkpoints. Note' + 'that all the above values will be reset.') + group.add_argument('--use-checkpoint-opt_param-scheduler', action='store_true', + help='Use checkpoint to set the values of the scheduler ' + '(learning rate, warmup iterations, minimum learning ' + 'rate, maximum number of iterations, and decay style ' + 'from checkpoint and ignore input arguments.') + group.add_argument('--decoupled-lr', type=float, default=None, + help='Separate learning rate for the input and output layer') + group.add_argument('--decoupled-min-lr', type=float, default=None, + help='Minimum value for learning rate for the input and output layer. The scheduler' + 'clip values below this threshold') + + return parser + + +def _add_checkpointing_args(parser): + group = parser.add_argument_group(title='checkpointing') + + group.add_argument('--save', type=str, default=None, + help='Output directory to save checkpoints to.') + group.add_argument('--save-interval', type=int, default=None, + help='Number of iterations between checkpoint saves.') + group.add_argument('--no-save-optim', action='store_true', default=None, + help='Do not save current optimizer.') + group.add_argument('--no-save-rng', action='store_true', default=None, + help='Do not save current rng state.') + group.add_argument('--load', type=str, default=None, + help='Directory containing a model checkpoint.') + group.add_argument('--no-load-optim', action='store_true', default=None, + help='Do not load optimizer when loading checkpoint.') + group.add_argument('--no-load-rng', action='store_true', default=None, + help='Do not load rng state when loading checkpoint.') + group.add_argument('--finetune', action='store_true', + help='Load model for finetuning. Do not load optimizer ' + 'or rng state from checkpoint and set iteration to 0. ' + 'Assumed when loading a release checkpoint.') + group.add_argument('--pretrained-checkpoint', type=str, default=None, + help='Directory containing a pretrained model checkpoint for finetuning.') + group.add_argument('--ckpt-step', type=int, default=None, + help='Checkpoint step to load model from.') + group.add_argument('--no-initialization', action='store_false', + help='Do not perform initialization when building model, ' + 'can reduce startup time when definitely loading from a ' + 'checkpoint', + dest='perform_initialization') + group.add_argument('--use-checkpoint-args', action='store_true', + help='Override any command line arguments with arguments ' + 'from the checkpoint') + group.add_argument('--exit-on-missing-checkpoint', action='store_true', + help="If '--load' is set, but checkpoint is not found " + "(e.g., path typo), then exit instead of random " + "initialization.") + group.add_argument('--use-dist-ckpt', action='store_true', + help='Use distributed checkpoint format.') + group.add_argument('--auto-detect-ckpt-format', action='store_true', + help='Determine if the checkpoint format is in legacy or distributed format.' + ' If False, expects distributed checkpoint iff args.use_dist_ckpt.' + ' Might slow down loading a bit (double rank0 ckpt load).') + group.add_argument('--dist-ckpt-format', type=str, default='torch_dist', + choices=['zarr', 'torch_dist'], + help='Distributed checkpoint format to use.') + group.add_argument('--ckpt-fully-parallel-save', action='store_true', + dest='ckpt_fully_parallel_save_deprecated', + help='Deprecated: see --no-ckpt-fully-parallel-save.') + group.add_argument('--no-ckpt-fully-parallel-save', action='store_false', + dest='ckpt_fully_parallel_save', + help='Disable applying full save parallelization across DP for' + ' distributed checkpoints. Depending on ckpt format' + ' might decrease the number of files in the checkpoint.' + ' Makes DistributedOptimizer checkpoint non-reshardable.') + group.add_argument('--async-save', action='store_true', default=None, + help='Apply async checkpointing save. Currently works only with' + '`torch_dist` distributed checkpoint format.') + group.add_argument('--ckpt-fully-parallel-load', action='store_true', + help='Apply full load parallelization across DP for' + ' distributed checkpoints.') + group.add_argument('--ckpt-assume-constant-structure', action='store_true', + help='If the model and optimizer state dict structure is' + 'constant throughout a *single training job*, it allows for' + 'different checkpointing performance optimizations.') + group.add_argument('--dist-ckpt-strictness', type=str, default='assume_ok_unexpected', + choices=[e.value for e in StrictHandling], + help='Determine handling of key mismatch during checkpoint load.' + ' Check StrictHandling docs for flags meaning.' + ' NOTE: This flag controls only distributed checkpoint' + ' load from storage, not loading state dict into the model.') + return parser + + +def _add_mixed_precision_args(parser): + group = parser.add_argument_group(title='mixed precision') + + group.add_argument('--fp16', action='store_true', + help='Run model in fp16 mode.') + group.add_argument('--bf16', action='store_true', + help='Run model in bfloat16 mode.') + group.add_argument('--loss-scale', type=float, default=None, + help='Static loss scaling, positive power of 2 ' + 'values can improve fp16 convergence. If None, dynamic' + 'loss scaling is used.') + group.add_argument('--initial-loss-scale', type=float, default=2**32, + help='Initial loss-scale for dynamic loss scaling.') + group.add_argument('--min-loss-scale', type=float, default=1.0, + help='Minimum loss scale for dynamic loss scaling.') + group.add_argument('--loss-scale-window', type=float, default=1000, + help='Window over which to raise/lower dynamic scale.') + group.add_argument('--hysteresis', type=int, default=2, + help='hysteresis for dynamic loss scaling') + group.add_argument('--fp32-residual-connection', action='store_true', + help='Move residual connections to fp32.') + group.add_argument('--apply-query-key-layer-scaling', action='store_true', + help='Scale Q * K^T by 1 / layer-number. ' + 'Useful for fp16 training. Also sets `attention_softmax_in_fp32` to True.') + group.add_argument('--attention-softmax-in-fp32', action='store_true', + help='Run attention masking and softmax in fp32.') + group.add_argument('--accumulate-allreduce-grads-in-fp32', + action='store_true', + help='Gradient accumulation and all-reduce in fp32.') + group.add_argument('--fp16-lm-cross-entropy', action='store_true', + help='Move the cross entropy unreduced loss calculation' + 'for lm head to fp16.') + + return parser + + +def _add_distributed_args(parser): + group = parser.add_argument_group(title='distributed') + + group.add_argument('--tensor-model-parallel-size', type=int, default=1, + help='Degree of tensor model parallelism.') + group.add_argument('--pipeline-model-parallel-size', type=int, default=1, + help='Degree of pipeline model parallelism.') + group.add_argument('--pipeline-model-parallel-split-rank', + type=int, default=None, + help='Rank where encoder and decoder should be split.') + group.add_argument('--model-parallel-size', type=int, default=None, + help='Old model parallel argument, do not use. Use ' + '--tensor-model-parallel-size instead.') + group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None, + help='Number of layers per virtual pipeline stage') + group.add_argument('--no-overlap-p2p-communication', action='store_false', + help='overlap pipeline parallel communication with forward and backward chunks', + dest='overlap_p2p_comm') + group.add_argument('--distributed-backend', default='nccl', + choices=['nccl', 'gloo'], + help='Which backend to use for distributed training.') + group.add_argument('--distributed-timeout-minutes', type=int, default=10, + help='Timeout minutes for torch.distributed.') + group.add_argument('--overlap-grad-reduce', action='store_true', + default=False, help='If set, overlap DDP grad reduce.') + group.add_argument('--defer-embedding-wgrad-compute', action='store_true', + default=False, help='If set, defers the vocabulary projection linear layer weight' + 'gradient compute to pipeline flush.', dest='defer_embedding_wgrad_compute') + group.add_argument('--wgrad-deferral-limit', type=int, default=0, help='Number of micro-batches for which' + 'weight gradient computation of vocabulary projection is deferred, defaults to 0 which' + 'means all the micro-batches are deferred. Invalid if `defer-embedding-wgrad-compute`' + 'is not set') + group.add_argument('--no-delay-grad-reduce', action='store_false', + help='If not set, delay / synchronize grad reductions in all but first PP stage.', + dest='delay_grad_reduce') + group.add_argument('--ddp-bucket-size', type=int, default=None, + help='Bucket size for data-parallel communication') + group.add_argument('--ddp-average-in-collective', action='store_true', + default=False, help='If set, average directly in data-parallel communication collective.') + group.add_argument('--overlap-param-gather', action='store_true', + default=False, help='If set, overlap param all-gather in distributed optimizer.') + group.add_argument('--delay-param-gather', action='store_true', + default=False, help='If set, delay / synchronize param all-gathers in all but first PP stage.') + group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false', + help='If not set, use scatter/gather to optimize communication of tensors in pipeline.', + dest='scatter_gather_tensors_in_pipeline') + group.add_argument('--use-ring-exchange-p2p', action='store_true', + default=False, help='If set, use custom-built ring exchange ' + 'for p2p communications. Note that this option will require ' + 'a custom built image that support ring-exchange p2p.') + group.add_argument('--local_rank', type=int, default=None, + help='local rank passed from distributed launcher.') + group.add_argument('--lazy-mpu-init', type=bool, required=False, + help='If set to True, initialize_megatron() ' + 'skips DDP initialization and returns function to ' + 'complete it instead.Also turns on ' + '--use-cpu-initialization flag. This is for ' + 'external DDP manager.' ) + group.add_argument('--standalone-embedding-stage', action='store_true', + default=False, help='If set, *input* embedding layer ' + 'is placed on its own pipeline stage, without any ' + 'transformer layers. (For T5, this flag currently only ' + 'affects the encoder embedding.)') + group.add_argument('--use-distributed-optimizer', action='store_true', + help='Use distributed optimizer.') + group.add_argument('--context-parallel-size', type=int, default=1, + help='Degree of context parallelism.') + group.add_argument('--nccl-communicator-config-path', type=str, default=None, + help='Path to the yaml file with NCCL communicator ' + 'configurations. The number of min/max thread groups and thread ' + 'group cluster size of each communicator can be configured by ' + 'setting `min_ctas`, `max_ctas`, and `cga_cluster_size`.') + group.add_argument('--use-tp-pp-dp-mapping', action='store_true', default=False, + help='If set, distributed ranks initialize order is changed ' + 'from tp-dp-pp to tp-pp-dp. Make sure EP and CP aren\'t used ' + 'with this option enabled') + return parser + + +def _add_validation_args(parser): + group = parser.add_argument_group(title='validation') + + group.add_argument('--eval-iters', type=int, default=100, + help='Number of iterations to run for evaluation' + 'validation/test for.') + group.add_argument('--eval-interval', type=int, default=1000, + help='Interval between running evaluation on ' + 'validation set.') + group.add_argument("--test-mode", action="store_true", help='Run all real-time test alongside the experiment.') + group.add_argument('--skip-train', action='store_true', + default=False, help='If set, bypass the training loop, ' + 'optionally do evaluation for validation/test, and exit.') + + return parser + + +def _add_data_args(parser): + group = parser.add_argument_group(title='data and dataloader') + + group.add_argument('--data-path', nargs='*', default=None, + help='The weight and prefix list for a set of train, validation, and test' + 'datasets which split according to --split. The accepted formats are: ' + '(1) a single prefix, ' + '(2) a list of weight prefix pairs e.g. weight1 prefix1 weight2 prefix2, ' + '(3) a list of prefixes e.g. prefix1 prefix2. ' + 'For (3), weights are inferred from the lengths of the contributing datasets. ' + 'This argument is exclusive to the other independent --*-data-path arguments.') + group.add_argument('--split', type=str, default=None, + help='Comma-separated list of proportions for training,' + ' validation, and test split. For example the split ' + '`90,5,5` will use 90%% of data for training, 5%% for ' + 'validation and 5%% for test.') + group.add_argument('--train-data-path', nargs='*', default=None, + help='The weight and prefix list for an independent train dataset. ' + 'Follows the same pattern rules as --data-path.') + group.add_argument('--valid-data-path', nargs='*', default=None, + help='The weight and prefix list for an independent validation dataset. ' + 'Follows the same pattern rules as --data-path.') + group.add_argument('--test-data-path', nargs='*', default=None, + help='The weight and prefix list for an independent test dataset. ' + 'Follows the same pattern rules as --data-path.') + group.add_argument('--data-cache-path', default=None, + help='Path to a directory to hold cached index files.') + group.add_argument('--no-mmap-bin-files', action='store_false', + help='Disable mmap-ing of .bin files.', + dest='mmap_bin_files') + group.add_argument('--mock-data', action='store_true', + help='Skip data loading and validation and opt for artificial ' + 'generation of mock data when an implementation is available.') + group.add_argument('--vocab-size', type=int, default=None, + help='Size of vocab before EOD or padding.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file.') + group.add_argument('--merge-file', type=str, default=None, + help='Path to the BPE merge file.') + group.add_argument('--vocab-extra-ids', type=int, default=0, + help='Number of additional vocabulary tokens. ' + 'They are used for span masking in the T5 model') + group.add_argument('--seq-length', type=int, default=None, + help='Maximum sequence length to process.') + group.add_argument('--encoder-seq-length', type=int, default=None, + help='Maximum encoder sequence length to process.' + 'This should be exclusive of --seq-length') + group.add_argument('--decoder-seq-length', type=int, default=None, + help="Maximum decoder sequence length to process.") + group.add_argument('--retriever-seq-length', type=int, default=256, + help='Maximum sequence length for the biencoder model ' + 'for retriever') + group.add_argument('--sample-rate', type=float, default=1.0, + help='sample rate for training data. Supposed to be 0 ' + ' < sample_rate < 1') + group.add_argument('--mask-prob', type=float, default=0.15, + help='Probability of replacing a token with mask.') + group.add_argument('--short-seq-prob', type=float, default=0.1, + help='Probability of producing a short sequence.') + group.add_argument('--num-workers', type=int, default=2, + help="Dataloader number of workers.") + group.add_argument('--tokenizer-type', type=str, + default=None, + choices=['BertWordPieceLowerCase', + 'BertWordPieceCase', + 'GPT2BPETokenizer', + 'SentencePieceTokenizer', + 'GPTSentencePieceTokenizer', + 'HuggingFaceTokenizer', + 'Llama2Tokenizer', + 'Llama3Tokenizer', + 'MistralTokenizer', + 'TikTokenizer', + 'NullTokenizer'], + help='What type of tokenizer to use.') + group.add_argument('--tokenizer-model', type=str, default=None, + help='Sentencepiece tokenizer model.') + group.add_argument('--tiktoken-pattern', type=str, default=None, + help='Which tiktoken pattern to use. Options: [v1, v2]') + group.add_argument('--tiktoken-num-special-tokens', type=int, default=1000, + help='Number of special tokens in tiktoken tokenizer') + group.add_argument('--tiktoken-special-tokens', type=str, nargs='+', default=None, + help='List of tiktoken special tokens, needs to have ["", "", ""]') + group.add_argument('--reset-position-ids', action='store_true', + help='Reset posistion ids after end-of-document token.') + group.add_argument('--reset-attention-mask', action='store_true', + help='Reset self attention maske after ' + 'end-of-document token.') + group.add_argument('--eod-mask-loss', action='store_true', + help='Mask loss for the end of document tokens.') + group.add_argument('--no-create-attention-mask-in-dataloader', action='store_false', + help='If set, do not create attention_masks in dataloader.', + dest='create_attention_mask_in_dataloader') + group.add_argument('--num-dataset-builder-threads', type=int, default=1, + help='Number of parallel threads per rank for dataset builder') + group.add_argument('--s3-cache-path', type=str, default=None, + help='Path to cache index files when using s3 dataloader') + return parser + + +def _add_autoresume_args(parser): + group = parser.add_argument_group(title='autoresume') + + group.add_argument('--adlr-autoresume', action='store_true', + help='Enable autoresume on adlr cluster.') + group.add_argument('--adlr-autoresume-interval', type=int, default=1000, + help='Intervals over which check for autoresume' + 'termination signal') + + return parser + + +def _add_biencoder_args(parser): + group = parser.add_argument_group(title='biencoder') + + # network size + group.add_argument('--ict-head-size', type=int, default=None, + help='Size of block embeddings to be used in ICT and ' + 'REALM (paper default: 128)') + group.add_argument('--biencoder-projection-dim', type=int, default=0, + help='Size of projection head used in biencoder (paper' + ' default: 128)') + group.add_argument('--biencoder-shared-query-context-model', action='store_true', + help='Whether to share the parameters of the query ' + 'and context models or not') + + # checkpointing + group.add_argument('--ict-load', type=str, default=None, + help='Directory containing an ICTBertModel checkpoint') + group.add_argument('--bert-load', type=str, default=None, + help='Directory containing an BertModel checkpoint ' + '(needed to start ICT and REALM)') + + # data + group.add_argument('--titles-data-path', type=str, default=None, + help='Path to titles dataset used for ICT') + group.add_argument('--query-in-block-prob', type=float, default=0.1, + help='Probability of keeping query in block for ' + 'ICT dataset') + group.add_argument('--use-one-sent-docs', action='store_true', + help='Whether to use one sentence documents in ICT') + group.add_argument('--evidence-data-path', type=str, default=None, + help='Path to Wikipedia Evidence frm DPR paper') + + # training + group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int, + default=[], help="Which top-k accuracies to report " + "(e.g. '1 5 20')") + group.add_argument('--retriever-score-scaling', action='store_true', + help='Whether to scale retriever scores by inverse ' + 'square root of hidden size') + + # faiss index + group.add_argument('--block-data-path', type=str, default=None, + help='Where to save/load BlockData to/from') + group.add_argument('--embedding-path', type=str, default=None, + help='Where to save/load Open-Retrieval Embedding' + ' data to/from') + + # indexer + group.add_argument('--indexer-batch-size', type=int, default=128, + help='How large of batches to use when doing indexing ' + 'jobs') + group.add_argument('--indexer-log-interval', type=int, default=1000, + help='After how many batches should the indexer ' + 'report progress') + return parser + + +def _add_vision_args(parser): + group = parser.add_argument_group(title="vision") + + # general vision arguements + group.add_argument('--num-classes', type=int, default=1000, + help='num of classes in vision classificaiton task') + group.add_argument('--img-h', type=int, default=224, + help='Image height for vision classification task') + group.add_argument('--img-w', type=int, default=224, + help='Image height for vision classification task') + group.add_argument('--num-channels', type=int, default=3, + help='Number of channels in input image data') + group.add_argument('--patch-dim', type=int, default=16, + help='patch dimension') + group.add_argument('--classes-fraction', type=float, default=1.0, + help='training with fraction of classes.') + group.add_argument('--data-per-class-fraction', type=float, default=1.0, + help='training with fraction of data per class.') + group.add_argument('--no-data-sharding', action='store_false', + help='Disable data sharding.', + dest='data_sharding') + group.add_argument('--head-lr-mult', type=float, default=1.0, + help='learning rate multiplier for head during finetuning') + + # pretraining type and backbone selection` + group.add_argument('--vision-pretraining', action='store_true', + help='flag to indicate vision pretraining') + group.add_argument('--vision-pretraining-type', type=str, default='classify', + choices=['classify', 'inpaint', 'dino'], + help='pretraining objectives') + group.add_argument('--vision-backbone-type', type=str, default='vit', + choices=['vit', 'mit', 'swin'], + help='backbone types types') + group.add_argument('--swin-backbone-type', type=str, default='tiny', + choices=['tiny', 'base', 'h3'], + help='pretraining objectives') + # inpainting arguments + group.add_argument('--mask-type', type=str, default='random', + choices=['random', 'row'], + help='mask types') + group.add_argument('--mask-factor', type=float, default=1.0, + help='mask size scaling parameter') + + # dino arguments + group.add_argument('--iter-per-epoch', type=int, default=1250, + help='iterations per epoch') + group.add_argument('--dino-local-img-size', type=int, default=96, + help='Image size for vision classification task') + group.add_argument('--dino-local-crops-number', type=int, default=10, + help='Number of local crops') + group.add_argument('--dino-head-hidden-size', type=int, default=2048, + help='Hidden dimension size in dino head') + group.add_argument('--dino-bottleneck-size', type=int, default=256, + help='Bottle neck dimension in dino head ') + group.add_argument('--dino-freeze-last-layer', type=float, default=1, + help='Freezing last layer weights') + group.add_argument('--dino-norm-last-layer', action='store_true', + help='Disable Norm in last layer.') + group.add_argument('--dino-warmup-teacher-temp', type=float, default=0.04, + help='warump teacher temperature') + group.add_argument('--dino-teacher-temp', type=float, default=0.07, + help='teacher temperature') + group.add_argument('--dino-warmup-teacher-temp-epochs', type=int, default=30, + help='warmup teacher temperaure epochs') + + # regularization arguments + group.add_argument('--qk-layernorm', action='store_true', + help='Whether to layer normalize the q and k attention embeddings.') + + return parser + +def _add_moe_args(parser): + group = parser.add_argument_group(title="moe") + group.add_argument('--expert-model-parallel-size', type=int, default=1, + help='Degree of expert model parallelism.') + group.add_argument('--num-experts', type=int, default=None, + help='Number of Experts in MoE (None means no MoE)') + group.add_argument('--moe-router-load-balancing-type', type=str, + choices=['aux_loss', 'sinkhorn', 'none'], + default='aux_loss', + help='Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer, "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss".') + group.add_argument('--moe-router-topk', type=int, default=2, + help='Number of experts to route to for each token. The default is 2.') + group.add_argument('--moe-router-pre-softmax', action='store_true', + help='Enable pre-softmax routing for MoE, which means the top-k selection is before the softmax. By default, top-k is done after the softmax.') + group.add_argument('--moe-grouped-gemm', action='store_true', + help='When there are multiple experts per rank, compress multiple local (potentially small) gemms in a single kernel launch to improve the utilization and performance by leveraging the Grouped GEMM feature introduced since CUTLASS 2.8 (https://github.com/fanshiqing/grouped_gemm).') + group.add_argument('--moe-aux-loss-coeff', type=float, default=0.0, + help='Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended.') + group.add_argument('--moe-z-loss-coeff', type=float, default=None, + help='Scaling coefficient for the z-loss: a starting value of 1e-3 is recommended.') + group.add_argument('--moe-input-jitter-eps', type=float, default=None, + help='Add noise to the input tensor by applying jitter with a specified epsilon value.') + group.add_argument('--moe-token-dispatcher-type', type=str, + choices=['allgather', 'alltoall'], + default='allgather', + help='.') + group.add_argument('--moe-per-layer-logging', action='store_true', + help='Enable per-layer logging for MoE, currently supports auxiliary loss and z loss.') + # Token dropping arguments + group.add_argument('--moe-expert-capacity-factor', type=float, default=None, + help='The capacity factor for each expert, None means no token will be dropped.') + group.add_argument('--moe-pad-expert-input-to-capacity', action='store_true', + help='Pads the input for each expert to match the expert capacity length, effective only after the --moe-expert-capacity-factor is set.') + group.add_argument('--moe-token-drop-policy', type=str, default='probs', choices=['probs', 'position'], + help='The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped.') + group.add_argument('--moe-layer-recompute', action='store_true', + help='Enable checkpointing for moe_layer, should be used when memory is not sufficient.') + group.add_argument('--moe-extended-tp', action='store_true', + help='Alternative to expert parallelism, all experts are sharded across TPXEP domain.') + + return parser + +def _add_experimental_args(parser): + group = parser.add_argument_group(title='experimental') + + group.add_argument('--spec', type=str, default=None, nargs='*', + help='Specify the pair ' + 'that returns a spec to customize a model, transformer ' + 'block, or transformer layer, depending on the use case.' + 'To use local spec specify local as the argument.' + 'For more details, see the model class, ' + '`transformer_block.py`, or `transformer_layer.py`') + group.add_argument('--hybrid-attention-ratio', type=float, default=0.0, + help='Ratio of attention layers to total layers, in the ' + 'range [0.0, 1.0].') + group.add_argument('--hybrid-mlp-ratio', type=float, default=0.0, + help='Ratio of mlp layers to total layers, in the ' + 'range [0.0, 1.0].') + group.add_argument('--hybrid-override-pattern', type=str, default=None, + help='Force a specific hybrid layer pattern. If a value' + 'greater than 0.0 is supplied to any of the hybrid ratio' + 'arguments, then the number of each type of layer in the' + 'override pattern must match number in the overidden' + 'pattern') + group.add_argument('--yaml-cfg', type=str, default=None, + help = 'Config file to add additional arguments') + + return parser diff --git a/megatron/training/async_utils.py b/megatron/training/async_utils.py new file mode 100644 index 0000000..44530ad --- /dev/null +++ b/megatron/training/async_utils.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +This module provides a singleton instance of AsyncCallsQueue which manages +the async checkpoint save calls. +""" +import logging + +from megatron.core.dist_checkpointing.strategies.async_utils import AsyncCallsQueue, AsyncRequest +from megatron.training import get_args +from megatron.training.utils import print_rank_0 + +logger = logging.getLogger(__name__) + +# Singleton manager of async calls +_async_calls_queue = AsyncCallsQueue() + + +def schedule_async_save(async_request: AsyncRequest): + """ Schedule the async save request. + + Args: + async_request (AsyncRequest): the async save request. + """ + _async_calls_queue.schedule_async_request(async_request) + + +def maybe_finalize_async_save(blocking: bool = False): + """ Finalizes active async save calls. + + Args: + blocking (bool, optional): if True, will wait until all active requests + are done. Otherwise, finalizes only the async request that already + finished. Defaults to False. + """ + args = get_args() + if not args.async_save: + return + + if blocking and _async_calls_queue.get_num_unfinalized_calls() > 0: + print_rank_0('Unfinalized async checkpoint saves. Finalizing them synchronously now.') + + _async_calls_queue.maybe_finalize_async_calls(blocking) diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py new file mode 100644 index 0000000..75847ec --- /dev/null +++ b/megatron/training/checkpointing.py @@ -0,0 +1,972 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Input/output checkpointing.""" + +from logging import getLogger +import os +import random +import sys +import numpy as np +from time import time + +import torch + +from megatron.core import mpu, tensor_parallel, dist_checkpointing +from megatron.core.dist_checkpointing.mapping import ShardedObject +from megatron.core.dist_checkpointing.serialization import get_default_load_sharded_strategy +from megatron.core.dist_checkpointing.strategies.fully_parallel import \ + FullyParallelSaveStrategyWrapper, FullyParallelLoadStrategyWrapper +from megatron.core.num_microbatches_calculator import update_num_microbatches +from .async_utils import schedule_async_save +from .global_vars import get_args, get_one_logger +from .utils import unwrap_model, print_rank_0, append_to_progress_log, is_last_rank +from ..core.dist_checkpointing.serialization import \ + get_default_save_sharded_strategy +from .one_logger_utils import on_save_checkpoint_start, on_save_checkpoint_success + +# [ModelOpt]: Import +try: + from modelopt.torch.opt.plugins import ( + save_modelopt_state, + save_sharded_modelopt_state, + restore_modelopt_state, + restore_sharded_modelopt_state, + ) + has_nvidia_modelopt = True +except Exception: + has_nvidia_modelopt = False + +_CHECKPOINT_VERSION = None + +logger = getLogger(__name__) + +def set_checkpoint_version(value): + global _CHECKPOINT_VERSION + if _CHECKPOINT_VERSION is not None: + assert _CHECKPOINT_VERSION == value, \ + "checkpoint versions do not match" + _CHECKPOINT_VERSION = value + + +def get_checkpoint_version(): + global _CHECKPOINT_VERSION + return _CHECKPOINT_VERSION + + +def check_checkpoint_args(checkpoint_args): + """Ensure fixed arguments for a model are the same for the input + arguments and the one retrieved from checkpoint.""" + args = get_args() + + def _compare(arg_name, old_arg_name=None, default=None): + if old_arg_name is not None: + ckpt_arg_name = old_arg_name + else: + ckpt_arg_name = arg_name + if default is not None: + checkpoint_value = getattr(checkpoint_args, ckpt_arg_name, default) + else: + checkpoint_value = getattr(checkpoint_args, ckpt_arg_name) + args_value = getattr(args, arg_name) + error_message = '{} value from checkpoint ({}) is not equal to the ' \ + 'input argument value ({}).'.format( + arg_name, checkpoint_value, args_value) + assert checkpoint_value == args_value, error_message + + _compare('num_layers') + _compare('hidden_size') + _compare('num_attention_heads') + _compare('add_position_embedding', default=True) + if args.vocab_file: + _compare('max_position_embeddings') + _compare('make_vocab_size_divisible_by') + if not args.use_dist_ckpt: + _compare('padded_vocab_size') + _compare('tokenizer_type') + if args.data_parallel_random_init: + _compare('data_parallel_random_init') + if get_checkpoint_version() < 3.0: + _compare('tensor_model_parallel_size', + old_arg_name='model_parallel_size') + if get_checkpoint_version() >= 3.0 and not args.use_dist_ckpt: + _compare('tensor_model_parallel_size') + _compare('pipeline_model_parallel_size') + +def ensure_directory_exists(filename, check_parent=True): + """Build filename's path if it does not already exists.""" + dirname = os.path.dirname(filename) if check_parent else filename + os.makedirs(dirname, exist_ok=True) + + +def get_checkpoint_name(checkpoints_path, iteration, release=False, + pipeline_parallel=None, + tensor_rank=None, pipeline_rank=None, + expert_parallel=None, expert_rank=None, + return_base_dir=False): + """Determine the directory name for this rank's checkpoint.""" + if release: + directory = 'release' + else: + directory = 'iter_{:07d}'.format(iteration) + if return_base_dir: + common_path = os.path.join(checkpoints_path, directory) + return common_path + + # Use both the tensor and pipeline MP rank. + if pipeline_parallel is None: + pipeline_parallel = (mpu.get_pipeline_model_parallel_world_size() > 1) + if tensor_rank is None: + tensor_rank = mpu.get_tensor_model_parallel_rank() + if pipeline_rank is None: + pipeline_rank = mpu.get_pipeline_model_parallel_rank() + if expert_parallel is None: + expert_parallel = (mpu.get_expert_model_parallel_world_size() > 1) + if expert_rank is None: + expert_rank = mpu.get_expert_model_parallel_rank() + + # Use both the tensor and pipeline MP rank. If using the distributed + # optimizer, then the optimizer's path must additionally include the + # data parallel rank. + if not pipeline_parallel: + common_path = os.path.join(checkpoints_path, directory, + f'mp_rank_{tensor_rank:02d}') + else: + common_path = os.path.join(checkpoints_path, directory, + f'mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}') + + if expert_parallel: + common_path = common_path + f'_{expert_rank:03d}' + + return os.path.join(common_path, "model_optim_rng.pt") + + +def get_distributed_optimizer_checkpoint_name(model_checkpoint_name): + return os.path.join(os.path.dirname(model_checkpoint_name), + "distrib_optim.pt") + + +def find_checkpoint_rank_0(checkpoints_path, iteration, release=False): + """Finds the checkpoint for rank 0 without knowing if we are using + pipeline parallelism/expert parallelism or not. + + Since the checkpoint naming scheme changes if pipeline or expert + parallelism is present, we need to look for both naming schemes if + we don't know if the checkpoint has pipeline or expert parallelism. + """ + + # Look for checkpoint with no pipelining and no expert parallelism + filename = get_checkpoint_name(checkpoints_path, iteration, release, + pipeline_parallel=False, + tensor_rank=0, pipeline_rank=0, + expert_parallel=False, expert_rank=0) + if os.path.isfile(filename): + return filename + + # Look for checkpoint with no pipelining and expert parallelism + filename = get_checkpoint_name(checkpoints_path, iteration, release, + pipeline_parallel=False, + tensor_rank=0, pipeline_rank=0, + expert_parallel=True, expert_rank=0) + if os.path.isfile(filename): + return filename + + # Look for checkpoint with pipelining and no expert parallelism + filename = get_checkpoint_name(checkpoints_path, iteration, release, + pipeline_parallel=True, + tensor_rank=0, pipeline_rank=0, + expert_parallel=False, expert_rank=0) + if os.path.isfile(filename): + return filename + + # Look for checkpoint with pipelining and expert parallelism + filename = get_checkpoint_name(checkpoints_path, iteration, release, + pipeline_parallel=True, + tensor_rank=0, pipeline_rank=0, + expert_parallel=True, expert_rank=0) + if os.path.isfile(filename): + return filename + + # Look for a distributed checkpoint + filename = get_checkpoint_name(checkpoints_path, iteration, release, + pipeline_parallel=True, + return_base_dir=True) + if dist_checkpointing.check_is_distributed_checkpoint(filename): + return filename + + return None + + +def get_checkpoint_tracker_filename(checkpoints_path): + + """Tracker file rescords the latest chckpoint during + training to restart from.""" + return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt') + + +def checkpoint_exists(checkpoints_path): + if checkpoints_path is None: + return False + load_step = 'latest_checkpointed_iteration.txt' + return os.path.exists(os.path.join(checkpoints_path, load_step)) + + +def read_metadata(tracker_filename): + # Read the tracker file and either set the iteration or + # mark it as a release checkpoint. + iteration = 0 + release = False + with open(tracker_filename, 'r') as f: + metastring = f.read().strip() + try: + iteration = int(metastring) + except ValueError: + release = metastring == 'release' + if not release: + print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format( + tracker_filename)) + sys.exit() + assert iteration > 0 or release, 'error parsing metadata file {}'.format( + tracker_filename) + + # Get the max iteration retrieved across the ranks. + if torch.distributed.is_initialized(): + iters_cuda = torch.tensor([iteration], dtype=torch.long, device='cuda') + torch.distributed.all_reduce(iters_cuda, op=torch.distributed.ReduceOp.MAX) + max_iter = iters_cuda[0].item() + + # We should now have all the same iteration. + # If not, print a warning and chose the maximum + # iteration across all ranks. + if iteration != max_iter: + rank = torch.distributed.get_rank() + print('WARNING: on rank {} found iteration {} in the ' + 'metadata while max iteration across the ranks ' + 'is {}, replacing it with max iteration.'.format( + rank, iteration, max_iter), flush=True) + else: + # When loading a checkpoint outside of training (for example, + # when editing it), we might not have torch distributed + # initialized, in this case, just assume we have the latest + max_iter = iteration + return max_iter, release + + +def get_rng_state(use_dist_ckpt: bool = False): + """ collect rng state across data parallel ranks """ + args = get_args() + rng_state = { + 'random_rng_state': random.getstate(), + 'np_rng_state': np.random.get_state(), + 'torch_rng_state': torch.get_rng_state(), + 'cuda_rng_state': torch.cuda.get_rng_state(), + 'rng_tracker_states': tensor_parallel.get_cuda_rng_tracker().get_states()} + + rng_state_list = None + if torch.distributed.is_initialized() and \ + mpu.get_data_parallel_world_size() > 1 and \ + args.data_parallel_random_init: + rng_state_list = \ + [None for i in range(mpu.get_data_parallel_world_size())] + torch.distributed.all_gather_object( + rng_state_list, + rng_state, + group=mpu.get_data_parallel_group()) + else: + rng_state_list = [rng_state] + + if use_dist_ckpt: + pp_rank = mpu.get_pipeline_model_parallel_rank() + pp_size = mpu.get_pipeline_model_parallel_world_size() + tp_rank = mpu.get_tensor_model_parallel_rank() + tp_size = mpu.get_tensor_model_parallel_world_size() + rng_state_list = ShardedObject('rng_state', rng_state_list, (pp_size, tp_size), (pp_rank, tp_rank), + replica_id=mpu.get_data_parallel_rank(with_context_parallel=True)) + + return rng_state_list + + +def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floating_point_operations_so_far, checkpointing_context=None, + pipeline_rank=None,expert_rank=None, tensor_rank=None, pipeline_parallel=None, expert_parallel=None): + """Save a model checkpoint. + + Checkpointing context is used to persist some checkpointing state + throughout a single job. Must be initialized externally (not used if None). + """ + start_ckpt = time() + args = get_args() + + # Prepare E2E metrics at start of save checkpoint + productive_metrics = on_save_checkpoint_start(args.async_save) + + # Only rank zero of the data parallel writes to the disk. + model = unwrap_model(model) + + ckpt_format = args.dist_ckpt_format if args.use_dist_ckpt else 'torch' + print_rank_0('saving checkpoint at iteration {:7d} to {} in {} format'.format( + iteration, args.save, ckpt_format)) + + # Collect rng state across data parallel ranks. + rng_state = get_rng_state(args.use_dist_ckpt) + + # Checkpoint name. + checkpoint_name = get_checkpoint_name(args.save, iteration, release=False, pipeline_parallel=pipeline_parallel, + tensor_rank=tensor_rank, pipeline_rank=pipeline_rank, expert_parallel=expert_parallel, expert_rank=expert_rank, return_base_dir=args.use_dist_ckpt) + + # Save distributed optimizer's custom parameter state. + if args.use_distributed_optimizer and not args.no_save_optim and optimizer is not None and not args.use_dist_ckpt: + optim_checkpoint_name = \ + get_distributed_optimizer_checkpoint_name(checkpoint_name) + ensure_directory_exists(optim_checkpoint_name) + optimizer.save_parameter_state(optim_checkpoint_name) + + async_save_request = None + if args.async_save: + if not args.use_dist_ckpt: + raise NotImplementedError('Async checkpoint save not implemented for legacy checkpoints') + elif args.dist_ckpt_format != 'torch_dist': + raise NotImplementedError(f'Async checkpoint save not implemented for {args.dist_ckpt_format} distributed checkpoint format') + + # Collect args, model, RNG. + if not torch.distributed.is_initialized() \ + or mpu.get_data_modulo_expert_parallel_rank(with_context_parallel=True) == 0 \ + or args.use_dist_ckpt: + + optim_sd_kwargs = {} + if args.use_dist_ckpt and args.use_distributed_optimizer: + optim_sd_kwargs['sharding_type'] = ('fully_sharded_model_space' + if args.ckpt_fully_parallel_save + else 'dp_zero_gather_scatter') + print_rank_0(f'Storing distributed optimizer sharded state of type {optim_sd_kwargs["sharding_type"]}') + state_dict = generate_state_dict(args, model, optimizer, opt_param_scheduler, rng_state, + args.use_dist_ckpt, iteration, optim_sd_kwargs=optim_sd_kwargs) + + state_dict['num_floating_point_operations_so_far'] = num_floating_point_operations_so_far + if args.use_dist_ckpt: + if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: + ensure_directory_exists(checkpoint_name, check_parent=False) + validate_sharding_integrity = True + save_strategy = (checkpointing_context or {}).get('save_strategy', + get_default_save_sharded_strategy(args.dist_ckpt_format)) + if args.ckpt_assume_constant_structure and args.dist_ckpt_format == 'torch_dist': + save_strategy.use_cached_ckpt_structure = args.ckpt_assume_constant_structure + if args.ckpt_fully_parallel_save: + if checkpointing_context is not None and 'save_strategy' in checkpointing_context: + # Already saved once before - don't need to rerun sharding validation + validate_sharding_integrity = not args.ckpt_assume_constant_structure + else: + save_strategy = FullyParallelSaveStrategyWrapper(save_strategy, mpu.get_data_parallel_group(with_context_parallel=True), + args.ckpt_assume_constant_structure) + # Store save strategy for future checkpoint saves + if checkpointing_context is not None: + checkpointing_context['save_strategy'] = save_strategy + end_ckpt = time() + logger.debug(f"rank: {torch.distributed.get_rank()}, takes {end_ckpt - start_ckpt} to prepare state dict for ckpt ") + async_save_request = dist_checkpointing.save(state_dict, checkpoint_name, save_strategy, + async_sharded_save=args.async_save) + + # [ModelOpt]: save sharded modelopt_state + if has_nvidia_modelopt: + save_sharded_modelopt_state(model, checkpoint_name, (args.dist_ckpt_format, 1)) + else: + # [ModelOpt]: Inject modelopt_state into state_dict + if has_nvidia_modelopt: + save_modelopt_state(model, state_dict) + + # Save. + ensure_directory_exists(checkpoint_name) + torch.save(state_dict, checkpoint_name) + start_misc = time() + if not args.async_save: + assert async_save_request is None + # Wait so everyone is done (necessary) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + # And update the latest iteration + if not torch.distributed.is_initialized() \ + or torch.distributed.get_rank() == 0: + tracker_filename = get_checkpoint_tracker_filename(args.save) + + def iter_finalize_fn(): + with open(tracker_filename, 'w') as f: + f.write(str(iteration)) + print_rank_0(' successfully saved checkpoint from iteration {:7d} to {}' + .format(iteration, args.save)) + if args.log_progress and args.async_save: + append_to_progress_log(f'Saved async checkpoint\tIteration: {iteration}', + barrier=False) + + if args.async_save: + assert async_save_request is not None + async_save_request.add_finalize_fn(iter_finalize_fn) + else: + iter_finalize_fn() + + # Additional callback for one_logger (last rank) + if not torch.distributed.is_initialized() \ + or is_last_rank(): + def onelogger_finalize_fn(): + on_save_checkpoint_success(productive_metrics, args.async_save) + if args.async_save: + assert async_save_request is not None + async_save_request.add_finalize_fn(onelogger_finalize_fn) + else: + onelogger_finalize_fn() + + if args.async_save: + schedule_async_save(async_save_request) + print_rank_0(' scheduled an async checkpoint save at iteration {:7d} to {}' \ + .format(iteration, args.save)) + + # Wait so everyone is done (not necessary) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + end_misc = time() + logger.debug(f"rank: {torch.distributed.get_rank()}, takes {end_misc - start_misc} to finalize ckpt save ") + +def generate_state_dict(args, model, optimizer, opt_param_scheduler, + rng_state, use_dist_ckpt=False, iteration=None, + optim_sd_kwargs=None): + # Arguments, iteration, and model. + state_dict = {} + state_dict['args'] = args + state_dict['checkpoint_version'] = 3.0 + if iteration is not None: + state_dict['iteration'] = iteration + + if len(model) == 1: + state_dict['model'] = (model[0].sharded_state_dict() + if use_dist_ckpt else + model[0].state_dict_for_save_checkpoint()) + else: + for i in range(len(model)): + mpu.set_virtual_pipeline_model_parallel_rank(i) + state_dict['model%d' % i] = ( + model[i].sharded_state_dict() + if use_dist_ckpt else + model[i].state_dict_for_save_checkpoint()) + # Optimizer stuff. + if not args.no_save_optim: + if optimizer is not None: + state_dict['optimizer'] = (optimizer.sharded_state_dict(state_dict, **(optim_sd_kwargs or {})) + if use_dist_ckpt else + optimizer.state_dict()) + if opt_param_scheduler is not None: + state_dict['opt_param_scheduler'] = \ + opt_param_scheduler.state_dict() + # RNG states. + if not args.no_save_rng: + state_dict["rng_state"] = rng_state + return state_dict + + +def _transpose_first_dim(t, num_splits, num_splits_first, model): + input_shape = t.size() + # We use a self_attention module but the values extracted aren't + # specific to self attention so should work for cross attention as well + while hasattr(model, 'module'): + model = model.module + attention_module = model.language_model.encoder.layers[0].self_attention + hidden_size_per_attention_head = attention_module.hidden_size_per_attention_head + num_attention_heads_per_partition = attention_module.num_attention_heads_per_partition + if num_splits_first: + """[num_splits * np * hn, h] + -->(view) [num_splits, np, hn, h] + -->(tranpose) [np, num_splits, hn, h] + -->(view) [np * num_splits * hn, h] """ + + intermediate_shape = \ + (num_splits, num_attention_heads_per_partition, + hidden_size_per_attention_head) + input_shape[1:] + + t = t.view(*intermediate_shape) + t = t.transpose(0, 1).contiguous() + else: + """[np * hn * num_splits, h] + -->(view) [np, hn, num_splits, h] + -->(tranpose) [np, num_splits, hn, h] + -->(view) [np * num_splits * hn, h] """ + + intermediate_shape = \ + (num_attention_heads_per_partition, + hidden_size_per_attention_head, num_splits) +\ + input_shape[1:] + + t = t.view(*intermediate_shape) + t = t.transpose(1, 2).contiguous() + t = t.view(*input_shape) + + return t + + +def fix_query_key_value_ordering(model, checkpoint_version): + """Fix up query/key/value matrix ordering if checkpoint + version is smaller than 2.0 + """ + if checkpoint_version < 2.0: + if isinstance(model, list): + assert len(model)==1 + model = model[0] + for name, param in model.named_parameters(): + if name.endswith(('.query_key_value.weight', '.query_key_value.bias')): + if checkpoint_version == 0: + fixed_param = _transpose_first_dim(param.data, 3, True, model) + elif checkpoint_version == 1.0: + fixed_param = _transpose_first_dim(param.data, 3, False, model) + else: + print_rank_0(f"Invalid checkpoint version {checkpoint_version}.") + sys.exit() + param.data.copy_(fixed_param) + if name.endswith(('.key_value.weight', '.key_value.bias')): + if checkpoint_version == 0: + fixed_param = _transpose_first_dim(param.data, 2, True, model) + elif checkpoint_version == 1.0: + fixed_param = _transpose_first_dim(param.data, 2, False, model) + else: + print_rank_0(f"Invalid checkpoint version {checkpoint_version}.") + sys.exit() + param.data.copy_(fixed_param) + print_rank_0(" succesfully fixed query-key-values ordering for" + " checkpoint version {}".format(checkpoint_version)) + + +def _load_base_checkpoint(load_dir, rank0=False, sharded_state_dict=None, + exit_on_missing_checkpoint=False, checkpoint_step = None): + """ Load the base state_dict from the given directory + + If rank0 is true, just loads rank 0 checkpoint, ignoring arguments. + """ + # Read the tracker file and set the iteration. + tracker_filename = get_checkpoint_tracker_filename(load_dir) + + # If no tracker file, return nothing + if not os.path.isfile(tracker_filename): + if not rank0: + print_rank_0('WARNING: could not find the metadata file {} '.format( + tracker_filename)) + print_rank_0(' will not load any checkpoints and will start from ' + 'random') + + # Conditionally exit if checkpoint not found. + if exit_on_missing_checkpoint: + print_rank_0(">> '--exit-on-missing-checkpoint' set ... exiting. <<") + if torch.distributed.is_initialized(): + torch.distributed.barrier() + sys.exit() + + return None, "", False + + # Otherwise, read the tracker file and either set the iteration or + # mark it as a release checkpoint. + if checkpoint_step is not None: + iteration = checkpoint_step + release = False + else: + iteration, release = read_metadata(tracker_filename) + + # Checkpoint. + if rank0: + checkpoint_name = find_checkpoint_rank_0(load_dir, iteration, release) + is_dist_ckpt = checkpoint_name is not None and dist_checkpointing.check_is_distributed_checkpoint(checkpoint_name) + else: + checkpoint_name = get_checkpoint_name(load_dir, iteration, release, + return_base_dir=True) + is_dist_ckpt = dist_checkpointing.check_is_distributed_checkpoint(checkpoint_name) + if not is_dist_ckpt: + checkpoint_name = get_checkpoint_name(load_dir, iteration, release, + return_base_dir=False) + dist_infix = "distributed " if is_dist_ckpt else "" + if release: + print_rank_0(f' loading release {dist_infix}checkpoint from {load_dir}') + else: + print_rank_0(f' loading {dist_infix}checkpoint from {load_dir} at iteration {iteration}') + + # Load the checkpoint. + if is_dist_ckpt: + if rank0: + state_dict = dist_checkpointing.load_common_state_dict(checkpoint_name) + return state_dict, checkpoint_name, release + + # at this point args are available + args = get_args() + if sharded_state_dict is None: + assert not args.auto_detect_ckpt_format and not args.use_dist_ckpt, (args.auto_detect_ckpt_format, args.use_dist_ckpt) + raise RuntimeError('Detected load from a distributed checkpoint, but neither --use-dist-ckpt nor --auto-detect-ckpt-format is set.') + + load_strategy = get_default_load_sharded_strategy(checkpoint_name) + if args.ckpt_fully_parallel_load: + load_strategy = FullyParallelLoadStrategyWrapper(load_strategy, + mpu.get_data_parallel_group(with_context_parallel=True)) + state_dict = dist_checkpointing.load(sharded_state_dict, checkpoint_name, load_strategy, strict=args.dist_ckpt_strictness) + return state_dict, checkpoint_name, release + + try: + state_dict = torch.load(checkpoint_name, map_location='cpu') + except ModuleNotFoundError: + from megatron.legacy.fp16_deprecated import loss_scaler + # For backward compatibility. + if not rank0: + print_rank_0(' > deserializing using the old code structure ...') + sys.modules['fp16.loss_scaler'] = sys.modules[ + 'megatron.legacy.fp16_deprecated.loss_scaler'] + sys.modules['megatron.fp16.loss_scaler'] = sys.modules[ + 'megatron.legacy.fp16_deprecated.loss_scaler'] + sys.modules['megatron.model'] = sys.modules['megatron.legacy.model'] + state_dict = torch.load(checkpoint_name, map_location='cpu') + sys.modules.pop('fp16.loss_scaler', None) + sys.modules.pop('megatron.fp16.loss_scaler', None) + sys.modules.pop('megatron.model', None) + except BaseException as e: + print('could not load the checkpoint') + print(e) + sys.exit() + + return state_dict, checkpoint_name, release + + +def load_args_from_checkpoint(args, load_arg='load', + exit_on_missing_checkpoint=False): + """Set required arguments from the checkpoint specified in the + arguments. + + Will overwrite arguments that have a non-None default value, but + will leave any arguments that default to None as set. + + Returns the same args NameSpace with the new values added/updated. + + If no checkpoint is specified in args, or if the checkpoint is + there but invalid, the arguments will not be modified + + """ + load_dir = getattr(args, load_arg) + + if load_dir is None: + print_rank_0('No load directory specified, using provided arguments.') + return args + + state_dict, checkpoint_name, release = _load_base_checkpoint( + load_dir, + rank0=True, + exit_on_missing_checkpoint=exit_on_missing_checkpoint, + checkpoint_step=args.ckpt_step + ) + + # Args. + if not state_dict: + print_rank_0('Checkpoint not found to provide arguments, using provided arguments.') + return args + + if 'args' not in state_dict: + print_rank_0('Checkpoint provided does not have arguments saved, using provided arguments.') + return args + + checkpoint_args = state_dict['args'] + checkpoint_version = state_dict.get('checkpoint_version', 0) + args.iteration = state_dict['iteration'] + + # One-off conversion for foundation models + if hasattr(checkpoint_args, 'disable_bias_linear'): + setattr(checkpoint_args, 'add_bias_linear', not getattr(checkpoint_args, 'disable_bias_linear')) + + def _set_arg(arg_name, old_arg_name=None, force=False): + if not force and getattr(args, arg_name, None) is not None: + return + + if old_arg_name is not None: + checkpoint_value = getattr(checkpoint_args, old_arg_name, None) + else: + checkpoint_value = getattr(checkpoint_args, arg_name, None) + + if checkpoint_value is not None: + print_rank_0(f"Setting {arg_name} to {checkpoint_value} from checkpoint") + setattr(args, arg_name, checkpoint_value) + else: + print_rank_0(f"Checkpoint did not provide arguments {arg_name}") + + _set_arg('num_layers') + _set_arg('hidden_size') + _set_arg('ffn_hidden_size') + _set_arg('seq_length') + _set_arg('num_attention_heads') + _set_arg('num_query_groups', force=True) + _set_arg('group_query_attention', force=True) + _set_arg('kv_channels') + _set_arg('max_position_embeddings') + _set_arg('position_embedding_type', force=True) + _set_arg('add_position_embedding', force=True) + _set_arg('use_rotary_position_embeddings', force=True) + _set_arg('rotary_percent', force=True) + _set_arg('rotary_interleaved', force=True) + _set_arg('add_bias_linear', force=True) + _set_arg('add_qkv_bias', force=True) + _set_arg('swiglu', force=True) + _set_arg('untie_embeddings_and_output_weights', force=True) + _set_arg('apply_layernorm_1p', force=True) + _set_arg('normalization', force=True) + _set_arg('tokenizer_type') + _set_arg('padded_vocab_size') + _set_arg('apply_query_key_layer_scaling', force=True) + if checkpoint_version < 3.0: + _set_arg('tensor_model_parallel_size', + 'model_parallel_size') + else: + _set_arg('tensor_model_parallel_size', force=True) + _set_arg('pipeline_model_parallel_size', force=True) + _set_arg('virtual_pipeline_model_parallel_size', force=True) + _set_arg('num_layers_per_virtual_pipeline_stage') + return args, checkpoint_args + + +def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', strict=True): + """Load a model checkpoint and return the iteration. + strict (bool): whether to strictly enforce that the keys in + :attr:`state_dict` of the checkpoint match the names of + parameters and buffers in model. + """ + args = get_args() + load_dir = getattr(args, load_arg) + + # Finetuning directories + pretrained_dir = getattr(args,'pretrained_checkpoint', None) + if pretrained_dir is not None and not checkpoint_exists(load_dir): + print_rank_0(f'Checkpoint file not found in load directory {load_dir} attempting to finetune with checkpoint in {pretrained_dir}') + load_dir = pretrained_dir + if not checkpoint_exists(load_dir): + raise FileNotFoundError("No checkpoint found in load directory or pretrained directory") + args.finetune = True + + + model = unwrap_model(model) + + load_kwargs = {} + is_dist_ckpt = False + if args.auto_detect_ckpt_format or args.use_dist_ckpt: + state_dict, checkpoint_name, release = _load_base_checkpoint(load_dir, rank0=True, exit_on_missing_checkpoint=args.exit_on_missing_checkpoint) + is_dist_ckpt = dist_checkpointing.check_is_distributed_checkpoint(checkpoint_name) + if is_dist_ckpt: + ckpt_tp_pp = (state_dict['args'].tensor_model_parallel_size, state_dict['args'].pipeline_model_parallel_size) + run_tp_pp = (mpu.get_tensor_model_parallel_world_size(), mpu.get_pipeline_model_parallel_world_size()) + mismatch_msg = "(TP, PP) mismatch after resume ({} vs {} from checkpoint)".format(ckpt_tp_pp, run_tp_pp) + + # Determine if RNG state will be loaded + if (ckpt_tp_pp == run_tp_pp and not release and not args.finetune and not args.no_load_rng + and not getattr(state_dict['args'], 'no_save_rng', False)): + gen_sd_rng_state = get_rng_state(True) # we can load the rng state + else: + gen_sd_rng_state = None + if ckpt_tp_pp != run_tp_pp: + print_rank_0("{}: RNG state will be ignored".format(mismatch_msg)) + + optim_sd_kwargs = dict(is_loading=True) + # Determine if optimizer state will be loaded + if (not release and not args.finetune and not args.no_load_optim + and not getattr(state_dict['args'], 'no_save_optim', False)): + gen_sd_optim = optimizer + gen_sd_opt_param_scheduler = opt_param_scheduler + + if args.use_distributed_optimizer: + optim_sd_kwargs['sharding_type'] = ('fully_sharded_model_space' + if getattr(state_dict['args'], 'ckpt_fully_parallel_save', False) + else 'dp_zero_gather_scatter') + # This is for backwards-compatibility. Can be removed once 'fully_sharded_bucket_space' loading is removed + for maybe_dist_opt_optim_state in (state_dict['optimizer'], *state_dict['optimizer'].values()): + if 'param_state_sharding_type' in maybe_dist_opt_optim_state: + if maybe_dist_opt_optim_state['param_state_sharding_type'] == 'fully_sharded_bucket_space': + print_rank_0('Detected deprecated `fully_sharded_bucket_space` DistributedOptimizer checkpoint format') + optim_sd_kwargs['sharding_type'] = maybe_dist_opt_optim_state['param_state_sharding_type'] + break + + if ckpt_tp_pp != run_tp_pp and optim_sd_kwargs['sharding_type'] != 'fully_sharded_model_space': + raise RuntimeError(f"{mismatch_msg}: not supported for DistributedOptimizer with sharding type {optim_sd_kwargs['sharding_type']}." + f" Please use `--ckpt-fully-parallel-save` flag during checkpoint saving.") + else: + gen_sd_optim = None + gen_sd_opt_param_scheduler = None + load_kwargs['sharded_state_dict'] = generate_state_dict(args, model, gen_sd_optim, gen_sd_opt_param_scheduler, + gen_sd_rng_state, True, optim_sd_kwargs=optim_sd_kwargs) + load_kwargs['exit_on_missing_checkpoint'] = args.exit_on_missing_checkpoint + + state_dict, checkpoint_name, release = _load_base_checkpoint(load_dir, rank0=False, **load_kwargs) + + # Checkpoint not loaded. + if state_dict is None: + # Iteration and num_floating_point_operations_so_far default to 0. + return 0, 0 + + # Set checkpoint version. + set_checkpoint_version(state_dict.get('checkpoint_version', 0)) + + # Set iteration. + if args.finetune or release: + iteration = 0 + else: + try: + iteration = state_dict['iteration'] + except KeyError: + try: # Backward compatible with older checkpoints + iteration = state_dict['total_iters'] + except KeyError: + print_rank_0('A metadata file exists but unable to load ' + 'iteration from checkpoint {}, exiting'.format(checkpoint_name)) + sys.exit() + num_floating_point_operations_so_far = state_dict.get('num_floating_point_operations_so_far', 0) + + # Check arguments. + assert args.consumed_train_samples == 0 + assert args.consumed_valid_samples == 0 + if 'args' in state_dict and not args.finetune: + checkpoint_args = state_dict['args'] + check_checkpoint_args(checkpoint_args) + args.consumed_train_samples = getattr(checkpoint_args, + 'consumed_train_samples', 0) + update_num_microbatches(consumed_samples=args.consumed_train_samples) + args.consumed_valid_samples = getattr(checkpoint_args, + 'consumed_valid_samples', 0) + else: + print_rank_0('could not find arguments in the checkpoint ...') + + # [ModelOpt]: loading modelopt_state (sharded or not) + if has_nvidia_modelopt: + if args.use_dist_ckpt: + restore_sharded_modelopt_state(model, checkpoint_name) + else: + restore_modelopt_state(model, state_dict) + + # Model. + strict = False if args.retro_add_retriever else strict + if len(model) == 1: + model[0].load_state_dict(state_dict['model'], strict=strict) + else: + for i in range(len(model)): + mpu.set_virtual_pipeline_model_parallel_rank(i) + model[i].load_state_dict(state_dict['model%d' % i], strict=strict) + + # Fix up query/key/value matrix ordering if needed. + checkpoint_version = get_checkpoint_version() + print_rank_0(f' checkpoint version {checkpoint_version}') + fix_query_key_value_ordering(model, checkpoint_version) + + # Optimizer. + if not release and not args.finetune and not args.no_load_optim: + try: + # Load state dict. + if optimizer is not None: + optimizer.load_state_dict(state_dict['optimizer']) + + # Load distributed optimizer's custom parameter state. + # For distributed checkpoint it's already loaded in load_state_dict above + if args.use_distributed_optimizer and not is_dist_ckpt: + tracker_filename = get_checkpoint_tracker_filename(load_dir) + iteration, release = read_metadata(tracker_filename) + model_checkpoint_name = \ + get_checkpoint_name(load_dir, iteration, release) + optim_checkpoint_name = \ + get_distributed_optimizer_checkpoint_name( + model_checkpoint_name) + optimizer.load_parameter_state(optim_checkpoint_name) + + # Load scheduler. + if opt_param_scheduler is not None: + if 'lr_scheduler' in state_dict: # backward compatbility + opt_param_scheduler.load_state_dict(state_dict['lr_scheduler']) + else: + opt_param_scheduler.load_state_dict(state_dict['opt_param_scheduler']) + except KeyError: + print_rank_0('Unable to load optimizer from checkpoint {}. ' + 'Specify --no-load-optim or --finetune to prevent ' + 'attempting to load the optimizer state, ' + 'exiting ...'.format(checkpoint_name)) + sys.exit() + else: + if (args.fp16 or args.bf16) and optimizer is not None: + optimizer.reload_model_params() + + # rng states. + if not release and not args.finetune and not args.no_load_rng: + try: + if 'rng_state' in state_dict: + # access rng_state for data parallel rank + if args.data_parallel_random_init: + rng_state = state_dict['rng_state'][mpu.get_data_parallel_rank()] + else: + rng_state = state_dict['rng_state'][0] + random.setstate(rng_state['random_rng_state']) + np.random.set_state(rng_state['np_rng_state']) + torch.set_rng_state(rng_state['torch_rng_state']) + torch.cuda.set_rng_state(rng_state['cuda_rng_state']) + # Check for empty states array + if not rng_state['rng_tracker_states']: + raise KeyError + tensor_parallel.get_cuda_rng_tracker().set_states( + rng_state['rng_tracker_states']) + else: # backward compatability + random.setstate(state_dict['random_rng_state']) + np.random.set_state(state_dict['np_rng_state']) + torch.set_rng_state(state_dict['torch_rng_state']) + torch.cuda.set_rng_state(state_dict['cuda_rng_state']) + # Check for empty states array + if not state_dict['rng_tracker_states']: + raise KeyError + tensor_parallel.get_cuda_rng_tracker().set_states( + state_dict['rng_tracker_states']) + except KeyError: + print_rank_0('Unable to load rng state from checkpoint {}. ' + 'Specify --no-load-rng or --finetune to prevent ' + 'attempting to load the rng state, ' + 'exiting ...'.format(checkpoint_name)) + sys.exit() + + # Some utilities want to load a checkpoint without distributed being initialized + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + print_rank_0(f' successfully loaded checkpoint from {load_dir} ' + f'[ t {mpu.get_tensor_model_parallel_rank()}, ' + f'p {mpu.get_pipeline_model_parallel_rank()} ] ' + f'at iteration {iteration}') + + return iteration, num_floating_point_operations_so_far + + +def load_biencoder_checkpoint(model, only_query_model=False, + only_context_model=False, custom_load_path=None): + """ + selectively load retrieval models for indexing/retrieving + from saved checkpoints + """ + + args = get_args() + + model = unwrap_model(model) + + load_path = custom_load_path if custom_load_path is not None else args.load + + tracker_filename = get_checkpoint_tracker_filename(load_path) + with open(tracker_filename, 'r') as f: + iteration = int(f.read().strip()) + + checkpoint_name = get_checkpoint_name(load_path, iteration, + args.use_distributed_optimizer, + release=False) + + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading checkpoint {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + + state_dict = torch.load(checkpoint_name, map_location='cpu') + ret_state_dict = state_dict['model'] + + if only_query_model: + ret_state_dict.pop('context_model') + if only_context_model: + ret_state_dict.pop('query_model') + + assert len(model) == 1 + model[0].load_state_dict(ret_state_dict) + torch.distributed.barrier() + + if mpu.get_data_parallel_rank() == 0: + print(' successfully loaded {}'.format(checkpoint_name)) + + return model diff --git a/megatron/training/dist_signal_handler.py b/megatron/training/dist_signal_handler.py new file mode 100644 index 0000000..a60204f --- /dev/null +++ b/megatron/training/dist_signal_handler.py @@ -0,0 +1,81 @@ +import signal + +import torch + + +def get_world_size(): + if torch.distributed.is_available() and torch.distributed.is_initialized(): + world_size = torch.distributed.get_world_size() + else: + world_size = 1 + return world_size + + +def get_device(local_rank=None): + backend = torch.distributed.get_backend() + if backend == 'nccl': + if local_rank is None: + device = torch.device('cuda') + else: + device = torch.device(f'cuda:{local_rank}') + elif backend == 'gloo': + device = torch.device('cpu') + else: + raise RuntimeError + return device + + +def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None): + if not torch.distributed.is_available() or \ + not torch.distributed.is_initialized(): + return [item] + + device = get_device(local_rank) + + if group is not None: + group_size = group.size() + else: + group_size = get_world_size() + + tensor = torch.tensor([item], device=device, dtype=dtype) + output_tensors = [ + torch.zeros(1, dtype=tensor.dtype, device=tensor.device) + for _ in range(group_size) + ] + torch.distributed.all_gather(output_tensors, tensor, group, async_op) + output = [elem.item() for elem in output_tensors] + return output + + +class DistributedSignalHandler: + def __init__(self, sig=signal.SIGTERM): + self.sig = sig + + def signals_received(self): + all_received = all_gather_item( + self._signal_received, dtype=torch.int32 + ) + return all_received + + def __enter__(self): + self._signal_received = False + self.released = False + self.original_handler = signal.getsignal(self.sig) + + def handler(signum, frame): + self._signal_received = True + + signal.signal(self.sig, handler) + + return self + + def __exit__(self, type, value, tb): + self.release() + + def release(self): + if self.released: + return False + + signal.signal(self.sig, self.original_handler) + self.released = True + return True diff --git a/megatron/training/global_vars.py b/megatron/training/global_vars.py new file mode 100644 index 0000000..afd7a23 --- /dev/null +++ b/megatron/training/global_vars.py @@ -0,0 +1,223 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Megatron global variables.""" + +import os +import sys +import torch + +from megatron.core import Timers, init_num_microbatches_calculator +from megatron.training import dist_signal_handler +from megatron.training.tokenizer import build_tokenizer + +_GLOBAL_ARGS = None +_GLOBAL_TOKENIZER = None +_GLOBAL_TENSORBOARD_WRITER = None +_GLOBAL_WANDB_WRITER = None +_GLOBAL_ONE_LOGGER = None +_GLOBAL_ADLR_AUTORESUME = None +_GLOBAL_TIMERS = None +_GLOBAL_SIGNAL_HANDLER = None + +def get_args(): + """Return arguments.""" + _ensure_var_is_initialized(_GLOBAL_ARGS, 'args') + return _GLOBAL_ARGS + + +def get_tokenizer(): + """Return tokenizer.""" + _ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer') + return _GLOBAL_TOKENIZER + + +def get_tensorboard_writer(): + """Return tensorboard writer. It can be None so no need + to check if it is initialized.""" + return _GLOBAL_TENSORBOARD_WRITER + + +def get_wandb_writer(): + """Return tensorboard writer. It can be None so no need + to check if it is initialized.""" + return _GLOBAL_WANDB_WRITER + + +def get_one_logger(): + """Return one logger. It can be None so no need + to check if it is initialized.""" + return _GLOBAL_ONE_LOGGER + +def get_adlr_autoresume(): + """ADLR autoresume object. It can be None so no need + to check if it is initialized.""" + return _GLOBAL_ADLR_AUTORESUME + + +def get_timers(): + """Return timers.""" + _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers') + return _GLOBAL_TIMERS + + +def get_signal_handler(): + _ensure_var_is_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler') + return _GLOBAL_SIGNAL_HANDLER + + +def _set_signal_handler(): + global _GLOBAL_SIGNAL_HANDLER + _ensure_var_is_not_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler') + _GLOBAL_SIGNAL_HANDLER = dist_signal_handler.DistributedSignalHandler().__enter__() + + + +def set_global_variables(args, build_tokenizer=True): + """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers.""" + + assert args is not None + + _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args') + set_args(args) + + init_num_microbatches_calculator( + args.rank, + args.rampup_batch_size, + args.global_batch_size, + args.micro_batch_size, + args.data_parallel_size, + ) + if build_tokenizer: + _ = _build_tokenizer(args) + _set_tensorboard_writer(args) + _set_wandb_writer(args) + _set_one_logger(args) + _set_adlr_autoresume(args) + _set_timers(args) + + if args.exit_signal_handler: + _set_signal_handler() + + +def set_args(args): + global _GLOBAL_ARGS + _GLOBAL_ARGS = args + + +def _build_tokenizer(args): + """Initialize tokenizer.""" + global _GLOBAL_TOKENIZER + _ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer') + _GLOBAL_TOKENIZER = build_tokenizer(args) + return _GLOBAL_TOKENIZER + + +def rebuild_tokenizer(args): + global _GLOBAL_TOKENIZER + _GLOBAL_TOKENIZER = None + return _build_tokenizer(args) + + +def _set_tensorboard_writer(args): + """Set tensorboard writer.""" + global _GLOBAL_TENSORBOARD_WRITER + _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER, + 'tensorboard writer') + + if hasattr(args, 'tensorboard_dir') and \ + args.tensorboard_dir and args.rank == (args.world_size - 1): + try: + from torch.utils.tensorboard import SummaryWriter + print('> setting tensorboard ...') + _GLOBAL_TENSORBOARD_WRITER = SummaryWriter( + log_dir=args.tensorboard_dir, + max_queue=args.tensorboard_queue_size) + except ModuleNotFoundError: + print('WARNING: TensorBoard writing requested but is not ' + 'available (are you using PyTorch 1.1.0 or later?), ' + 'no TensorBoard logs will be written.', flush=True) + + +def _set_wandb_writer(args): + global _GLOBAL_WANDB_WRITER + _ensure_var_is_not_initialized(_GLOBAL_WANDB_WRITER, + 'wandb writer') + if getattr(args, 'wandb_project', '') and args.rank == (args.world_size - 1): + if args.wandb_exp_name == '': + raise ValueError("Please specify the wandb experiment name!") + + import wandb + if args.wandb_save_dir: + save_dir = args.wandb_save_dir + else: + # Defaults to the save dir. + save_dir = os.path.join(args.save, 'wandb') + wandb_kwargs = { + 'dir': save_dir, + 'name': args.wandb_exp_name, + 'project': args.wandb_project, + 'config': vars(args)} + os.makedirs(wandb_kwargs['dir'], exist_ok=True) + wandb.init(**wandb_kwargs) + _GLOBAL_WANDB_WRITER = wandb + + +def _set_one_logger(args): + global _GLOBAL_ONE_LOGGER + _ensure_var_is_not_initialized(_GLOBAL_ONE_LOGGER, 'one logger') + + if args.enable_one_logger and args.rank == (args.world_size - 1): + if args.one_logger_async or getattr(args, 'wandb_project', ''): + one_logger_async = True + else: + one_logger_async = False + try: + from one_logger import OneLogger + config = { + 'project': args.one_logger_project, + 'name': args.one_logger_run_name, + 'async': one_logger_async, + } + one_logger = OneLogger(config=config) + _GLOBAL_ONE_LOGGER = one_logger + except BaseException: + print('WARNING: one_logger package is required to enable e2e metrics ' + 'tracking. please go to ' + 'https://confluence.nvidia.com/display/MLWFO/Package+Repositories' + ' for details to install it') + +def _set_adlr_autoresume(args): + """Initialize ADLR autoresume.""" + global _GLOBAL_ADLR_AUTORESUME + _ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume') + + if args.adlr_autoresume: + if args.rank == 0: + print('enabling autoresume ...', flush=True) + sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.')) + try: + from userlib.auto_resume import AutoResume + except BaseException: + print('ADLR autoresume is not available, exiting ...') + sys.exit() + + _GLOBAL_ADLR_AUTORESUME = AutoResume + + +def _set_timers(args): + """Initialize timers.""" + global _GLOBAL_TIMERS + _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers') + _GLOBAL_TIMERS = Timers(args.timing_log_level, args.timing_log_option) + + +def _ensure_var_is_initialized(var, name): + """Make sure the input variable is not None.""" + assert var is not None, '{} is not initialized.'.format(name) + + +def _ensure_var_is_not_initialized(var, name): + """Make sure the input variable is not None.""" + assert var is None, '{} is already initialized.'.format(name) + + diff --git a/megatron/training/initialize.py b/megatron/training/initialize.py new file mode 100644 index 0000000..ed69b63 --- /dev/null +++ b/megatron/training/initialize.py @@ -0,0 +1,423 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron initialization.""" +import logging +import random +import os +import time + +import numpy as np +import torch +from datetime import timedelta + +from megatron.legacy import fused_kernels +from megatron.training import get_adlr_autoresume +from megatron.training import get_args +from megatron.training import get_tensorboard_writer +from megatron.core import mpu, tensor_parallel +from megatron.training.arguments import parse_args, validate_args +from megatron.training.yaml_arguments import validate_yaml +from megatron.training.checkpointing import load_args_from_checkpoint +from megatron.training.global_vars import set_global_variables +from megatron.legacy.model.transformer import bias_dropout_add_fused_train +from megatron.legacy.model.fused_bias_gelu import bias_gelu + +logger = logging.getLogger(__name__) + + +def initialize_megatron( + extra_args_provider=None, + args_defaults={}, + ignore_unknown_args=False, + allow_no_cuda=False, + skip_mpu_initialization=False, +): + """Set global variables, initialize distributed, and + set autoresume and random seeds. + `allow_no_cuda` should not be set unless using megatron for cpu only + data processing. In general this arg should not be set unless you know + what you are doing. + Returns a function to finalize distributed env initialization + (optionally, only when args.lazy_mpu_init == True) + """ + if not allow_no_cuda: + # Make sure cuda is available. + assert torch.cuda.is_available(), "Megatron requires CUDA." + + # Parse arguments + args = parse_args(extra_args_provider, ignore_unknown_args) + + if args.use_checkpoint_args or args_defaults.get("use_checkpoint_args", False): + assert args.load is not None, "--use-checkpoints-args requires --load argument" + load_args_from_checkpoint(args) + + if args.yaml_cfg is not None: + args = validate_yaml(args, args_defaults) + else: + validate_args(args, args_defaults) + + + # set global args, build tokenizer, and set adlr-autoresume, + # tensorboard-writer, and timers. + set_global_variables(args) + + # set logging level + setup_logging() + + # torch.distributed initialization + def finish_mpu_init(): + args = get_args() + # Pytorch distributed. + _initialize_distributed() + + # Random seeds for reproducibility. + if args.rank == 0: + print("> setting random seeds to {} ...".format(args.seed)) + _set_random_seed(args.seed, args.data_parallel_random_init) + + if skip_mpu_initialization: + return None + + args = get_args() + if args.lazy_mpu_init: + # TODO is this still a necessary option? + args.use_cpu_initialization = True + # delayed initialization of DDP-related stuff + # We only set basic DDP globals + mpu.set_tensor_model_parallel_world_size(args.tensor_model_parallel_size) + # and return function for external DDP manager + # to call when it has DDP initialized + mpu.set_tensor_model_parallel_rank(args.rank) + return finish_mpu_init + else: + # Megatron's MPU is the master. Complete initialization right away. + finish_mpu_init() + + # Autoresume. + _init_autoresume() + + # Compile dependencies. + _compile_dependencies() + + if args.tp_comm_overlap: + _initialize_tp_communicators() + + # No continuation function + return None + + +def _compile_dependencies(): + + args = get_args() + + # ========================= + # Compile dataset C++ code. + # ========================= + # TODO: move this to ninja + if torch.distributed.get_rank() == 0: + start_time = time.time() + print("> compiling dataset index builder ...") + from megatron.core.datasets.utils import compile_helpers + + compile_helpers() + print( + ">>> done with dataset index builder. Compilation time: {:.3f} " + "seconds".format(time.time() - start_time), + flush=True, + ) + + # ================== + # Load fused kernels + # ================== + + # Custom kernel constraints check. + seq_len = args.seq_length + attn_batch_size = ( + args.num_attention_heads / args.tensor_model_parallel_size + ) * args.micro_batch_size + # Constraints on sequence length and attn_batch_size to enable warp based + # optimization and upper triangular optimization (for causal mask) + custom_kernel_constraint = ( + seq_len > 16 + and seq_len <= 16384 + and seq_len % 4 == 0 + and attn_batch_size % 4 == 0 + ) + # Print a warning. + if not ( + (args.fp16 or args.bf16) + and custom_kernel_constraint + and args.masked_softmax_fusion + ): + if args.rank == 0: + print( + "WARNING: constraints for invoking optimized" + " fused softmax kernel are not met. We default" + " back to unfused kernel invocations.", + flush=True, + ) + + # Always build on rank zero first. + if torch.distributed.get_rank() == 0: + start_time = time.time() + print("> compiling and loading fused kernels ...", flush=True) + fused_kernels.load(args) + torch.distributed.barrier() + else: + torch.distributed.barrier() + fused_kernels.load(args) + # Simple barrier to make sure all ranks have passed the + # compilation phase successfully before moving on to the + # rest of the program. We think this might ensure that + # the lock is released. + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print( + ">>> done with compiling and loading fused kernels. " + "Compilation time: {:.3f} seconds".format(time.time() - start_time), + flush=True, + ) + +def _initialize_tp_communicators(): + """ initializing the communicators with user buffers for high-performance tensor-model-parallel + communication overlap """ + + try: + import yaml + + import transformer_engine + from transformer_engine.pytorch import module as te_module + + except ImportError: + raise RuntimeError("Tensor Parallel Communication/GEMM Overlap optimization needs 'yaml' and " + "'transformer_engine' packages") + + args = get_args() + + if args.tp_comm_overlap_cfg is not None: + with open(args.tp_comm_overlap_cfg,"r") as stream: + ub_cfgs = yaml.safe_load(stream) + else: + ub_cfgs = {} + + input_shape = [(args.seq_length * args.micro_batch_size) // args.context_parallel_size , args.hidden_size] + + #We create a MPI process group, which is needed to bootstrap the pipelined + #tensor-model-parallel communication overlap + torch.distributed.new_group(backend='mpi') + + te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, + use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs,) + +def _initialize_distributed(): + """Initialize torch.distributed and core model parallel.""" + args = get_args() + + device_count = torch.cuda.device_count() + if torch.distributed.is_initialized(): + + if args.rank == 0: + print( + "torch distributed is already initialized, " + "skipping initialization ...", + flush=True, + ) + args.rank = torch.distributed.get_rank() + args.world_size = torch.distributed.get_world_size() + + else: + + if args.rank == 0: + print("> initializing torch distributed ...", flush=True) + # Manually set the device ids. + if device_count > 0: + device = args.rank % device_count + if args.local_rank is not None: + assert ( + args.local_rank == device + ), "expected local-rank to be the same as rank % device-count." + else: + args.local_rank = device + torch.cuda.set_device(device) + # Call the init process + torch.distributed.init_process_group( + backend=args.distributed_backend, + world_size=args.world_size, + rank=args.rank, + timeout=timedelta(minutes=args.distributed_timeout_minutes), + ) + + # Set the tensor model-parallel, pipeline model-parallel, and + # data-parallel communicators. + if device_count > 0: + if mpu.model_parallel_is_initialized(): + print("model parallel is already initialized") + else: + mpu.initialize_model_parallel( + args.tensor_model_parallel_size, + args.pipeline_model_parallel_size, + args.virtual_pipeline_model_parallel_size, + args.pipeline_model_parallel_split_rank, + context_parallel_size=args.context_parallel_size, + expert_model_parallel_size=args.expert_model_parallel_size, + distributed_timeout_minutes=args.distributed_timeout_minutes, + nccl_communicator_config_path=args.nccl_communicator_config_path, + order='tp-cp-ep-dp-pp' if not args.use_tp_pp_dp_mapping else 'tp-pp-dp', + ) + if args.rank == 0: + print( + f"> initialized tensor model parallel with size " + f"{mpu.get_tensor_model_parallel_world_size()}" + ) + print( + f"> initialized pipeline model parallel with size " + f"{mpu.get_pipeline_model_parallel_world_size()}" + ) + + +def _init_autoresume(): + """Set autoresume start time.""" + autoresume = get_adlr_autoresume() + if autoresume: + torch.distributed.barrier() + autoresume.init() + torch.distributed.barrier() + + +def _set_random_seed(seed_, data_parallel_random_init=False): + """Set random seed for reproducability.""" + if seed_ is not None and seed_ > 0: + # Ensure that different pipeline MP stages get different seeds. + seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank()) + # Ensure different data parallel ranks get different seeds + if data_parallel_random_init: + seed = seed + (10 * mpu.get_data_parallel_rank()) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.device_count() > 0: + tensor_parallel.model_parallel_cuda_manual_seed(seed) + else: + raise ValueError("Seed ({}) should be a positive integer.".format(seed)) + + +def write_args_to_tensorboard(): + """Write arguments to tensorboard.""" + args = get_args() + writer = get_tensorboard_writer() + if writer: + for arg in vars(args): + writer.add_text(arg, str(getattr(args, arg)), global_step=args.iteration) + + +def set_jit_fusion_options(): + """Set PyTorch JIT layer fusion options.""" + # flags required to enable jit fusion kernels + TORCH_MAJOR = int(torch.__version__.split(".")[0]) + TORCH_MINOR = int(torch.__version__.split(".")[1]) + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10): + # nvfuser + torch._C._jit_set_profiling_executor(True) + torch._C._jit_set_profiling_mode(True) + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(False) + torch._C._jit_set_texpr_fuser_enabled(False) + torch._C._jit_set_nvfuser_enabled(True) + torch._C._debug_set_autodiff_subgraph_inlining(False) + else: + # legacy pytorch fuser + torch._C._jit_set_profiling_mode(False) + torch._C._jit_set_profiling_executor(False) + torch._C._jit_override_can_fuse_on_cpu(True) + torch._C._jit_override_can_fuse_on_gpu(True) + + _warmup_jit_function() + + +def _warmup_jit_function(): + """Compilie JIT functions before the main training steps""" + args = get_args() + if args.bf16: + dtype = torch.bfloat16 + elif args.fp16: + dtype = torch.float16 + else: + dtype = torch.float32 + + # Warmup fused bias+gelu + bias = torch.rand( + args.ffn_hidden_size // args.tensor_model_parallel_size, + dtype=dtype, + device="cuda", + ) + input = torch.rand( + ( + args.seq_length, + args.micro_batch_size, + args.ffn_hidden_size // args.tensor_model_parallel_size, + ), + dtype=dtype, + device="cuda", + ) + # Warmup JIT fusions with the input grad_enable state of both forward + # prop and recomputation + for bias_grad, input_grad in zip([True, True], [False, True]): + bias.requires_grad, input.requires_grad = bias_grad, input_grad + for _ in range(5): + output = bias_gelu(bias, input) + del bias, input, output + + # Warmup fused bias+dropout+add + if args.sequence_parallel: + seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size() + else: + seq_length = args.seq_length + input = torch.rand( + (seq_length, args.micro_batch_size, args.hidden_size), + dtype=dtype, + device="cuda", + ) + residual = torch.rand( + (seq_length, args.micro_batch_size, args.hidden_size), + dtype=dtype, + device="cuda", + ) + bias = torch.rand((args.hidden_size), dtype=dtype, device="cuda").expand_as( + residual + ) + dropout_rate = 0.1 + # Warmup JIT fusions with the input grad_enable state of both forward + # prop and recomputation + for input_grad, bias_grad, residual_grad in zip( + [False, True], [True, True], [True, True] + ): + input.requires_grad = input_grad + bias.requires_grad = bias_grad + residual.requires_grad = residual_grad + for _ in range(5): + output = bias_dropout_add_fused_train(input, bias, residual, dropout_rate) + del bias, input, residual, output + torch.cuda.empty_cache() + + +def setup_logging() -> None: + """ Sets the default logging level based on cmdline args and env vars. + + Precedence: + 1. Command line argument `--logging-level` + 2. Env var `MEGATRON_LOGGING_LEVEL` + 3. Default logging level (INFO) + + Returns: None + """ + args = get_args() + logging_level = None + env_logging_level = os.getenv('MEGATRON_LOGGING_LEVEL', None) + if env_logging_level is not None: + logging_level = int(env_logging_level) + if args.logging_level is not None: + logging_level = args.logging_level + + if logging_level is not None: + logger.info(f'Setting logging level to {logging_level}') + logging.getLogger().setLevel(logging_level) diff --git a/megatron/training/log_handler.py b/megatron/training/log_handler.py new file mode 100644 index 0000000..06f5d18 --- /dev/null +++ b/megatron/training/log_handler.py @@ -0,0 +1,24 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import sys +from logging import LogRecord, StreamHandler + +BLACKLISTED_MODULES = ["torch.distributed"] + + +class CustomHandler(StreamHandler): + """ + Custom handler to filter out logging from code outside of + Megatron Core, and dump to stdout. + """ + + def __init__(self): + super().__init__(stream=sys.stdout) + + def filter(self, record: LogRecord) -> bool: + # Prevent log entries that come from the blacklisted modules + # through (e.g., PyTorch Distributed). + for blacklisted_module in BLACKLISTED_MODULES: + if record.name.startswith(blacklisted_module): + return False + return True diff --git a/megatron/training/one_logger_utils.py b/megatron/training/one_logger_utils.py new file mode 100644 index 0000000..3a45712 --- /dev/null +++ b/megatron/training/one_logger_utils.py @@ -0,0 +1,463 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import time, os + +from .global_vars import get_one_logger, get_args + + +def get_timestamp_in_ms(): + """Helper function to get timestamp in ms + + Returns: + [int]: [timestamp in ms] + """ + return round(time.time() * 1000.0) + + +def on_train_start(iteration, consumed_train_samples, train_samples, seq_length, + train_iters, save, async_save, log_throughput, + num_floating_point_operations_so_far): + """Function will be called at the start of train function to prepare and track E2E metrics. + + Args: + iteration (int): current iteration number + consumed_train_samples (int): consumed sample numbers so far + train_samples (int): total train sample number + seq_length (int): sequence length + train_iters (type): target iteration + save (str): output directory to save checkpoints to + async_save (bool): apply async checkpointing save + log_throughput (bool): log throughput or not + num_floating_point_operations_so_far (int): flops so far + """ + one_logger = get_one_logger() + + if one_logger: + with one_logger.get_context_manager(): + # Get app train loop start time + app_train_loop_start_time = get_timestamp_in_ms() + one_logger.store_set('app_train_loop_start_time', app_train_loop_start_time) + + # Set up initial values in store + one_logger.store_set('iteration_start', iteration) + one_logger.store_set('train_samples_start', consumed_train_samples) + + # Init accumulative metric values in one-logger store + one_logger.store_set('train_iterations_time_msecs_total', 0) + one_logger.store_set('tracked_train_iterations', iteration) + one_logger.store_set('validation_iterations_time_msecs_total', 0) + one_logger.store_set('tracked_validation_iterations', 0) + one_logger.store_set('save_checkpoint_count', 0) + one_logger.store_set('save_checkpoint_sync_time_total', 0.0) + + train_samples_target = train_samples + train_tokens_target = seq_length * train_samples_target + e2e_metrics = { + 'train_samples_start': consumed_train_samples, + 'train_iterations_start': iteration, + 'train_samples_target': train_samples_target, + 'train_iterations_target': train_iters, + 'train_tokens_target': train_tokens_target, + 'app_train_loop_start_time': app_train_loop_start_time, + 'is_save_checkpoint_enabled': save is not None, + 'save_checkpoint_strategy': 'async' if async_save else 'sync', + } + if log_throughput: + e2e_metrics.update({ + 'train_tflop_start': float(num_floating_point_operations_so_far) / (10**12), + }) + one_logger.log_metrics(e2e_metrics) + + +def _produce_e2e_metrics(log_throughput=False, throughput=None): + """ Generate APP metrics for E2E tracking + NOTE: always call this function after barrier call + + Args: + log_throughput (bool, optional): if log throughput or not. Defaults to False. + throughput (int, optional): throughput value to log. Defaults to None. + + Returns: + dict: all E2E metrics + """ + one_logger = get_one_logger() + + if one_logger: + with one_logger.get_context_manager(): + # Unpack and assign local vars + base_metrics = one_logger.store_get('get_e2e_base_metrics')() + (iteration, train_duration, eval_duration, eval_iterations, + total_flops, num_floating_point_operations_so_far, + consumed_train_samples, world_size, seq_length) = base_metrics.values() + + iteration_start = one_logger.store_get('iteration_start') + train_samples_start = one_logger.store_get('train_samples_start') + + train_samples = consumed_train_samples - train_samples_start + train_iterations = iteration - iteration_start + train_iterations_time_msecs_avg = (train_duration * 1000.0) / train_iterations + if eval_iterations: + validation_iterations_time_msecs_avg = (eval_duration * 1000.0) / eval_iterations + else: + validation_iterations_time_msecs_avg = None + + if not one_logger.store_has_key('first_logged_train_iterations_finish_time'): + one_logger.store_set( + 'first_logged_train_iterations_finish_time', + get_timestamp_in_ms() + ) + + train_tokens = train_samples * seq_length + + e2e_metrics = { + 'first_logged_train_iterations_finish_time': \ + one_logger.store_get('first_logged_train_iterations_finish_time'), + 'train_iterations_end': iteration, + 'train_samples_end': consumed_train_samples, + 'train_iterations': train_iterations, + 'train_samples': train_samples, + 'train_iterations_time_msecs_avg': train_iterations_time_msecs_avg, + 'validation_iterations_time_total': eval_duration, + 'validation_iterations_time_msecs_avg': validation_iterations_time_msecs_avg, + 'train_tokens': train_tokens, + 'train_iterations_time_total': train_duration, + 'last_logged_train_iterations_finish_time': get_timestamp_in_ms(), + } + + if log_throughput: + if train_duration: + train_throughput_per_gpu = total_flops / (train_duration * 10**12 * world_size) + else: + train_throughput_per_gpu = 0.0 + + train_throughput_per_gpu_max = one_logger.store_get('train_throughput_per_gpu_max') + if throughput: + train_throughput_per_gpu_max = max(throughput, train_throughput_per_gpu_max) + one_logger.store_set('train_throughput_per_gpu_max', train_throughput_per_gpu_max) + + throughput_metrics = { + 'train_tflop_end': float(num_floating_point_operations_so_far) / (10**12), + 'train_tflop': float(total_flops) / (10**12), + 'train_throughput_per_gpu': train_throughput_per_gpu, + 'train_throughput_per_gpu_max': train_throughput_per_gpu_max, + } + e2e_metrics.update(throughput_metrics) + + # Tracking minimal train/validation iteration duration metrics + # Minimal train iteration duration + current_train_iterations_time_msecs_total = train_duration * 1000.0 + current_train_iteration = iteration + prev_train_iterations_time_msecs_total = one_logger.store_get('train_iterations_time_msecs_total') + tracked_train_iterations = one_logger.store_get('tracked_train_iterations') + + if current_train_iteration > tracked_train_iterations: + train_iterations_time_msecs = ( + (current_train_iterations_time_msecs_total - prev_train_iterations_time_msecs_total) / + (current_train_iteration - tracked_train_iterations) + ) + + if not one_logger.store_has_key('train_iterations_time_msecs_min'): + train_iterations_time_msecs_min = train_iterations_time_msecs + else: + train_iterations_time_msecs_min = min( + one_logger.store_get('train_iterations_time_msecs_min'), + train_iterations_time_msecs + ) + one_logger.store_set('train_iterations_time_msecs_min', train_iterations_time_msecs_min) + one_logger.store_set('train_iterations_time_msecs_total', current_train_iterations_time_msecs_total) + one_logger.store_set('tracked_train_iterations', current_train_iteration) + + e2e_metrics.update({ + 'train_iterations_time_msecs_min': train_iterations_time_msecs_min + }) + + # Minimal validation iteration duration + current_validation_iterations_time_msecs_total = eval_duration * 1000.0 + current_validation_iteration = eval_iterations + prev_validation_iterations_time_msecs_total = \ + one_logger.store_get('validation_iterations_time_msecs_total') + tracked_validation_iterations = one_logger.store_get('tracked_validation_iterations') + + if current_validation_iteration > tracked_validation_iterations: + validation_iterations_time_msecs = ( + (current_validation_iterations_time_msecs_total - prev_validation_iterations_time_msecs_total) / + (current_validation_iteration - tracked_validation_iterations) + ) + + # Cache minimal validation iteration duration + if not one_logger.store_has_key('validation_iterations_time_msecs_min'): + validation_iterations_time_msecs_min = validation_iterations_time_msecs + else: + validation_iterations_time_msecs_min = min( + one_logger.store_get('validation_iterations_time_msecs_min'), + validation_iterations_time_msecs + ) + one_logger.store_set('validation_iterations_time_msecs_min', validation_iterations_time_msecs_min) + one_logger.store_set('validation_iterations_time_msecs_total', current_validation_iterations_time_msecs_total) + one_logger.store_set('tracked_validation_iterations', current_validation_iteration) + + e2e_metrics.update({ + 'validation_iterations_time_msecs_min': validation_iterations_time_msecs_min + }) + return e2e_metrics + + +def track_e2e_metrics(log_throughput=False, throughput=None): + """Track E2E application metrics with one-logger + + NOTE: the function should be called after barrier call. + + Args: + log_throughput (bool, optional): if log throughput or not. Defaults to False. + throughput (int, optional): throughput value to log. Defaults to None. + """ + one_logger = get_one_logger() + + if one_logger: + with one_logger.get_context_manager(): + e2e_metrics = _produce_e2e_metrics(log_throughput, throughput) + one_logger.log_metrics(e2e_metrics) + + +def on_save_checkpoint_start(async_save): + """Function to be called before save-checkpoint start to generate productive metrics to log after ckpt succeeds. + + Args: + async_save (bool): apply async checkpointing save + + Returns: + dict: productive metrics to be stored to DB after ckpt succeeds + """ + one_logger = get_one_logger() + + if one_logger: + with one_logger.get_context_manager(): + # Unpack and assign local vars + base_metrics = one_logger.store_get('get_e2e_base_metrics')() + (iteration, train_duration, eval_duration, eval_iterations, + total_flops, num_floating_point_operations_so_far, + consumed_train_samples, world_size, seq_length) = base_metrics.values() + + save_checkpoint_count = one_logger.store_get('save_checkpoint_count') + 1 + one_logger.store_set('save_checkpoint_count', save_checkpoint_count) + one_logger.log_metrics({ + 'train_iterations_save_checkpoint_end': iteration, + 'save_checkpoint_count': save_checkpoint_count, + }) + productive_metrics = { + 'train_tflop_productive_end': float(num_floating_point_operations_so_far) / (10**12), + 'train_iterations_productive_end': iteration, + 'train_samples_productive_end': consumed_train_samples, + 'train_iterations_time_total_productive': train_duration, + 'validation_iterations_time_total_productive': eval_duration, + } + if async_save: + productive_metrics.update({ + 'save_checkpoint_async_count': save_checkpoint_count, + }) + return productive_metrics + + +def on_pretrain_start(): + """ Function to be called at the start of pretrain function to track E2E meta data + """ + args = get_args() + one_logger = get_one_logger() + + if one_logger: + with one_logger.get_context_manager(): + job_name = os.environ.get('SLURM_JOB_NAME', None) + app_tag_run_name = job_name if not args.app_tag_run_name else args.app_tag_run_name + app_tag_run_version = args.app_tag_run_version + one_logger.store_set('app_tag_run_name', app_tag_run_name) + one_logger.store_set('app_tag_run_version', app_tag_run_version) + one_logger.store_set('train_throughput_per_gpu_max', 0.0) + + one_logger.log_metrics({ + 'train_iterations_warmup': 5, + 'data_parallel_size' : args.data_parallel_size, + 'context_parallel_size': args.context_parallel_size, + 'global_batch_size': args.global_batch_size, + 'micro_batch_size': args.micro_batch_size, + 'pipeline_model_parallel_size': args.pipeline_model_parallel_size, + 'tensor_model_parallel_size': args.tensor_model_parallel_size, + 'expert_model_parallel_size' : args.expert_model_parallel_size, + 'world_size': args.world_size, + 'model_seq_length': args.seq_length, + 'app_tag_run_name': app_tag_run_name, + 'app_tag_run_version': app_tag_run_version, + 'is_log_throughput_enabled': args.log_throughput, + 'app_run_type': 'training', + 'summary_data_schema_version': '1.0.0', + 'app_metrics_feature_tags': 'full', + }) + +def track_config_flags(train_iters, skip_train, do_train, do_valid, do_test, + dataloader_type, retro_project_dir, retro_cyclic_train_iters): + """Track flags about train/validation/test enablement + + Args: + train_iters (int): target train iteration number + skip_train (bool): flag to skip train iterations + do_train (bool): flags to do train + do_valid (bool): flags to do validation + do_test (bool): flags to do test + dataloader_type (str): dataloader type + retro_project_dir (str): Retro project directory + retro_cyclic_train_iters (int): iteration number for cyclic retro training + """ + one_logger = get_one_logger() + if one_logger: + with one_logger.get_context_manager(): + # Update train_iters for cyclic loader + if dataloader_type == 'cyclic' and retro_project_dir: + assert retro_cyclic_train_iters is not None + train_iters = retro_cyclic_train_iters + # Track if training is enabled. Can only be done once args.do_train is assigned after dataloader is built. + train_enabled = train_iters and (not skip_train) and do_train and train_iters > 0 + one_logger.log_metrics({ + 'is_train_iterations_enabled': train_enabled, + 'is_validation_iterations_enabled': bool(do_valid), + 'is_test_iterations_enabled': bool(do_test), + }) + +def on_save_checkpoint_success(productive_metrics, async_save): + """Function to be called after checkpointing succeeds and checkpoint is persisted for storing productive metrics + + Args: + productive_metrics (dict): productive related E2E metrics generated at the start of save checkpoint + async_save (bool): apply async checkpointing save + """ + one_logger = get_one_logger() + + if one_logger: + with one_logger.get_context_manager(): + # Accumulate train_iterations_time_total_productive for current iteration + prod_iteration = productive_metrics['train_iterations_productive_end'] + + # Log start timestamp of first iteration that was successfully checkpointed + if not one_logger.store_has_key('first_checkpoint_success'): + app_train_loop_start_time = one_logger.store_get('app_train_loop_start_time') + one_logger.store_set('first_checkpoint_success', True) + one_logger.log_metrics({ + 'first_saved_train_iterations_start_time': app_train_loop_start_time + }) + + # Handle possible out-of-order async checkpoint callbacks + need_update = True + if one_logger.store_has_key('iters_prod_max'): + need_update = prod_iteration > one_logger.store_get('iters_prod_max') + + if need_update: + # Update cache + one_logger.store_set('iters_prod_max', prod_iteration) + + if async_save: + save_checkpoint_sync_time_total_productive = \ + one_logger.store_pop(f'save_checkpoint_sync_time_total_productive:{prod_iteration}') + last_successful_save_checkpoint_sync_finish_time = \ + one_logger.store_pop(f'save_checkpoint_sync_finish_time:{prod_iteration}') + # Update productive metrics and log to DB + productive_metrics.update({ + 'save_checkpoint_sync_time_total_productive': save_checkpoint_sync_time_total_productive, + 'last_successful_save_checkpoint_sync_finish_time': last_successful_save_checkpoint_sync_finish_time + }) + one_logger.log_metrics(productive_metrics) + + +def on_save_checkpoint_end(save_checkpoint_duration, current_iteration, async_save): + """Function to be called after checkpointing ends + + Args: + save_checkpoint_duration (float): duration of current save checkpoint process + current_iteration (int): current train iteration step number + async_save (bool): apply async checkpointing save + """ + one_logger = get_one_logger() + if one_logger: + with one_logger.get_context_manager(): + save_checkpoint_sync_finish_time = get_timestamp_in_ms() + + # Track finish timestamp of the sync part of first successful save checkpoint + if (one_logger.store_has_key('first_checkpoint_success') + and not one_logger.store_has_key('first_successful_checkpoint_end')): + one_logger.store_set('first_successful_checkpoint_end', True) + one_logger.log_metrics({ + 'first_successful_save_checkpoint_sync_finish_time': save_checkpoint_sync_finish_time + }) + + save_checkpoint_sync_count = one_logger.store_get('save_checkpoint_count') + + # accumulate total sync checkpointing duration + save_checkpoint_sync_time_total = \ + one_logger.store_get('save_checkpoint_sync_time_total') + save_checkpoint_duration + one_logger.store_set('save_checkpoint_sync_time_total', save_checkpoint_sync_time_total) + + e2e_metrics = {} + if async_save: + # Cache total sync checkpointing duration + one_logger.store_set( + f'save_checkpoint_sync_time_total_productive:{current_iteration}', + save_checkpoint_sync_time_total + ) + # Cache finish time for current iteration + one_logger.store_set(f'save_checkpoint_sync_finish_time:{current_iteration}', + save_checkpoint_sync_finish_time) + else: + e2e_metrics.update({ + # Track productive total time directly for sync ckpt + 'save_checkpoint_sync_time_total_productive': save_checkpoint_sync_time_total, + 'last_successful_save_checkpoint_sync_finish_time': save_checkpoint_sync_finish_time, + }) + + # Tracking min & max value sync checkpointing duration + # For the first comparison + if not one_logger.store_has_key('save_checkpoint_sync_time_max'): + one_logger.store_set('save_checkpoint_sync_time_max', save_checkpoint_duration) + if not one_logger.store_has_key('save_checkpoint_sync_time_min'): + one_logger.store_set('save_checkpoint_sync_time_min', save_checkpoint_duration) + + save_checkpoint_sync_time_max = max( + one_logger.store_get('save_checkpoint_sync_time_max'), + save_checkpoint_duration + ) + save_checkpoint_sync_time_min = min( + one_logger.store_get('save_checkpoint_sync_time_min'), + save_checkpoint_duration + ) + one_logger.store_set('save_checkpoint_sync_time_max', save_checkpoint_sync_time_max) + one_logger.store_set('save_checkpoint_sync_time_min', save_checkpoint_sync_time_min) + e2e_metrics.update({ + 'save_checkpoint_sync_count': save_checkpoint_sync_count, + 'save_checkpoint_sync_time_max': save_checkpoint_sync_time_max, + 'save_checkpoint_sync_time_min': save_checkpoint_sync_time_min, + 'save_checkpoint_sync_time_total': save_checkpoint_sync_time_total, + }) + one_logger.log_metrics(e2e_metrics) + + +def track_app_tag(batch_size, world_size, seq_length): + """Track app_tag and app_tag ID + + Args: + batch_size (int): current batch size + world_size (int): the number of processes of current job + seq_length (int): current sequence length + """ + # Track app tag & app tag ID + one_logger = get_one_logger() + if one_logger: + with one_logger.get_context_manager(): + app_tag_run_name = one_logger.store_get('app_tag_run_name') + app_tag_run_version = one_logger.store_get('app_tag_run_version') + current_app_tag = (f'{app_tag_run_name}_{app_tag_run_version}_{batch_size}' + f'_{world_size}_{seq_length}') + one_logger.log_app_tag(current_app_tag) + + +def finish(): + """Flush E2E metrics to remote server + """ + one_logger = get_one_logger() + if one_logger: + with one_logger.get_context_manager(): + one_logger.finish() diff --git a/megatron/training/optimizer_param_scheduler.py b/megatron/training/optimizer_param_scheduler.py new file mode 100644 index 0000000..409e1db --- /dev/null +++ b/megatron/training/optimizer_param_scheduler.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Learning rate decay and weight decay incr functions.""" + +import math + +from .utils import print_rank_0 + +class OptimizerParamScheduler(object): + """Anneals learning rate and weight decay""" + + def __init__(self, optimizer, init_lr, max_lr, min_lr, + lr_warmup_steps, lr_decay_steps, lr_decay_style, + start_wd, end_wd, wd_incr_steps, wd_incr_style, + use_checkpoint_opt_param_scheduler=True, + override_opt_param_scheduler=False, + wsd_decay_steps=None, + lr_wsd_decay_style=None): + + # Class values. + self.optimizer = optimizer + + self.init_lr = init_lr + self.max_lr = float(max_lr) + self.min_lr = min_lr + assert self.min_lr >= 0.0 + assert self.max_lr >= self.min_lr + assert self.init_lr <= self.max_lr + + self.lr_warmup_steps = lr_warmup_steps + self.num_steps = 0 + self.lr_decay_steps = lr_decay_steps + self.wsd_decay_steps = wsd_decay_steps + self.lr_wsd_decay_style = lr_wsd_decay_style + assert self.lr_decay_steps > 0 + assert self.lr_warmup_steps < self.lr_decay_steps + + self.lr_decay_style = lr_decay_style + if self.lr_decay_style == "WSD": + assert self.wsd_decay_steps is not None + + self.start_wd = start_wd + self.end_wd = end_wd + assert self.start_wd >= 0.0 + assert self.end_wd >= self.start_wd + self.wd_incr_steps = wd_incr_steps + self.wd_incr_style = wd_incr_style + + self.override_opt_param_scheduler = override_opt_param_scheduler + self.use_checkpoint_opt_param_scheduler = use_checkpoint_opt_param_scheduler + if self.override_opt_param_scheduler: + assert not self.use_checkpoint_opt_param_scheduler, 'both override and '\ + 'use-checkpoint are set.' + + # Set the learning rate + self.step(0) + print_rank_0('> learning rate decay style: {}'.format(self.lr_decay_style)) + + + def get_wd(self): + """ Weight decay incr functions""" + if self.num_steps > self.wd_incr_steps: + return self.end_wd + + if self.wd_incr_style == 'constant': + assert self.start_wd == self.end_wd + return self.end_wd + + incr_ratio = float(self.num_steps) / float(self.wd_incr_steps) + assert incr_ratio >= 0.0 + assert incr_ratio <= 1.0 + delta_wd = self.end_wd - self.start_wd + + if self.wd_incr_style == 'linear': + coeff = incr_ratio + elif self.wd_incr_style == 'cosine': + coeff = 0.5 * (math.cos(math.pi * (1 - incr_ratio)) + 1.0) + else: + raise Exception('{} weight decay increment style is not supported.'.format( + self.wd_incr_style)) + + return self.start_wd + coeff * delta_wd + + + def get_lr(self, param_group): + """Learning rate decay functions from: + https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" + + max_lr = param_group.get('max_lr', self.max_lr) + min_lr = param_group.get('min_lr', self.min_lr) + + # Use linear warmup for the initial part. + if self.lr_warmup_steps > 0 and self.num_steps <= self.lr_warmup_steps: + return ( + self.init_lr + + ( + (max_lr - self.init_lr) + * float(self.num_steps) + / float(self.lr_warmup_steps) + ) + ) + + # If the learning rate is constant, just return the initial value. + if self.lr_decay_style == 'constant': + return max_lr + + # For any steps larger than `self.lr_decay_steps`, use `min_lr`. + if self.num_steps > self.lr_decay_steps: + return min_lr + + # If we are done with the warmup period, use the decay style. + if self.lr_decay_style == 'inverse-square-root': + warmup_steps = max(self.lr_warmup_steps, 1) + num_steps = max(self.num_steps, 1) + lr = max_lr * warmup_steps ** 0.5 / (num_steps ** 0.5) + return max(min_lr, lr) + + num_steps_ = self.num_steps - self.lr_warmup_steps + decay_steps_ = self.lr_decay_steps - self.lr_warmup_steps + decay_ratio = float(num_steps_) / float(decay_steps_) + assert decay_ratio >= 0.0 + assert decay_ratio <= 1.0 + delta_lr = max_lr - min_lr + + if self.lr_decay_style == 'linear': + coeff = (1.0 - decay_ratio) + elif self.lr_decay_style == 'cosine': + coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) + elif self.lr_decay_style == 'WSD': + wsd_anneal_start_ = self.lr_decay_steps - self.wsd_decay_steps + if self.num_steps <= wsd_anneal_start_: + coeff = 1.0 + else: + wsd_steps = self.num_steps - wsd_anneal_start_ + wsd_decay_ratio = float(wsd_steps) / float(self.wsd_decay_steps) + if self.lr_wsd_decay_style == "linear": + coeff = (1.0 - wsd_decay_ratio) + elif self.lr_wsd_decay_style == "cosine": + coeff = 0.5 * (math.cos(math.pi * wsd_decay_ratio) + 1.0) + elif self.lr_wsd_decay_style == "exponential": + coeff = ((2.0 * math.pow(0.5, wsd_decay_ratio)) - 1.0) + else: + raise Exception('{} decay style is not supported.'.format( + self.lr_decay_style)) + + return min_lr + coeff * delta_lr + + + def step(self, increment): + """Set lr for all parameters groups.""" + self.num_steps += increment + new_wd = self.get_wd() + for param_group in self.optimizer.param_groups: + new_lr = self.get_lr(param_group) + param_group['lr'] = new_lr * param_group.get('lr_mult', 1.0) + param_group['weight_decay'] = new_wd * param_group.get('wd_mult', 1.0) + + + def state_dict(self): + state_dict = { + 'max_lr': self.max_lr, + 'lr_warmup_steps': self.lr_warmup_steps, + 'num_steps': self.num_steps, + 'lr_decay_style': self.lr_decay_style, + 'lr_decay_steps': self.lr_decay_steps, + 'min_lr': self.min_lr, + 'start_wd': self.start_wd, + 'end_wd': self.end_wd, + 'wd_incr_style': self.wd_incr_style, + 'wd_incr_steps': self.wd_incr_steps + } + return state_dict + + + def _check_and_set(self, cls_value, sd_value, name): + """Auxiliary function for checking the values in the checkpoint and + setting them.""" + if self.override_opt_param_scheduler: + print_rank_0(' > overriding {} value to {}'.format(name, cls_value)) + return cls_value + + if not self.use_checkpoint_opt_param_scheduler: + assert cls_value == sd_value, \ + f'OptimizerParamScheduler: class input value {cls_value} and checkpoint' \ + f'value {sd_value} for {name} do not match' + print_rank_0(' > using checkpoint value {} for {}'.format(sd_value, + name)) + return sd_value + + + def load_state_dict(self, sd): + + if 'start_lr' in sd: + max_lr_ = sd['start_lr'] + else: + max_lr_ = sd['max_lr'] + self.max_lr = self._check_and_set(self.max_lr, max_lr_, + 'learning rate') + + self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'], + 'minimum learning rate') + + if 'warmup_iter' in sd: + lr_warmup_steps_ = sd['warmup_iter'] + elif 'warmup_steps' in sd: + lr_warmup_steps_ = sd['warmup_steps'] + else: + lr_warmup_steps_ = sd['lr_warmup_steps'] + self.lr_warmup_steps = self._check_and_set(self.lr_warmup_steps, + lr_warmup_steps_, + 'warmup iterations') + + if 'end_iter' in sd: + lr_decay_steps_ = sd['end_iter'] + elif 'decay_steps' in sd: + lr_decay_steps_ = sd['decay_steps'] + else: + lr_decay_steps_ = sd['lr_decay_steps'] + self.lr_decay_steps = self._check_and_set(self.lr_decay_steps, lr_decay_steps_, + 'total number of iterations') + + if 'decay_style' in sd: + lr_decay_style_ = sd['decay_style'] + else: + lr_decay_style_ = sd['lr_decay_style'] + self.lr_decay_style = self._check_and_set(self.lr_decay_style, + lr_decay_style_, + 'learning rate decay style') + + if 'num_iters' in sd: + num_steps = sd['num_iters'] + else: + num_steps = sd['num_steps'] + self.step(increment=num_steps) + + + if 'start_wd' in sd: + self.start_wd = self._check_and_set(self.start_wd, + sd['start_wd'], + "start weight decay") + self.end_wd = self._check_and_set(self.end_wd, + sd['end_wd'], + "end weight decay") + self.wd_incr_steps = self._check_and_set(self.wd_incr_steps, + sd['wd_incr_steps'], + "total number of weight decay iterations") + self.wd_incr_style = self._check_and_set(self.wd_incr_style, + sd['wd_incr_style'], + "weight decay incr style") \ No newline at end of file diff --git a/megatron/training/theoretical_memory_usage.py b/megatron/training/theoretical_memory_usage.py new file mode 100644 index 0000000..f9b7503 --- /dev/null +++ b/megatron/training/theoretical_memory_usage.py @@ -0,0 +1,187 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Computes theoretical memory footprint for model training.""" + + +import math + +NUM_BYTES_IN_MEGABYTE = 1024 * 1024 + + +def compute_weight_and_optimizer_memory(args, verbose=False): + # Attention projection size. + query_projection_size = args.kv_channels * args.num_attention_heads + query_projection_to_hidden_size_ratio = query_projection_size / args.hidden_size + # Group Query Attention. + if not args.group_query_attention: + args.num_query_groups = args.num_attention_heads + # MoE. + num_experts = 1 if args.num_experts is None else args.num_experts + gated_linear_multiplier = 3 / 2 if args.swiglu else 1 + num_parameters_in_transformer_layers = ( + 2 + * args.num_layers + * args.hidden_size + * args.hidden_size + * ( + # Attention. + ( + (1 + (args.num_query_groups / args.num_attention_heads)) + * query_projection_to_hidden_size_ratio + ) + # MLP. + + ((args.ffn_hidden_size / args.hidden_size) * num_experts * gated_linear_multiplier) + # Transformer layernorms. + + (2 / args.hidden_size) + # Final layernorm. + + (1 / (args.num_layers * args.hidden_size)) + ) + ) + embedding_size = args.hidden_size * args.padded_vocab_size + if args.untie_embeddings_and_output_weights: + num_parameters_in_embedding_layers = 2 * embedding_size + else: + num_parameters_in_embedding_layers = embedding_size + num_total_parameters = num_parameters_in_transformer_layers + num_parameters_in_embedding_layers + if verbose: + print( + f"Number of parameters in transformer layers in billions: " + f"{num_parameters_in_transformer_layers / 10**9: .2f}" + ) + print( + f"Number of parameters in embedding layers in billions: " + f"{num_parameters_in_embedding_layers / 10**9:.2f}" + ) + print(f"Total number of parameters in billions: {num_total_parameters / 10**9:.2f}") + + # Most loaded model shard has (1/pp_size transformer layers + 1 embedding layer) / tp_size. + num_parameters_on_most_loaded_model_shard = ( + (num_parameters_in_transformer_layers / args.pipeline_model_parallel_size) + embedding_size + ) / args.tensor_model_parallel_size + if args.untie_embeddings_and_output_weights and args.pipeline_model_parallel_size == 1: + num_parameters_on_most_loaded_model_shard += ( + embedding_size / args.tensor_model_parallel_size + ) + if verbose: + print( + f"Number of parameters in most loaded shard in billions: " + f"{num_parameters_on_most_loaded_model_shard / 10**9:.4f}" + ) + + if args.pipeline_model_parallel_size > 1: + # Other shards just have (1/pp_size transformer layers) / tp_size. + num_parameters_on_other_model_shards = num_parameters_in_transformer_layers / ( + args.pipeline_model_parallel_size * args.tensor_model_parallel_size + ) + if verbose: + print( + f"Number of parameters in other shards in billions: " + f"{num_parameters_on_other_model_shards / 10**9:.4f}" + ) + + num_bytes_per_parameter = ( + 18 if not args.use_distributed_optimizer else 6 + (12 / args.data_parallel_size) + ) + weight_and_optimizer_memory = ( + num_parameters_on_most_loaded_model_shard * num_bytes_per_parameter + ) + + return weight_and_optimizer_memory + + +def compute_activation_memory(args, num_microbatches, verbose=False): + # Using formula in Table 2 of https://arxiv.org/pdf/2205.05198.pdf. + # We are trying to compute the maximum activation footprint, so all calculations in this + # function are for the first pipeline stage. + + # TODO: This function needs to take into account query_projection_size potentially being + # different from hidden_size. + + # Memory footprint from transformer layer (self-attention and MLP). + activation_memory = (args.seq_length * args.micro_batch_size * args.hidden_size) * ( + 18 + (4 * (args.ffn_hidden_size / args.hidden_size)) + ) + if verbose: + print( + f"Activation memory footprint per transformer layer: " + f"{activation_memory / NUM_BYTES_IN_MEGABYTE / args.tensor_model_parallel_size:.1f} MB" + ) + activation_memory *= args.num_layers + + # Now add activation memory required for input embeddings, last LayerNorm and output layer. + + # Input to embedding (pp_size microbatches in flight). + activation_memory += ( + 8 * args.seq_length * args.micro_batch_size * args.pipeline_model_parallel_size + ) + # Dropout in embedding layer (pp_size microbatches in flight). + activation_memory += ( + args.seq_length + * args.micro_batch_size + * args.hidden_size + * args.pipeline_model_parallel_size + ) + + # Multiply by interleaved PP memory factor. + if args.virtual_pipeline_model_parallel_size is not None: + interleaved_schedule_memory_penalty = 1 + ( + (args.pipeline_model_parallel_size - 1) + / (args.pipeline_model_parallel_size * args.virtual_pipeline_model_parallel_size) + ) + in_flight_microbatches = math.ceil( + interleaved_schedule_memory_penalty * args.pipeline_model_parallel_size + ) + if verbose: + print( + f"Memory penalty from interleaved schedule: {interleaved_schedule_memory_penalty:.2f}" + ) + print(f"Number of in-flight microbatches: {in_flight_microbatches}") + activation_memory *= interleaved_schedule_memory_penalty + + # If using non-interleaved schedule, number of microbatches in pipeline can be less than pp_size, + # so discount accordingly. + if args.virtual_pipeline_model_parallel_size is None and args.pipeline_model_parallel_size > 1: + if num_microbatches is not None: + activation_memory *= min(1, num_microbatches / args.pipeline_model_parallel_size) + in_flight_microbatches = min(num_microbatches, args.pipeline_model_parallel_size) + else: + in_flight_microbatches = args.pipeline_model_parallel_size + if verbose: + print(f"Number of in-flight microbatches: {in_flight_microbatches}") + + if args.pipeline_model_parallel_size == 1: + # Inputs to output layer and CE loss. + activation_memory += ( + args.seq_length + * args.micro_batch_size + * args.hidden_size + * 4 + * (1 + (args.padded_vocab_size / args.hidden_size)) + ) + + # Activation memory is partitioned by TP size due to tensor and sequence model parallelism. + return activation_memory / args.tensor_model_parallel_size + + +def report_theoretical_memory(args, num_microbatches=None, verbose=False): + weight_and_optimizer_memory = ( + compute_weight_and_optimizer_memory(args, verbose=verbose) / NUM_BYTES_IN_MEGABYTE + ) + + # Formulae here assume sequence parallelism and selective activation recomputation. + if not args.sequence_parallel or args.recompute_granularity != 'selective': + print( + f"Theoretical memory footprints: weight and optimizer={weight_and_optimizer_memory:.2f} MB" + ) + return + + activation_memory = ( + compute_activation_memory(args, num_microbatches=num_microbatches, verbose=verbose) + / NUM_BYTES_IN_MEGABYTE + ) + total_memory = weight_and_optimizer_memory + activation_memory + + print( + f"Theoretical memory footprints: weight and optimizer={weight_and_optimizer_memory:.2f} MB, " + f"activation={activation_memory:.2f} MB, total={total_memory:.2f} MB\n" + ) diff --git a/megatron/training/tokenizer/__init__.py b/megatron/training/tokenizer/__init__.py new file mode 100644 index 0000000..59ceb33 --- /dev/null +++ b/megatron/training/tokenizer/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + + +from .tokenizer import build_tokenizer diff --git a/megatron/training/tokenizer/bert_tokenization.py b/megatron/training/tokenizer/bert_tokenization.py new file mode 100644 index 0000000..642041e --- /dev/null +++ b/megatron/training/tokenizer/bert_tokenization.py @@ -0,0 +1,431 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re +import unicodedata +import six + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. + + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-training. If this error is wrong, please " + "just comment out this check." % (actual_flag, init_checkpoint, + model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, "r", encoding = "utf-8") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + @staticmethod + def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True): + """ Converts a sequence of tokens (string) in a single string. """ + + def clean_up_tokenization(out_string): + """ Clean up a list of simple English tokenization artifacts + like spaces before punctuations and abreviated forms. + """ + out_string = ( + out_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) + return out_string + + text = ' '.join(tokens).replace(' ##', '').strip() + if clean_up_tokenization_spaces: + clean_text = clean_up_tokenization(text) + return clean_text + else: + return text + + def vocab_size(self): + return len(self.vocab) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. + """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/megatron/training/tokenizer/gpt2_tokenization.py b/megatron/training/tokenizer/gpt2_tokenization.py new file mode 100644 index 0000000..3f37e44 --- /dev/null +++ b/megatron/training/tokenizer/gpt2_tokenization.py @@ -0,0 +1,321 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for OpenAI GPT.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +import os +import regex as re +from io import open + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE + # tokenizer on python 2 right now. + def lru_cache(): + return lambda func: func + + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", +} +PRETRAINED_MERGES_ARCHIVE_MAP = { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'gpt2': 1024, +} +VOCAB_NAME = 'vocab.json' +MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = unichr if sys.version_info[0] == 2 else chr + bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \ + list(range(ord("®"), ord("ÿ") + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class GPT2Tokenizer(object): + """ + GPT-2 BPE tokenizer. Peculiarities: + - Byte-level BPE + """ + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] + special_tokens_file = None + else: + vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) + merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) + special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME) + if not os.path.exists(special_tokens_file): + special_tokens_file = None + else: + logger.info("loading special tokens file {}".format(special_tokens_file)) + # redirect to the cache, if necessary + try: + from .file_utils import cached_path + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file, merges_file)) + return None + if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + logger.info("loading merges file {}".format(merges_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + logger.info("loading merges file {} from cache at {}".format( + merges_file, resolved_merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + if special_tokens_file and 'special_tokens' not in kwargs: + special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1] + else: + special_tokens = kwargs.pop('special_tokens', []) + tokenizer = cls( + resolved_vocab_file, + resolved_merges_file, + special_tokens=special_tokens, + *inputs, + **kwargs) + return tokenizer + + def __init__(self, vocab_file, merges_file, errors='replace', + special_tokens=None, max_len=None): + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file)) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + + # Should haved added re.IGNORECASE so BPE merges can happen for + # capitalized versions of contractions + self.pat = re.compile( + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + + def __len__(self): + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """ Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. + """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict((tok, len(self.encoder) + i) + for i, tok in enumerate(special_tokens)) + self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()} + logger.info("Special tokens {}".format(self.special_tokens)) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except BaseException: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def tokenize(self, text): + """ Tokenize a string. """ + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this OpenAI GPT model ({} > {}). Running this" + " sequence through the model will result in indexing errors".format( + len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + return text + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error("Vocabulary path ({}) should be a directory".format(vocab_path)) + return + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + index = len(self.encoder) + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(special_tokens_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + + return vocab_file, merge_file, special_tokens_file diff --git a/megatron/training/tokenizer/tokenizer.py b/megatron/training/tokenizer/tokenizer.py new file mode 100644 index 0000000..fa266af --- /dev/null +++ b/megatron/training/tokenizer/tokenizer.py @@ -0,0 +1,859 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Megatron tokenizers.""" + +from abc import ABC, abstractmethod +import base64 +import json +from pathlib import Path +from typing import Dict, List, Optional + +import types + +from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer + +from .bert_tokenization import FullTokenizer as FullBertTokenizer +from .gpt2_tokenization import GPT2Tokenizer + + +def build_tokenizer(args): + """Initialize tokenizer.""" + if args.rank == 0: + print('> building {} tokenizer ...'.format(args.tokenizer_type), + flush=True) + + # Select and instantiate the tokenizer. + if args.tokenizer_type == 'BertWordPieceLowerCase': + assert args.vocab_file is not None + tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, + lower_case=True, + vocab_extra_ids=args.vocab_extra_ids) + elif args.tokenizer_type == 'BertWordPieceCase': + assert args.vocab_file is not None + tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, + lower_case=False, + vocab_extra_ids=args.vocab_extra_ids) + elif args.tokenizer_type == 'GPT2BPETokenizer': + assert args.vocab_file is not None + assert args.merge_file is not None + tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) + elif args.tokenizer_type == 'SentencePieceTokenizer': + assert args.tokenizer_model is not None + tokenizer = _SentencePieceTokenizer(args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids) + elif args.tokenizer_type == 'GPTSentencePieceTokenizer': + assert args.tokenizer_model is not None + tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'HuggingFaceTokenizer': + tokenizer = _HuggingFaceTokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'Llama2Tokenizer': + assert args.tokenizer_model is not None + tokenizer = _Llama2Tokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'Llama3Tokenizer': + assert args.tokenizer_model is not None + tokenizer = create_llama3_tokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'MistralTokenizer': + assert args.tokenizer_model is not None + tokenizer = create_mistral_tokenizer(args.tokenizer_model) + tokenizer.vocab_size = 32768 + tokenizer.eos_id = tokenizer.instruct_tokenizer.tokenizer.eos_id + elif args.tokenizer_type == 'TikTokenizer': + assert args.tokenizer_model is not None + assert args.tiktoken_pattern is not None + assert args.tiktoken_pattern in {"v1", "v2"} + pattern = PATTERN_TIKTOKEN if args.tiktoken_pattern == "v1" else PATTERN_TIKTOKEN_V2 + tokenizer = CustomTikTokenizer( + path=args.tokenizer_model, + pattern=pattern, + vocab_size=args.vocab_size, + num_special_tokens=args.tiktoken_num_special_tokens, + special_tokens=args.tiktoken_special_tokens, + ) + elif args.tokenizer_type == 'NullTokenizer': + assert args.vocab_size is not None + tokenizer = _NullTokenizer(args.vocab_size) + else: + raise NotImplementedError('{} tokenizer is not ' + 'implemented.'.format(args.tokenizer_type)) + + # Add vocab size (if not already set from a checkpoint). + if getattr(args, "padded_vocab_size", None) is None: + args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, + args) + + return tokenizer + + +def _vocab_size_with_padding(orig_vocab_size, args): + """Pad vocab size so it is divisible by model parallel size and + still having GPU friendly size.""" + + after = orig_vocab_size + multiple = args.make_vocab_size_divisible_by * \ + args.tensor_model_parallel_size + while (after % multiple) != 0: + after += 1 + if args.rank == 0: + print(' > padded vocab (size: {}) with {} dummy tokens ' + '(new size: {})'.format( + orig_vocab_size, after - orig_vocab_size, after), flush=True) + return after + + +class _HuggingFaceTokenizer(MegatronTokenizer): + def __init__(self, pretrained_model_name_or_path): + super().__init__(pretrained_model_name_or_path) + try: + import transformers + except ImportError: + raise EnvironmentError(f"The transformers library must be installed to use huggingface_tokenizer_provider") + + # TODO(bnorick): download tokenizer once to lustre and use force offline to make sure all tasks read it from there + self._tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) + self._vocab = self._tokenizer.get_vocab() + self._inv_vocab = {token_id: token for token, token_id in self._vocab.items()} + + @property + def vocab_size(self): + return len(self._tokenizer) + + @property + def vocab(self): + """Dictionary from vocab text token to id token.""" + return self._vocab + + @property + def inv_vocab(self): + """Dictionary from vocab id token to text token.""" + return self._inv_vocab + + @property + def decoder(self): + return self._inv_vocab + + def tokenize(self, text): + return self._tokenizer(text).input_ids + + def detokenize(self, token_ids): + return self._tokenizer.decode(token_ids) + + @property + def eod(self): + return self._tokenizer.eos_token_id + + +class _BertWordPieceTokenizer(MegatronTokenizer): + """Original BERT wordpiece tokenizer.""" + + def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0): + super().__init__(vocab_file, lower_case=lower_case, vocab_extra_ids=vocab_extra_ids) + self.tokenizer = FullBertTokenizer(vocab_file, do_lower_case=lower_case) + self.cls_id = self.tokenizer.vocab['[CLS]'] + self.sep_id = self.tokenizer.vocab['[SEP]'] + self.pad_id = self.tokenizer.vocab['[PAD]'] + self.mask_id = self.tokenizer.vocab['[MASK]'] + self._additional_special_tokens = [] + + # (dsachan) Add BOS and EOS tokens + SPECIAL_TOKENS = {'eos_token': '[EOS]', + 'bos_token': '[BOS]'} + self._bos_token = '[BOS]' + self.add_token(self._bos_token) + self._bos_token_id = self.vocab.get(self._bos_token) + + self._eos_token = '[EOS]' + self.add_token(self._eos_token) + self._eos_token_id = self.vocab.get(self._eos_token) + + # (dsachan) Add additional special tokens + # These can be used as sentinel tokens in T5 model inputs + additional_special_tokens = [] + additional_special_tokens.extend( + ["".format(i) for i in range(vocab_extra_ids)]) + self.add_additional_special_tokens(additional_special_tokens) + + def add_token(self, token): + if token not in self.vocab: + self.inv_vocab[self.vocab_size] = token + # self.vocab_size comes from len(vocab) + # and it will increase as we add elements + self.vocab[token] = self.vocab_size + + def add_additional_special_tokens(self, tokens_list): + setattr(self, "additional_special_tokens", tokens_list) + for value in tokens_list: + self.add_token(value) + + @property + def vocab_size(self): + return self.tokenizer.vocab_size() + + @property + def vocab(self): + return self.tokenizer.vocab + + @property + def inv_vocab(self): + return self.tokenizer.inv_vocab + + def tokenize(self, text): + text_tokens = self.tokenizer.tokenize(text) + return self.tokenizer.convert_tokens_to_ids(text_tokens) + + def decode(self, ids): + tokens = self.tokenizer.convert_ids_to_tokens(ids) + return self.tokenizer.convert_tokens_to_string(tokens) + + def decode_token_ids(self, token_ids): + tokens = self.tokenizer.convert_ids_to_tokens(token_ids) + exclude_list = ['[PAD]', '[CLS]'] + non_pads = [t for t in tokens if t not in exclude_list] + + result = "" + for s in non_pads: + if s.startswith("##"): + result += s[2:] + else: + result += " " + s + + return result + + @property + def cls(self): + return self.cls_id + + @property + def sep(self): + return self.sep_id + + @property + def pad(self): + return self.pad_id + + @property + def mask(self): + return self.mask_id + + @property + def bos(self): + """ Id of the beginning of sentence token in the vocabulary.""" + return self._bos_token_id + + @property + def eos(self): + """ Id of the end of sentence token in the vocabulary.""" + return self._eos_token_id + + @property + def bos_token(self): + """ Beginning of sentence token id """ + return self._bos_token + + @property + def eos_token(self): + """ End of sentence token id """ + return self._eos_token + + @property + def additional_special_tokens(self): + """ All the additional special tokens you may want to use (list of strings).""" + return self._additional_special_tokens + + @property + def additional_special_tokens_ids(self): + """ Ids of all the additional special tokens in the vocabulary (list of integers).""" + return [self.vocab.get(token) for token in self._additional_special_tokens] + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + +class _GPT2BPETokenizer(MegatronTokenizer): + """Original GPT2 BPE tokenizer.""" + + def __init__(self, vocab_file, merge_file): + super().__init__(vocab_file, merge_file) + + self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', + special_tokens=[], max_len=None) + self.eod_id = self.tokenizer.encoder['<|endoftext|>'] + + @property + def vocab_size(self): + return len(self.tokenizer.encoder) + + @property + def vocab(self): + return self.tokenizer.encoder + + @property + def inv_vocab(self): + return self.tokenizer.decoder + + def tokenize(self, text): + return self.tokenizer.encode(text) + + def detokenize(self, token_ids): + return self.tokenizer.decode(token_ids) + + @property + def eod(self): + return self.eod_id + + +class _SentencePieceTokenizer(MegatronTokenizer): + """SentencePieceTokenizer-Megatron wrapper""" + + def __init__(self, model_file, vocab_extra_ids=0): + super().__init__(model_file, vocab_extra_ids=vocab_extra_ids) + + import sentencepiece + self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file) + self._initalize(vocab_extra_ids) + + def _populate_vocab(self): + self._vocab = {} + self._inv_vocab = {} + + for i in range(len(self.tokenizer)): + t = self.tokenizer.id_to_piece(i) + self._inv_vocab[i] = t + self._vocab[t] = i + + def _initalize(self, vocab_extra_ids): + self._populate_vocab() + self._special_tokens = {} + self._inv_special_tokens = {} + + self._t5_tokens = [] + + def _add_special_token(t): + if t not in self._vocab: + next_id = len(self._vocab) + self._vocab[t] = next_id + self._inv_vocab[next_id] = t + self._special_tokens[t] = self._vocab[t] + self._inv_special_tokens[self._vocab[t]] = t + + _add_special_token('') + self._cls_id = self._vocab[''] + _add_special_token('') + self._sep_id = self._vocab[''] + _add_special_token('') + self._eod_id = self._vocab[''] + _add_special_token('') + self._mask_id = self._vocab[''] + + pad_id = self.tokenizer.pad_id() + try: + pad_token = self.tokenizer.id_to_piece(pad_id) + except IndexError: + pad_token = '' + _add_special_token(pad_token) + self._pad_id = self._vocab[pad_token] + + bos_id = self.tokenizer.bos_id() + try: + bos_token = self.tokenizer.id_to_piece(bos_id) + except IndexError: + bos_token = '' + _add_special_token(bos_token) + self._bos_id = self._vocab[bos_token] + + eos_id = self.tokenizer.eos_id() + try: + eos_token = self.tokenizer.id_to_piece(eos_id) + except IndexError: + eos_token = '' + _add_special_token(eos_token) + self._eos_id = self._vocab[eos_token] + + for i in range(vocab_extra_ids): + t = "".format(i) + _add_special_token(t) + self._t5_tokens += [t] + + @property + def vocab_size(self): + return len(self._vocab) + + @property + def vocab(self): + return self._vocab + + @property + def inv_vocab(self): + return self._inv_vocab + + @property + def decoder(self): + return self._inv_vocab + + @property + def encoder(self): + return self._vocab + + # From: + # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L89 + def tokenize(self, text): + ids = [] + idx = 0 + + while 1: + indices = {} + for token in self._special_tokens: + try: + indices[token] = text[idx:].index(token) + except ValueError: + continue + if len(indices) == 0: + break + + next_token = min(indices, key=indices.get) + next_idx = idx + indices[next_token] + + ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx])) + ids.append(self._special_tokens[next_token]) + idx = next_idx + len(next_token) + + ids.extend(self.tokenizer.encode_as_ids(text[idx:])) + return ids + + # From: + # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L125 + def detokenize(self, ids): + text = "" + last_i = 0 + + for i, id in enumerate(ids): + if id in self._inv_special_tokens: + text += self.tokenizer.decode_ids(ids[last_i:i]) + " " + text += self._inv_special_tokens[id] + " " + last_i = i + 1 + + text += self.tokenizer.decode_ids(ids[last_i:]) + return text + + @property + def cls(self): + return self._cls_id + + @property + def sep(self): + return self._sep_id + + @property + def pad(self): + return self._pad_id + + @property + def bos(self): + return self._bos_id + + @property + def eod(self): + return self._eod_id + + @property + def eos(self): + return self._eos_id + + @property + def mask(self): + return self._mask_id + + @property + def additional_special_tokens_ids(self): + return [self.vocab[k] for k in self._t5_tokens] + + +class _GPTSentencePieceTokenizer(_SentencePieceTokenizer): + """SentencePieceTokenizer-Megatron wrapper""" + + def __init__(self, model_file,): + super().__init__(model_file, vocab_extra_ids=0) + + def _initalize(self, vocab_extra_ids): + self._populate_vocab() + + self._pad_id = self.tokenizer.pad_id() + self._bos_id = self.tokenizer.bos_id() + self._eos_id = self.tokenizer.eos_id() + + def tokenize(self, text): + return self.tokenizer.encode_as_ids(text) + + def detokenize(self, ids): + return self.tokenizer.decode_ids(ids) + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self._eos_id + + @property + def additional_special_tokens_ids(self): + return None + + +class _Llama2Tokenizer(_SentencePieceTokenizer): + """SentencePieceTokenizer-Megatron wrapper""" + + def __init__(self, model_file,): + super().__init__(model_file, vocab_extra_ids=0) + + def _initalize(self, vocab_extra_ids): + self._populate_vocab() + + # BOS / EOS token IDs + self.n_words: int = self.tokenizer.vocab_size() + self.bos_id: int = self.tokenizer.bos_id() + self.eos_id: int = self.tokenizer.eos_id() + self.pad_id: int = self.tokenizer.pad_id() + assert self.tokenizer.vocab_size() == self.tokenizer.get_piece_size() + + def tokenize(self, s: str, bos=True, eos=False): + '''Default args for text completion, not chat/dialog.''' + assert type(s) is str + t = self.tokenizer.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def detokenize(self, ids): + return self.tokenizer.decode_ids(ids) + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self.eos_id + + @property + def additional_special_tokens_ids(self): + return None + + +def create_llama3_tokenizer(*args, **kwargs): + + try: + from llama.tokenizer import Tokenizer as Llama3Tokenizer + except ImportError: + raise ImportError("Module 'llama' is required but not installed.") + + class _Llama3Tokenizer(Llama3Tokenizer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def instruct_tokenize(self, s: str, bos=True, eos=False): + '''Default args for text completion, not chat/dialog.''' + + assert type(s) is str + + t = self.encode(s, bos=bos, eos=eos, allowed_special='all') + return t + + def tokenize(self, s: str, bos=True, eos=False): + '''Default args for text completion, not chat/dialog.''' + + assert type(s) is str + + t = self.encode(s, bos=bos, eos=eos, allowed_special='all') + return t + + def detokenize(self, ids): + return self.decode(ids) + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self.eos_id + + @property + def additional_special_tokens_ids(self): + return None + + @property + def vocab_size(self): + return self.model.n_vocab + + return _Llama3Tokenizer(*args, **kwargs) + + +def create_mistral_tokenizer(*args, **kwargs): + try: + from mistral_common.tokens.tokenizers.mistral import MistralTokenizer + from mistral_common.tokens.instruct.request import InstructRequest + from mistral_common.protocol.instruct.messages import UserMessage + except ImportError: + raise ImportError("Module 'mistral-common' is required but not installed.") + + class _MistralTokenizer(MistralTokenizer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + tokenizer = _MistralTokenizer.from_file(*args, **kwargs) + + def tokenize(self, s: str, bos=True, eos=False): + '''Default args for text completion, not chat/dialog.''' + + assert type(s) is str + + t = self.instruct_tokenizer.tokenizer.encode(s, bos=bos, eos=eos) + + return t + + def instruct_tokenize(self, s: str): + '''Default args for text completion, not chat/dialog.''' + + assert type(s) is str + + t = self.instruct_tokenizer.encode_instruct( + InstructRequest( + messages=[ + UserMessage(content=s), + ], + ) + ) + + return t.tokens[1:] # strip of box + + def detokenize(self, ids): + return self.instruct_tokenizer.tokenizer.decode(ids) + + tokenizer.tokenize = types.MethodType(tokenize, tokenizer) + tokenizer.detokenize = types.MethodType(detokenize, tokenizer) + tokenizer.instruct_tokenize = types.MethodType(instruct_tokenize, tokenizer) + + return tokenizer + + +def reload_mergeable_ranks( + path: str, + max_vocab: Optional[int] = None, +) -> Dict[bytes, int]: + """ + Reload our tokenizer JSON file and convert it to Tiktoken format. + """ + from ..utils import print_rank_0 # To prevent circular import. + + assert path.endswith(".json") + + # reload vocab + with open(path, "r") as f: + vocab = json.load(f) + assert isinstance(vocab, list) + print_rank_0(f"Vocab size: {len(vocab)}") + if max_vocab is not None: + vocab = vocab[:max_vocab] + print_rank_0(f"Cutting vocab to first {len(vocab)} tokens.") + + # build ranks + ranks: Dict[bytes, int] = {} + for i, x in enumerate(vocab): + assert x.keys() == {"rank", "token_bytes", "token_str"} + assert x["rank"] == i + merge = base64.b64decode(x["token_bytes"]) + assert i >= 256 or merge == bytes([i]) + ranks[merge] = x["rank"] + + # sanity check + assert len(ranks) == len(vocab) + assert set(ranks.values()) == set(range(len(ranks))) + + return ranks + + +PATTERN_TIKTOKEN = r"[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+" +PATTERN_TIKTOKEN_V2 = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + +class CustomTikTokenizer(MegatronTokenizer): + def __init__( + self, + path: str, + pattern: str, + vocab_size: Optional[int], + num_special_tokens: int, + special_tokens: Optional[List[str]], + ): + super().__init__( + path, + pattern=pattern, + vocab_size=vocab_size, + num_special_tokens=num_special_tokens, + special_tokens=special_tokens + ) + import tiktoken + from .. import print_rank_0 # To prevent circular import. + + if vocab_size is None: + vocab_size = 2**17 # Fallback vocab size is 131072. + self._vocab_size = vocab_size + + SPECIAL_TOKENS = ["", "", ""] + if special_tokens is None: + special_tokens = SPECIAL_TOKENS.copy() + assert len(special_tokens) == len(set(special_tokens)), f"Special tokens should be unique: {special_tokens}" + assert len(special_tokens) <= num_special_tokens < self._vocab_size + assert set(SPECIAL_TOKENS) <= set(special_tokens), f"Custom special tokens should include {SPECIAL_TOKENS}" + + special_filler = ["".format(id=i) for i in range(len(special_tokens), num_special_tokens)] + if special_filler: + print_rank_0(f"Adding special tokens {special_filler[0]}, ..., {special_filler[-1]}") + special_tokens = special_tokens + special_filler + assert len(set(special_tokens)) == len(special_tokens) == num_special_tokens, special_tokens + inner_vocab_size = self._vocab_size - num_special_tokens + + token_to_id_without_special_tokens = reload_mergeable_ranks(path, max_vocab=inner_vocab_size) + # Create space for special tokens. + token_to_id_without_special_tokens = {t: i + num_special_tokens for t, i in token_to_id_without_special_tokens.items()} + + special_tokens = {t: i for i, t in enumerate(special_tokens)} + self._unk_id = special_tokens[""] + self._bos_id = special_tokens[""] + self._eos_id = special_tokens[""] + + # Create tiktoken model. + self._model = tiktoken.Encoding( + name=Path(path).parent.name, + pat_str=pattern, + mergeable_ranks=token_to_id_without_special_tokens, + special_tokens=special_tokens, + ) + + # Create final _id_to_token and _token_to_id data structures with special tokens inserted + # into appropriate locations. + assert set(token_to_id_without_special_tokens.keys()).isdisjoint(set(special_tokens.keys())) + self._token_to_id = token_to_id_without_special_tokens.copy() + self._token_to_id.update(special_tokens) + self._id_to_token = {v: k for k, v in self._token_to_id.items()} + assert set(range(self._vocab_size)) == set(self._id_to_token.keys()) + + + @property + def bos(self) -> int: + return self._bos_id + + @property + def eos(self) -> int: + return self._eos_id + + @property + def unk(self) -> int: + return self._unk_id + + @property + def eod(self) -> int: + return self._eos_id + + @property + def vocab(self): + return self._token_to_id + + @property + def inv_vocab(self): + return self._id_to_token + + def tokenize(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + tokens = self._model.encode_ordinary(s) + if bos: + tokens = [self.bos, *tokens] + if eos: + tokens = [*tokens, self.eos] + + return tokens + + def detokenize(self, tokens: List[int]) -> str: + return self._model.decode(tokens) + + @property + def vocab_size(self) -> int: + return self._vocab_size + + @property + def encoder(self): + return self._token_to_id + + @property + def decoder(self): + return self._id_to_token + + +class _NullTokenizer(MegatronTokenizer): + def __init__(self, vocab_size): + super().__init__(None, vocab_size=vocab_size) + self._vocab_size_without_eod = int(vocab_size) + self._eod_id = self._vocab_size_without_eod + + def tokenize(self, text): + return [int(x) for x in text.split(' ')] + + def detokenize(self, ids): + text = [str(x) for x in ids] + return ' '.join(text) + + @property + def vocab_size(self): + return self._vocab_size_without_eod + 1 + + @property + def vocab(self): + raise NotImplementedError + + @property + def inv_vocab(self): + raise NotImplementedError + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self._eod_id + + @property + def additional_special_tokens_ids(self): + return None diff --git a/megatron/training/training.py b/megatron/training/training.py new file mode 100644 index 0000000..bc156e4 --- /dev/null +++ b/megatron/training/training.py @@ -0,0 +1,1549 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain utilities.""" + +import dataclasses +from datetime import datetime +import gc +import logging +import math +import os +import sys +from .log_handler import CustomHandler +# Make default logging level INFO, but filter out all log messages not from MCore. +logging.basicConfig(handlers=[CustomHandler()], level=logging.INFO) +from .theoretical_memory_usage import report_theoretical_memory +import time +# The earliest we can measure the start time. +_TRAIN_START_TIME = time.time() +import torch + +from megatron.core import mpu, tensor_parallel +from megatron.core.utils import check_param_hashes_across_dp_replicas, get_model_config, StragglerDetector +from megatron.training.checkpointing import load_checkpoint +from megatron.training.checkpointing import save_checkpoint +from megatron.legacy.model import Float16Module +from megatron.core.distributed import DistributedDataParallelConfig +from megatron.core.distributed import DistributedDataParallel as DDP +from megatron.core.distributed import finalize_model_grads +from megatron.core.enums import ModelType +from megatron.core.optimizer import get_megatron_optimizer, OptimizerConfig +from megatron.training.initialize import initialize_megatron +from megatron.training.initialize import write_args_to_tensorboard +from megatron.training.initialize import set_jit_fusion_options +from megatron.training.optimizer_param_scheduler import OptimizerParamScheduler +from megatron.legacy.data.data_samplers import build_pretraining_data_loader +from megatron.core.transformer.moe.moe_utils import track_moe_metrics +from megatron.core.pipeline_parallel import get_forward_backward_func +from megatron.core.num_microbatches_calculator import ( + get_current_global_batch_size, + get_num_microbatches, + update_num_microbatches) + +from .async_utils import maybe_finalize_async_save +from .utils import ( + calc_params_l2_norm, + check_adlr_autoresume_termination, + is_last_rank, + print_rank_0, + print_rank_last, + report_memory, + unwrap_model, + append_to_progress_log, +) +from .global_vars import ( + get_args, + get_signal_handler, + get_timers, + get_tensorboard_writer, + get_wandb_writer, + get_one_logger) +from . import one_logger_utils + + +stimer = StragglerDetector() + +def print_datetime(string): + """Note that this call will sync across all ranks.""" + torch.distributed.barrier() + time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + print_rank_0('[' + string + '] datetime: {} '.format(time_str)) + + +def num_floating_point_operations(args, batch_size): + # Attention projection size. + query_projection_size = args.kv_channels * args.num_attention_heads + query_projection_to_hidden_size_ratio = query_projection_size / args.hidden_size + # Group Query Attention. + if not args.group_query_attention: + args.num_query_groups = args.num_attention_heads + # MoE. + num_experts_routed_to = 1 if args.num_experts is None else args.moe_router_topk + gated_linear_multiplier = 3 / 2 if args.swiglu else 1 + return ( + 12 + * batch_size + * args.seq_length + * args.num_layers + * args.hidden_size + * args.hidden_size + * ( + # Attention. + ( + ( + 1 + + (args.num_query_groups / args.num_attention_heads) + + (args.seq_length / args.hidden_size) + ) * query_projection_to_hidden_size_ratio + ) + # MLP. + + ( + (args.ffn_hidden_size / args.hidden_size) + * num_experts_routed_to + * gated_linear_multiplier + ) + # Logit. + + (args.padded_vocab_size / (2 * args.num_layers * args.hidden_size)) + ) + ) + + +def get_start_time_from_progress_log(): + """ + Gets start time of earliest job with same world size. Also returns the number + of floating-point operations completed in last saved checkpoint. + """ + args = get_args() + assert args.save is not None + progress_log_filename = os.path.join(args.save, "progress.txt") + + # start_time is time when job with same world size started. + # start_num_floating_point_operations is the number of floating-point operations + # completed when this job started. + # latest_num_floating_point_operations is the number of floating-point operations + # completed in most recent saved checkpoint. + start_time = None + start_num_floating_point_operations = None + latest_num_floating_point_operations = 0 + + def _get_field(string, type): + return type(string.split(': ')[1]) + + with open(progress_log_filename, 'r') as f: + for line in f: + line = line.strip() + line_tokens = line.split('\t') + world_size_in_line = _get_field(line_tokens[2], int) + if line_tokens[3] == "Saved checkpoint": + latest_num_floating_point_operations = \ + _get_field(line_tokens[7], float) + if world_size_in_line != args.world_size: + # Re-start search if we see a different world size. + start_time = None + start_num_floating_point_operations = None + continue + if line_tokens[3] == "Starting job": + if start_time is None: + start_time = line_tokens[0] + start_num_floating_point_operations = \ + latest_num_floating_point_operations + assert start_time is not None and start_num_floating_point_operations is not None, \ + "Should have seen at least one 'Starting job' entry with same world_size" + return datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S'), \ + start_num_floating_point_operations + + +def pretrain(train_valid_test_dataset_provider, + model_provider, + model_type, + forward_step_func, + process_non_loss_data_func=None, + extra_args_provider=None, + args_defaults={}): + """Main training program. + + This function will run the followings in the order provided: + 1) initialize Megatron. + 2) setup model, optimizer and lr schedule using the model_provider. + 3) call train_val_test_data_provider to get train/val/test datasets. + 4) train the modle using the forward_step_func. + + Args: + train_valid_test_dataset_provider: a function that takes the size of + train/valid/test dataset and returns `train, valid, test` datasets. + model_provider: a function that returns a vanilla version of the + model. By vanilla we mean a simple model on cpu with no fp16 or ddp. + model_type: an enum that specifies the type of model being trained. + forward_step_func: a function that takes a `data iterator` and `model`, + and returns a `loss` scalar with a dictionary with key:values being + the info we would like to monitor during training, for example + `lm-loss: value`. We also require that this function add + `batch generator` to the timers class. + process_non_loss_data_func: a function to post process outputs of the + network. It can be used for dumping output tensors (e.g images) to + tensorboard. It takes `collected data`(list of tensors), + `current iteration index` and `tensorboard writer` as arguments. + extra_args_provider: a function that takes a parser and adds arguments + to it. It is used for programs to add their own arguments. + args_defaults: a dictionary from argument-name to argument-value. It + to set already parse arguments. + """ + + # Initalize and get arguments, timers, and Tensorboard writer. + initialize_megatron(extra_args_provider=extra_args_provider, + args_defaults=args_defaults) + + args = get_args() + timers = get_timers() + + if args.log_progress: + append_to_progress_log("Starting job") + + # Set pytorch JIT layer fusion options and warmup JIT functions. + set_jit_fusion_options() + + # Adjust the startup time so it reflects the largest value. + # This will be closer to what scheduler will see (outside of + # image ... launches. + global _TRAIN_START_TIME + start_time_tensor = torch.tensor([_TRAIN_START_TIME], + dtype=torch.double, + device='cuda') + torch.distributed.all_reduce(start_time_tensor, + op=torch.distributed.ReduceOp.MIN) + _TRAIN_START_TIME = start_time_tensor.item() + + app_metrics = {} + app_metrics['app_start_time'] = round(_TRAIN_START_TIME * 1000.0) + app_metrics['app_model_init_start_time'] = round(_TRAIN_START_TIME * 1000.0) + + print_rank_0('time to initialize megatron (seconds): {:.3f}'.format( + time.time() - _TRAIN_START_TIME)) + print_datetime('after megatron is initialized') + app_metrics['app_model_init_finish_time'] = one_logger_utils.get_timestamp_in_ms() + + args = get_args() + timers = get_timers() + + # Track E2E metrics on pretrain start + one_logger_utils.on_pretrain_start() + + # Model, optimizer, and learning rate. + timers('model-and-optimizer-setup', log_level=0).start(barrier=True) + app_metrics['app_build_optimizer_start_time'] = one_logger_utils.get_timestamp_in_ms() + model, optimizer, opt_param_scheduler = setup_model_and_optimizer( + model_provider, model_type) + + timers('model-and-optimizer-setup').stop() + print_datetime('after model, optimizer, and learning rate ' + 'scheduler are built') + app_metrics['app_build_optimizer_finish_time'] = one_logger_utils.get_timestamp_in_ms() + config = get_model_config(model[0]) + + # Data stuff. + app_metrics['app_build_dataiters_start_time'] = one_logger_utils.get_timestamp_in_ms() + timers('train/valid/test-data-iterators-setup', log_level=0).start( + barrier=True) + if args.virtual_pipeline_model_parallel_size is not None: + train_data_iterator = [] + valid_data_iterator = [] + test_data_iterator = [] + for i in range(len(model)): + mpu.set_virtual_pipeline_model_parallel_rank(i) + iterators = build_train_valid_test_data_iterators( + train_valid_test_dataset_provider) + train_data_iterator.append(iterators[0]) + valid_data_iterator.append(iterators[1]) + test_data_iterator.append(iterators[2]) + else: + train_data_iterator, valid_data_iterator, test_data_iterator \ + = build_train_valid_test_data_iterators( + train_valid_test_dataset_provider) + timers('train/valid/test-data-iterators-setup').stop() + print_datetime('after dataloaders are built') + app_metrics['app_build_dataiters_finish_time'] = one_logger_utils.get_timestamp_in_ms() + + # Track if training is enabled. Can only be done once args.do_train is assigned after dataloader is built. + one_logger_utils.track_config_flags(args.train_iters, args.skip_train, args.do_train, + args.do_valid, args.do_test, args.dataloader_type, + args.retro_project_dir, args.retro_cyclic_train_iters) + + # Context used for persisting some state between checkpoint saves. + checkpointing_context = {} + + # Print setup timing. + print_rank_0('done with setup ...') + timers.log(['model-and-optimizer-setup', + 'train/valid/test-data-iterators-setup'], barrier=True) + + one_logger = get_one_logger() + one_logger and one_logger.log_metrics(app_metrics) + + if not args.skip_train: + print_rank_0('training ...') + + if args.dataloader_type == 'cyclic' and args.retro_project_dir: + assert args.retro_cyclic_train_iters is not None + args.train_iters = args.retro_cyclic_train_iters + print_rank_0("retro cyclic train iters : %d" % args.train_iters) + + iteration = 0 + if args.do_train and args.train_iters > 0: + iteration, num_floating_point_operations_so_far = train( + forward_step_func, + model, optimizer, opt_param_scheduler, + train_data_iterator, valid_data_iterator, + process_non_loss_data_func, config, checkpointing_context) + + print_datetime('after training is done') + + if args.save and iteration != 0 and iteration % args.save_interval != 0: + save_checkpoint(iteration, model, optimizer, opt_param_scheduler, + num_floating_point_operations_so_far, checkpointing_context) + + one_logger and one_logger.log_metrics({ + 'app_train_loop_finish_time': one_logger_utils.get_timestamp_in_ms() + }) + + else: + print_rank_0('skipping training (--skip-train is on) ...') + + iteration = args.iteration + + if args.do_valid: + prefix = f'iteration {iteration} on validation set' + evaluate_and_print_results(prefix, forward_step_func, + valid_data_iterator, model, + iteration, process_non_loss_data_func, config, + verbose=True, write_to_tensorboard=not args.skip_train) + + if args.do_test: + prefix = f'iteration {iteration} on test set' + evaluate_and_print_results(prefix, forward_step_func, + test_data_iterator, model, + iteration, process_non_loss_data_func, config, + verbose=True, write_to_tensorboard=not args.skip_train) + + maybe_finalize_async_save(blocking=True) + + one_logger and one_logger.log_metrics({ + 'app_finish_time': one_logger_utils.get_timestamp_in_ms() + }) + one_logger_utils.finish() + + +def update_train_iters(args): + + # For iteration-based training, we don't need to do anything + if args.train_iters: + return + + # Constant batch size with sample-based training. + if args.rampup_batch_size is None: + args.train_iters = args.train_samples // args.global_batch_size + + else: + # Sample based training with rampup batch size. + iterations = 0 + consumed_samples = 0 + # Rampup phase. + while consumed_samples <= int(args.rampup_batch_size[2]): + update_num_microbatches(consumed_samples, consistency_check=False) + consumed_samples += get_current_global_batch_size() + iterations += 1 + # Reset + update_num_microbatches(0, consistency_check=False) + # Constant phase + # Note that we throw away any partial last batch. + iterations += (args.train_samples - consumed_samples) // \ + args.global_batch_size + args.train_iters = iterations + + print_rank_0('setting training iterations to {}'.format(args.train_iters)) + + +def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True): + """Build the model.""" + args = get_args() + args.model_type = model_type + + # Build model. + if mpu.get_pipeline_model_parallel_world_size() > 1 and \ + args.virtual_pipeline_model_parallel_size is not None: + assert model_type != ModelType.encoder_and_decoder, \ + "Interleaved schedule not supported for model with both encoder and decoder" + model = [] + for i in range(args.virtual_pipeline_model_parallel_size): + mpu.set_virtual_pipeline_model_parallel_rank(i) + # Set pre_process and post_process only after virtual rank is set. + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + this_model = model_provider_func( + pre_process=pre_process, + post_process=post_process + ) + this_model.model_type = model_type + model.append(this_model) + else: + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + add_encoder = True + add_decoder = True + if model_type == ModelType.encoder_and_decoder: + if mpu.get_pipeline_model_parallel_world_size() > 1: + assert args.pipeline_model_parallel_split_rank is not None, \ + "Split rank needs to be specified for model with both encoder and decoder" + rank = mpu.get_pipeline_model_parallel_rank() + split_rank = args.pipeline_model_parallel_split_rank + world_size = mpu.get_pipeline_model_parallel_world_size() + pre_process = rank == 0 or rank == split_rank + post_process = (rank == (split_rank - 1)) or ( + rank == (world_size - 1)) + add_encoder = mpu.is_pipeline_stage_before_split() + add_decoder = mpu.is_pipeline_stage_after_split() + model = model_provider_func( + pre_process=pre_process, + post_process=post_process, + add_encoder=add_encoder, + add_decoder=add_decoder) + else: + model = model_provider_func( + pre_process=pre_process, + post_process=post_process + ) + model.model_type = model_type + + if not isinstance(model, list): + model = [model] + + # Set tensor model parallel attributes if not set. + # Only parameters that are already tensor model parallel have these + # attributes set for them. We should make sure the default attributes + # are set for all params so the optimizer can use them. + for model_module in model: + for param in model_module.parameters(): + tensor_parallel.set_defaults_if_not_set_tensor_model_parallel_attributes(param) + + # Print number of parameters. + if mpu.get_data_parallel_rank() == 0: + print(' > number of parameters on (tensor, pipeline) ' + 'model parallel rank ({}, {}): {}'.format( + mpu.get_tensor_model_parallel_rank(), + mpu.get_pipeline_model_parallel_rank(), + sum([sum([p.nelement() for p in model_module.parameters()]) + for model_module in model])), flush=True) + + # GPU allocation. + for model_module in model: + model_module.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if args.fp16 or args.bf16: + model = [Float16Module(model_module, args) for model_module in model] + + if wrap_with_ddp: + config = get_model_config(model[0]) + ddp_config = DistributedDataParallelConfig( + grad_reduce_in_fp32=args.accumulate_allreduce_grads_in_fp32, + overlap_grad_reduce=args.overlap_grad_reduce, + use_distributed_optimizer=args.use_distributed_optimizer, + check_for_nan_in_grad=args.check_for_nan_in_loss_and_grad, + bucket_size=args.ddp_bucket_size, + average_in_collective=args.ddp_average_in_collective) + model = [DDP(config, + ddp_config, + model_chunk, + # Turn off bucketing for model_chunk 2 onwards, since communication for these + # model chunks is overlapped with compute anyway. + disable_bucketing=(model_chunk_idx > 0)) + for (model_chunk_idx, model_chunk) in enumerate(model)] + + # Broadcast params from data parallel src rank to other data parallel ranks. + if args.data_parallel_random_init: + for model_module in model: + model_module.broadcast_params() + + return model + + +def get_optimizer_param_scheduler(optimizer): + """Build the learning rate scheduler.""" + args = get_args() + + # Iteration-based training. + if args.train_iters: + if args.lr_decay_iters is None: + args.lr_decay_iters = args.train_iters + lr_decay_steps = args.lr_decay_iters * args.global_batch_size + wd_incr_steps = args.train_iters * args.global_batch_size + wsd_decay_steps = None + if args.lr_wsd_decay_iters is not None: + wsd_decay_steps = args.lr_wsd_decay_iters * args.global_batch_size + if args.lr_warmup_fraction is not None: + lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps + else: + lr_warmup_steps = args.lr_warmup_iters * args.global_batch_size + # Sample-based training. + elif args.train_samples: + # We need to set training iters for later use. Technically + # we need to adjust the training samples too (due to last + # batch being incomplete) but we leave it as is for now. + update_train_iters(args) + if args.lr_decay_samples is None: + args.lr_decay_samples = args.train_samples + lr_decay_steps = args.lr_decay_samples + wd_incr_steps = args.train_samples + wsd_decay_steps = args.lr_wsd_decay_samples + if args.lr_warmup_fraction is not None: + lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps + else: + lr_warmup_steps = args.lr_warmup_samples + else: + raise Exception( + 'either train-iters or train-samples should be provided.') + + opt_param_scheduler = OptimizerParamScheduler( + optimizer, + init_lr=args.lr_warmup_init, + max_lr=args.lr, + min_lr=args.min_lr, + lr_warmup_steps=lr_warmup_steps, + lr_decay_steps=lr_decay_steps, + lr_decay_style=args.lr_decay_style, + start_wd=args.start_weight_decay, + end_wd=args.end_weight_decay, + wd_incr_steps=wd_incr_steps, + wd_incr_style=args.weight_decay_incr_style, + use_checkpoint_opt_param_scheduler=args.use_checkpoint_opt_param_scheduler, + override_opt_param_scheduler=args.override_opt_param_scheduler, + wsd_decay_steps=wsd_decay_steps, + lr_wsd_decay_style=args.lr_wsd_decay_style) + + return opt_param_scheduler + + +def setup_model_and_optimizer(model_provider_func, + model_type, + no_wd_decay_cond=None, + scale_lr_cond=None, + lr_mult=1.0): + """Setup model and optimizer.""" + args = get_args() + timers = get_timers() + one_logger = get_one_logger() + + model = get_model(model_provider_func, model_type) + unwrapped_model = unwrap_model(model) + + kwargs = {} + for f in dataclasses.fields(OptimizerConfig): + if hasattr(args, f.name): + kwargs[f.name] = getattr(args, f.name) + config = OptimizerConfig(**kwargs) + config.timers = timers + optimizer = get_megatron_optimizer(config, model, no_wd_decay_cond, + scale_lr_cond, lr_mult) + opt_param_scheduler = get_optimizer_param_scheduler(optimizer) + + if args.load is not None or args.pretrained_checkpoint is not None: + one_logger and one_logger.log_metrics({ + 'load_checkpoint_start_time': one_logger_utils.get_timestamp_in_ms() + }) + timers('load-checkpoint', log_level=0).start(barrier=True) + args.iteration, args.num_floating_point_operations_so_far = load_checkpoint( + model, optimizer, opt_param_scheduler) + timers('load-checkpoint').stop(barrier=True) + timers.log(['load-checkpoint']) + one_logger and one_logger.log_metrics({ + 'load_checkpoint_finish_time': one_logger_utils.get_timestamp_in_ms(), + 'load_checkpoint_time': timers('load-checkpoint').active_time() + }) + else: + args.iteration = 0 + args.num_floating_point_operations_so_far = 0 + + # get model without FP16 and/or DDP wrappers + if args.iteration == 0 and len(unwrapped_model) == 1 \ + and hasattr(unwrapped_model[0], 'init_state_dict_from_bert'): + print_rank_0("Initializing ICT from pretrained BERT model") + unwrapped_model[0].init_state_dict_from_bert() + if args.fp16: + optimizer.reload_model_params() + + return model, optimizer, opt_param_scheduler + + + +def train_step(forward_step_func, data_iterator, + model, optimizer, opt_param_scheduler, config): + """Single training step.""" + args = get_args() + timers = get_timers() + + # Set grad to zero. + for model_chunk in model: + model_chunk.zero_grad_buffer() + optimizer.zero_grad() + + # Forward pass. + forward_backward_func = get_forward_backward_func() + losses_reduced = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=data_iterator, + model=model, + num_microbatches=get_num_microbatches(), + seq_length=args.seq_length, + micro_batch_size=args.micro_batch_size, + decoder_seq_length=args.decoder_seq_length, + forward_only=False) + + # Empty unused memory. + if args.empty_unused_memory_level >= 1: + torch.cuda.empty_cache() + + # Vision gradients. + if getattr(args, 'vision_pretraining', False) and args.vision_pretraining_type == "dino": + unwrapped_model = unwrap_model(model[0]) + unwrapped_model.cancel_gradients_last_layer(args.curr_iteration) + + # Update parameters. + timers('optimizer', log_level=1).start(barrier=args.barrier_with_L1_time) + update_successful, grad_norm, num_zeros_in_grad = optimizer.step() + timers('optimizer').stop() + + # Vision momentum. + if getattr(args, 'vision_pretraining', False) and args.vision_pretraining_type == "dino": + unwrapped_model = unwrap_model(model[0]) + unwrapped_model.update_momentum(args.curr_iteration) + + # Update learning rate. + if update_successful: + increment = get_num_microbatches() * \ + args.micro_batch_size * \ + args.data_parallel_size + opt_param_scheduler.step(increment=increment) + skipped_iter = 0 + else: + skipped_iter = 1 + + # Empty unused memory. + if args.empty_unused_memory_level >= 2: + torch.cuda.empty_cache() + + if mpu.is_pipeline_last_stage(ignore_virtual=True): + # Average loss across microbatches. + loss_reduced = {} + for key in losses_reduced[0].keys(): + numerator = 0 + denominator = 0 + for x in losses_reduced: + val = x[key] + # there is one dict per microbatch. in new reporting, we average + # over the total number of tokens across the global batch. + if isinstance(val, tuple) or isinstance(val, list): + numerator += val[0] + denominator += val[1] + else: + # legacy behavior. we average over the number of microbatches, + # and so the denominator is 1. + numerator += val + denominator += 1 + loss_reduced[key] = numerator / denominator + return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad + return {}, skipped_iter, grad_norm, num_zeros_in_grad + + +def training_log(loss_dict, total_loss_dict, learning_rate, decoupled_learning_rate, iteration, + loss_scale, report_memory_flag, skipped_iter, + grad_norm, params_norm, num_zeros_in_grad): + """Log training information such as losses, timing, ....""" + args = get_args() + timers = get_timers() + writer = get_tensorboard_writer() + wandb_writer = get_wandb_writer() + one_logger = get_one_logger() + + # Advanced, skipped, and Nan iterations. + advanced_iters_key = 'advanced iterations' + skipped_iters_key = 'skipped iterations' + nan_iters_key = 'nan iterations' + # Advanced iterations. + if not skipped_iter: + total_loss_dict[advanced_iters_key] = total_loss_dict.get( + advanced_iters_key, 0) + 1 + else: + if advanced_iters_key not in total_loss_dict: + total_loss_dict[advanced_iters_key] = 0 + # Skipped iterations. + total_loss_dict[skipped_iters_key] = total_loss_dict.get( + skipped_iters_key, 0) + skipped_iter + # Update losses and set nan iterations + got_nan = False + for key in loss_dict: + if not skipped_iter: + total_loss_dict[key] = total_loss_dict.get( + key, torch.tensor([0.0], dtype=torch.float, device='cuda')) + loss_dict[key] + else: + value = loss_dict[key].float().sum().item() + is_nan = value == float('inf') or \ + value == -float('inf') or \ + value != value + got_nan = got_nan or is_nan + total_loss_dict[nan_iters_key] = total_loss_dict.get( + nan_iters_key, 0) + int(got_nan) + + # Logging. + timers_to_log = [ + 'forward-backward', + 'forward-compute', + 'backward-compute', + 'batch-generator', + 'forward-recv', + 'forward-send', + 'backward-recv', + 'backward-send', + 'forward-send-forward-recv', + 'forward-send-backward-recv', + 'backward-send-forward-recv', + 'backward-send-backward-recv', + 'forward-backward-send-forward-backward-recv', + 'layernorm-grads-all-reduce', + 'embedding-grads-all-reduce', + 'all-grads-sync', + 'params-all-gather', + 'optimizer-copy-to-main-grad', + 'optimizer-unscale-and-check-inf', + 'optimizer-clip-main-grad', + 'optimizer-count-zeros', + 'optimizer-inner-step', + 'optimizer-copy-main-to-model-params', + 'optimizer'] + + # Calculate batch size. + batch_size = args.micro_batch_size * args.data_parallel_size * \ + get_num_microbatches() + + # Track app tag & app tag ID + one_logger_utils.track_app_tag(batch_size, args.world_size, args.seq_length) + + total_iterations = total_loss_dict[advanced_iters_key] + \ + total_loss_dict[skipped_iters_key] + + # Tensorboard values. + # Timer requires all the ranks to call. + if args.log_timers_to_tensorboard and \ + (iteration % args.tensorboard_log_interval == 0): + timers.write(timers_to_log, writer, iteration, + normalizer=total_iterations) + if writer and (iteration % args.tensorboard_log_interval == 0): + if wandb_writer: + wandb_writer.log({'samples vs steps': args.consumed_train_samples}, + iteration) + if args.log_learning_rate_to_tensorboard: + writer.add_scalar('learning-rate', learning_rate, iteration) + if args.decoupled_lr is not None: + writer.add_scalar('decoupled-learning-rate', decoupled_learning_rate, iteration) + writer.add_scalar('learning-rate vs samples', learning_rate, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'learning-rate': learning_rate}, iteration) + if args.log_batch_size_to_tensorboard: + writer.add_scalar('batch-size', batch_size, iteration) + writer.add_scalar('batch-size vs samples', batch_size, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'batch-size': batch_size}, iteration) + for key in loss_dict: + writer.add_scalar(key , loss_dict[key], iteration) + writer.add_scalar(key + ' vs samples', loss_dict[key], + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({key: loss_dict[key]}, iteration) + if args.log_loss_scale_to_tensorboard: + writer.add_scalar('loss-scale', loss_scale, iteration) + writer.add_scalar('loss-scale vs samples', loss_scale, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'loss-scale': loss_scale}, iteration) + if args.log_world_size_to_tensorboard: + writer.add_scalar('world-size', args.world_size, iteration) + writer.add_scalar('world-size vs samples', args.world_size, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'world-size': args.world_size}, iteration) + if grad_norm is not None: + writer.add_scalar('grad-norm', grad_norm, iteration) + writer.add_scalar('grad-norm vs samples', grad_norm, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'grad-norm': grad_norm}, iteration) + if num_zeros_in_grad is not None: + writer.add_scalar('num-zeros', num_zeros_in_grad, iteration) + writer.add_scalar('num-zeros vs samples', num_zeros_in_grad, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'num-zeros': num_zeros_in_grad}, iteration) + if params_norm is not None: + writer.add_scalar('params-norm', params_norm, iteration) + writer.add_scalar('params-norm vs samples', params_norm, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'params-norm': params_norm}, iteration) + if args.log_memory_to_tensorboard: + mem_stats = torch.cuda.memory_stats() + writer.add_scalar( + "mem-reserved-bytes", + mem_stats["reserved_bytes.all.current"], + iteration, + ) + writer.add_scalar( + "mem-allocated-bytes", + mem_stats["allocated_bytes.all.current"], + iteration, + ) + writer.add_scalar( + "mem-allocated-count", + mem_stats["allocation.all.current"], + iteration, + ) + if args.num_experts is not None: + moe_loss_scale = 1 / get_num_microbatches() + track_moe_metrics(moe_loss_scale, iteration, writer, wandb_writer, total_loss_dict, args.moe_per_layer_logging) + + if iteration % args.log_interval == 0: + elapsed_time = timers('interval-time').elapsed(barrier=True) + elapsed_time_per_iteration = elapsed_time / total_iterations + + throughput = num_floating_point_operations(args, batch_size) / ( + elapsed_time_per_iteration * 10**12 * args.world_size) + + one_logger_utils.track_e2e_metrics(args.log_throughput, throughput) + + if args.log_timers_to_tensorboard: + if writer: + writer.add_scalar('iteration-time', + elapsed_time_per_iteration, iteration) + if wandb_writer: + wandb_writer.log({'iteration-time': elapsed_time_per_iteration}, + iteration) + log_string = f" [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]" + log_string += ' iteration {:8d}/{:8d} |'.format( + iteration, args.train_iters) + log_string += ' consumed samples: {:12d} |'.format( + args.consumed_train_samples) + log_string += ' elapsed time per iteration (ms): {:.1f} |'.format( + elapsed_time_per_iteration * 1000.0) + if args.log_throughput: + log_string += f' throughput per GPU (TFLOP/s/GPU): {throughput:.1f} |' + if args.log_timers_to_tensorboard: + if writer: + writer.add_scalar('throughput', throughput, iteration) + if wandb_writer: + wandb_writer.log({'throughput': throughput}, iteration) + assert learning_rate is not None + # Decoupled_learning_rate should be not None only on first and last pipeline stage. + log_string += ' learning rate: {:.6E} |'.format(learning_rate) + if args.decoupled_lr is not None and (mpu.is_pipeline_first_stage(ignore_virtual=True) or + mpu.is_pipeline_last_stage(ignore_virtual=True)): + assert decoupled_learning_rate is not None + log_string += ' decoupled learning rate: {:.6E} |'.format(decoupled_learning_rate) + else: + assert decoupled_learning_rate is None + log_string += ' global batch size: {:5d} |'.format(batch_size) + for key in total_loss_dict: + if key not in [advanced_iters_key, skipped_iters_key, + nan_iters_key]: + avg = total_loss_dict[key].item() / \ + float(max(1, total_loss_dict[advanced_iters_key])) + if avg > 0.0: + log_string += ' {}: {:.6E} |'.format(key, avg) + total_loss_dict[key] = torch.tensor([0.0], dtype=torch.float, device='cuda') + log_string += ' loss scale: {:.1f} |'.format(loss_scale) + if grad_norm is not None: + log_string += ' grad norm: {:.3f} |'.format(grad_norm) + if num_zeros_in_grad is not None: + log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad) + if params_norm is not None: + log_string += ' params norm: {:.3f} |'.format(params_norm) + log_string += ' number of skipped iterations: {:3d} |'.format( + total_loss_dict[skipped_iters_key]) + log_string += ' number of nan iterations: {:3d} |'.format( + total_loss_dict[nan_iters_key]) + total_loss_dict[advanced_iters_key] = 0 + total_loss_dict[skipped_iters_key] = 0 + total_loss_dict[nan_iters_key] = 0 + print_rank_last(log_string) + if report_memory_flag and learning_rate > 0.: + # Report memory after optimizer state has been initialized. + if torch.distributed.get_rank() == 0: + num_microbatches = get_num_microbatches() + report_theoretical_memory(args, num_microbatches=num_microbatches, verbose=True) + report_memory('(after {} iterations)'.format(iteration)) + report_memory_flag = False + timers.log(timers_to_log, normalizer=args.log_interval) + + return report_memory_flag + + +def compute_throughputs_and_append_to_progress_log(iteration, + num_floating_point_operations_so_far): + args = get_args() + if args.save is None: + return + + # Compute job throughput. + # args.num_floating_point_operations_so_far keeps track of floating-point operations + # completed at the start of job. + global _TRAIN_START_TIME + job_throughput = \ + (num_floating_point_operations_so_far - + args.num_floating_point_operations_so_far) / ( + (time.time() - _TRAIN_START_TIME) * 10**12 * args.world_size) + + # Compute cumulative throughput since jobs of this world size were launched. + # `get_start_time_from_progress_log` returns start time and number of floating-point + # operations of first job of this world size. + start_time, start_num_floating_point_operations = get_start_time_from_progress_log() + elapsed_time = (datetime.now() - start_time).total_seconds() + cumulative_throughput = \ + (num_floating_point_operations_so_far - + start_num_floating_point_operations) / ( + elapsed_time * 10**12 * args.world_size) + + tokens_so_far = args.consumed_train_samples * args.seq_length + saved_ckpt_prefix = 'Saving async checkpoint' if args.async_save else 'Saved checkpoint' + append_to_progress_log(f"{saved_ckpt_prefix}\tIteration: {iteration}\t" + f"Job throughput: {job_throughput:.1f} TFLOP/s/GPU\t" + f"Cumulative throughput: {cumulative_throughput:.1f} TFLOP/s/GPU\t" + f"Floating-point operations: {num_floating_point_operations_so_far:.2e}\t" + f"Tokens (in billions): {tokens_so_far / 10**9:.2f}") + + +def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler, + num_floating_point_operations_so_far, checkpointing_context): + args = get_args() + timers = get_timers() + + # Stop timer to get accurate train interval time and exclude checkpointing duration + timers('interval-time').stop() + + # Extra barrier is added to make sure all ranks report the max time. + timers('save-checkpoint', log_level=0).start(barrier=True) + save_checkpoint_start_time = timers('save-checkpoint').active_time() + + # Log E2E metrics before save-checkpoint + one_logger_utils.track_e2e_metrics() + + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.disable_pre_hook() + save_checkpoint(iteration, model, optimizer, opt_param_scheduler, + num_floating_point_operations_so_far, checkpointing_context) + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.enable_pre_hook() + timers('save-checkpoint').stop(barrier=True) + timers.log(['save-checkpoint']) + save_checkpoint_finish_time = timers('save-checkpoint').active_time() + + # Log E2E metrics after save-checkpoint + one_logger_utils.track_e2e_metrics() + save_checkpoint_duration = save_checkpoint_finish_time - save_checkpoint_start_time + one_logger_utils.on_save_checkpoint_end(save_checkpoint_duration, iteration, args.async_save) + + + if args.log_progress: + compute_throughputs_and_append_to_progress_log(iteration, + num_floating_point_operations_so_far) + + # Recover timing + timers('interval-time', log_level=0).start(barrier=True) + + +def train(forward_step_func, model, optimizer, opt_param_scheduler, + train_data_iterator, valid_data_iterator, + process_non_loss_data_func, config, checkpointing_context): + """Train the model function.""" + args = get_args() + timers = get_timers() + one_logger = get_one_logger() + + # Write args to tensorboard + write_args_to_tensorboard() + + # Turn on training mode which enables dropout. + for model_module in model: + model_module.train() + + # Tracking loss. + total_loss_dict = {} + + # Iterations. + iteration = args.iteration + + # Track E2E metrics at the start of training + one_logger_utils.on_train_start(iteration=iteration, consumed_train_samples=args.consumed_train_samples, + train_samples=args.train_samples, seq_length=args.seq_length, + train_iters=args.train_iters, save=args.save, async_save=args.async_save, + log_throughput=args.log_throughput, + num_floating_point_operations_so_far=args.num_floating_point_operations_so_far) + + num_floating_point_operations_so_far = args.num_floating_point_operations_so_far + + # Setup some training config params + config.grad_scale_func = optimizer.scale_loss + config.timers = timers + if isinstance(model[0], DDP) and args.overlap_grad_reduce: + assert config.no_sync_func is None, \ + ('When overlap_grad_reduce is True, config.no_sync_func must be None; ' + 'a custom no_sync_func is not supported when overlapping grad-reduce') + config.no_sync_func = [model_chunk.no_sync for model_chunk in model] + if len(model) == 1: + config.no_sync_func = config.no_sync_func[0] + if args.delay_grad_reduce: + config.grad_sync_func = [model_chunk.start_grad_sync for model_chunk in model] + if len(model) == 1: + config.grad_sync_func = config.grad_sync_func[0] + if args.overlap_param_gather and args.delay_param_gather: + config.param_sync_func = [lambda x: optimizer.finish_param_sync(model_index, x) + for model_index in range(len(model))] + if len(model) == 1: + config.param_sync_func = config.param_sync_func[0] + config.finalize_model_grads_func = finalize_model_grads + + timers('interval-time', log_level=0).start(barrier=True) + print_datetime('before the start of training step') + report_memory_flag = True + exit = False + + if args.manual_gc: + # Disable the default garbage collector and perform the collection manually. + # This is to align the timing of garbage collection across ranks. + assert args.manual_gc_interval >= 0, \ + 'Manual garbage collection interval should be laerger than or equal to 0.' + gc.disable() + gc.collect() + + # Singleton Initialization + if args.log_straggler: + global stimer + world = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + mmcnt = args.straggler_minmax_count + stimer.configure(world, rank, + mmcnt = mmcnt, + enabled = not args.disable_straggler_on_startup, + port = args.straggler_ctrlr_port) + total_flops = 0.0 + + num_microbatches = get_num_microbatches() + eval_duration = 0.0 + eval_iterations = 0 + + def get_e2e_base_metrics(): + """Get base metrics values for one-logger to calculate E2E tracking metrics. + """ + return { + 'iteration': iteration, + 'train_duration': timers('interval-time').active_time(), + 'eval_duration': eval_duration, + 'eval_iterations': eval_iterations, + 'total_flops': total_flops, + 'num_floating_point_operations_so_far': num_floating_point_operations_so_far, + 'consumed_train_samples': args.consumed_train_samples, + 'world_size': args.world_size, + 'seq_length': args.seq_length + } + # Cache into one-logger for callback + if one_logger: + with one_logger.get_context_manager(): + one_logger.store_set('get_e2e_base_metrics', get_e2e_base_metrics) + + while iteration < args.train_iters: + if args.profile and \ + iteration == args.profile_step_start and \ + torch.distributed.get_rank() in args.profile_ranks: + torch.cuda.cudart().cudaProfilerStart() + torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__() + + maybe_finalize_async_save(False) + + # Update number of microbatches first without consistency check to decide if a + # checkpoint should be saved. If the number of microbatches is different + # from the previous iteration, save a checkpoint. Then run consistency check + # to make sure training configuration is still valid. + update_num_microbatches(args.consumed_train_samples, consistency_check=False) + if get_num_microbatches() != num_microbatches and iteration != 0: + assert get_num_microbatches() > num_microbatches, \ + "number of microbatches should be increasing due to batch size rampup" + save_checkpoint_and_time(iteration, model, optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context) + num_microbatches = get_num_microbatches() + update_num_microbatches(args.consumed_train_samples, consistency_check=True) + + args.curr_iteration = iteration + loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \ + train_step(forward_step_func, + train_data_iterator, + model, + optimizer, + opt_param_scheduler, + config) + iteration += 1 + batch_size = mpu.get_data_parallel_world_size() * \ + args.micro_batch_size * \ + get_num_microbatches() + args.consumed_train_samples += batch_size + num_fp_ops = num_floating_point_operations(args, batch_size) + num_floating_point_operations_so_far += num_fp_ops + total_flops += num_fp_ops + + # Logging. + loss_scale = optimizer.get_loss_scale().item() + params_norm = None + if args.log_params_norm: + params_norm = calc_params_l2_norm(model) + + learning_rate = None + decoupled_learning_rate = None + for param_group in optimizer.param_groups: + if param_group['is_decoupled_lr']: + decoupled_learning_rate = param_group['lr'] + else: + learning_rate = param_group['lr'] + report_memory_flag = training_log(loss_dict, total_loss_dict, + learning_rate, + decoupled_learning_rate, + iteration, loss_scale, + report_memory_flag, skipped_iter, + grad_norm, params_norm, num_zeros_in_grad) + + # StragglerDetector + if iteration % args.log_interval == 0 and args.log_straggler: + stimer.report(total_flops, args.log_interval) + total_flops = 0.0 + + if args.check_weight_hash_across_dp_replicas_interval is not None and \ + iteration % args.check_weight_hash_across_dp_replicas_interval == 0: + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.disable_pre_hook() + assert check_param_hashes_across_dp_replicas(model), \ + "Parameter hashes not matching across DP replicas" + torch.distributed.barrier() + print_rank_0(f">>> Weight hashes match after {iteration} iterations...") + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.enable_pre_hook() + + # Autoresume + if args.adlr_autoresume and \ + (iteration % args.adlr_autoresume_interval == 0): + check_adlr_autoresume_termination(iteration, model, optimizer, + opt_param_scheduler) + + # Evaluation + if args.eval_interval and iteration % args.eval_interval == 0 and \ + args.do_valid: + timers('interval-time').stop() + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.disable_pre_hook() + if args.manual_gc and args.manual_gc_eval: + # Collect all objects. + gc.collect() + prefix = 'iteration {}'.format(iteration) + timers('eval-time', log_level=0).start(barrier=True) + evaluate_and_print_results(prefix, forward_step_func, + valid_data_iterator, model, + iteration, process_non_loss_data_func, + config, False) + eval_duration += timers('eval-time').elapsed() + eval_iterations += args.eval_iters + timers('eval-time').stop() + one_logger_utils.track_e2e_metrics() + + if args.manual_gc and args.manual_gc_eval: + # Collect only the objects created and used in evaluation. + gc.collect(generation=0) + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.enable_pre_hook() + timers('interval-time', log_level=0).start(barrier=True) + + # Checkpointing + saved_checkpoint = False + if args.exit_signal_handler: + signal_handler = get_signal_handler() + if any(signal_handler.signals_received()): + save_checkpoint_and_time(iteration, model, optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context) + print_datetime('exiting program after receiving SIGTERM.') + exit = True + break + + if args.save and args.save_interval and \ + iteration % args.save_interval == 0: + save_checkpoint_and_time(iteration, model, optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context) + saved_checkpoint = True + + # Exiting based on duration + if args.exit_duration_in_mins: + train_time = (time.time() - _TRAIN_START_TIME) / 60.0 + done_cuda = torch.tensor( + [train_time > args.exit_duration_in_mins], + dtype=torch.int, device='cuda') + torch.distributed.all_reduce( + done_cuda, op=torch.distributed.ReduceOp.MAX) + done = done_cuda.item() + if done: + if not saved_checkpoint: + save_checkpoint_and_time(iteration, model, optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context) + print_datetime('exiting program after {} minutes'.format(train_time)) + exit = True + break + + # Exiting based on iterations + if args.exit_interval and iteration % args.exit_interval == 0: + if args.save and not saved_checkpoint: + save_checkpoint_and_time(iteration, model, optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context) + torch.distributed.barrier() + print_datetime('exiting program at iteration {}'.format(iteration)) + exit = True + break + + if args.profile and \ + iteration == args.profile_step_end and \ + torch.distributed.get_rank() in args.profile_ranks: + torch.cuda.cudart().cudaProfilerStop() + + if args.manual_gc: + if args.manual_gc_interval != 0 and iteration % args.manual_gc_interval == 0: + gc.collect() + + one_logger_utils.track_e2e_metrics() + + # Flush TensorBoard, WandB writers and one-logger + writer = get_tensorboard_writer() + if writer: + writer.flush() + wandb_writer = get_wandb_writer() + if wandb_writer: + wandb_writer.finish() + + # Close out pre-hooks if using distributed optimizer and overlapped param gather. + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.disable_pre_hook() + + maybe_finalize_async_save(True) + + # If any exit conditions (signal handler, duration, iterations) have been reached, exit. + if exit: + sys.exit() + + return iteration, num_floating_point_operations_so_far + + +def evaluate(forward_step_func, + data_iterator, + model, + process_non_loss_data_func, + config, + verbose=False): + """Evaluation.""" + args = get_args() + timers = get_timers() + + timers('evaluate', log_level=0).start(barrier=True) + + if args.vision_pretraining and args.vision_pretraining_type == "dino": + from megatron.legacy.model.vision.knn_monitor import compute_feature_bank + compute_feature_bank(model) + + # Turn on evaluation mode which disables dropout. + for model_module in model: + model_module.eval() + + total_loss_dict = {} + + # make validation batch size independent from training batch size + eval_batch_size = args.global_batch_size + eval_num_microbatches = eval_batch_size // \ + (args.micro_batch_size * args.data_parallel_size) + + with torch.no_grad(): + iteration = 0 + if verbose: + print_rank_0(f'Evaluating on {args.eval_iters * eval_batch_size} samples') + while iteration < args.eval_iters: + iteration += 1 + if verbose: + print_rank_0(f'Evaluating iter {iteration}/{args.eval_iters}') + + forward_backward_func = get_forward_backward_func() + # Don't care about timing during evaluation + config.timers = None + loss_dicts = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=data_iterator, + model=model, + num_microbatches=eval_num_microbatches, + seq_length=args.seq_length, + micro_batch_size=args.micro_batch_size, + decoder_seq_length=args.decoder_seq_length, + forward_only=True) + config.timers = get_timers() + + # Empty unused memory + if args.empty_unused_memory_level >= 1: + torch.cuda.empty_cache() + + if mpu.is_pipeline_last_stage(ignore_virtual=True): + # Reduce across processes. + for loss_dict in loss_dicts: + for key in loss_dict: + if key not in total_loss_dict: + total_loss_dict[key] = torch.tensor([0.0, 0.0], dtype=torch.float).cuda() + val = loss_dict[key] + if isinstance(val, tuple) or isinstance(val, list): + total_loss_dict[key][0] += val[0] + total_loss_dict[key][1] += val[1] + else: + total_loss_dict[key][0] += val + total_loss_dict[key][1] += 1 + + args.consumed_valid_samples += eval_batch_size + + if args.exit_duration_in_mins: + train_time = (time.time() - _TRAIN_START_TIME) / 60.0 + done_cuda = torch.tensor( + [train_time > args.exit_duration_in_mins], + dtype=torch.int, device='cuda') + torch.distributed.all_reduce( + done_cuda, op=torch.distributed.ReduceOp.MAX) + done = done_cuda.item() + if done: + print_rank_0('Exiting during evaluation, timelimit reached') + return None, None, True + + collected_non_loss_data = None + if process_non_loss_data_func is not None and is_last_rank(): + collected_non_loss_data = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=data_iterator, + model=model, + num_microbatches=get_num_microbatches(), + seq_length=args.seq_length, + micro_batch_size=args.micro_batch_size, + decoder_seq_length=args.decoder_seq_length, + forward_only=True, + collect_non_loss_data=True) + + # Move model back to the train mode. + for model_module in model: + model_module.train() + + for key in total_loss_dict: + numerator, denominator = total_loss_dict[key] + total_loss_dict[key] = numerator / denominator + + timers('evaluate').stop() + timers.log(['evaluate']) + + return total_loss_dict, collected_non_loss_data, False + +def evaluate_and_print_results(prefix, forward_step_func, + data_iterator, model, + iteration, process_non_loss_data_func, config, + verbose=False, write_to_tensorboard=True): + """Helper function to evaluate and dump results on screen.""" + args = get_args() + if write_to_tensorboard: + writer = get_tensorboard_writer() + else: + writer = None + + wandb_writer = get_wandb_writer() + + total_loss_dict, collected_non_loss_data, timelimit = evaluate( + forward_step_func, data_iterator, model, + process_non_loss_data_func, config, verbose) + # Timelimit hit during evaluation + if timelimit: + return + string = ' validation loss at {} | '.format(prefix) + for key in total_loss_dict: + string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item()) + ppl = math.exp(min(20, total_loss_dict[key].item())) + string += '{} PPL: {:.6E} | '.format(key, ppl) + if writer: + writer.add_scalar('{} validation'.format(key), + total_loss_dict[key].item(), + iteration) + writer.add_scalar('{} validation vs samples'.format(key), + total_loss_dict[key].item(), + args.consumed_train_samples) + if args.log_validation_ppl_to_tensorboard: + writer.add_scalar('{} validation ppl'.format(key), ppl, + iteration) + writer.add_scalar('{} validation ppl vs samples'.format(key), + ppl, args.consumed_train_samples) + if wandb_writer and is_last_rank(): + wandb_writer.log({ + '{} validation'.format(key): total_loss_dict[key].item()}, + iteration) + + if process_non_loss_data_func is not None and writer and is_last_rank(): + process_non_loss_data_func(collected_non_loss_data, iteration, writer) + + length = len(string) + 1 + print_rank_last('-' * length) + print_rank_last(string) + print_rank_last('-' * length) + + +def cyclic_iter(iter): + while True: + for x in iter: + yield x + + +def get_train_valid_test_num_samples(): + """Train/valid/test num samples.""" + + args = get_args() + + # Number of train/valid/test samples. + if args.train_samples: + train_samples = args.train_samples + else: + train_samples = args.train_iters * args.global_batch_size + eval_iters = (args.train_iters // args.eval_interval + 1) * \ + args.eval_iters + test_iters = args.eval_iters + + return ( + train_samples, + eval_iters * args.global_batch_size, + test_iters * args.global_batch_size, + ) + + +def build_train_valid_test_datasets(build_train_valid_test_datasets_provider): + """Build pretraining datasets.""" + train_valid_test_num_samples = get_train_valid_test_num_samples() + print_rank_0(' > datasets target sizes (minimum size):') + print_rank_0(' train: {}'.format(train_valid_test_num_samples[0])) + print_rank_0(' validation: {}'.format(train_valid_test_num_samples[1])) + print_rank_0(' test: {}'.format(train_valid_test_num_samples[2])) + return build_train_valid_test_datasets_provider(train_valid_test_num_samples) + + +def build_train_valid_test_data_loaders( + build_train_valid_test_datasets_provider): + """Build pretraining data loaders.""" + + args = get_args() + + (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) + + print_rank_0('> building train, validation, and test datasets ...') + + # Backward compatibility, assume fixed batch size. + if args.iteration > 0 and args.consumed_train_samples == 0: + assert args.train_samples is None, \ + 'only backward compatiblity support for iteration-based training' + args.consumed_train_samples = args.iteration * args.global_batch_size + if args.iteration > 0 and args.consumed_valid_samples == 0: + if args.train_samples is None: + args.consumed_valid_samples = (args.iteration // args.eval_interval) * \ + args.eval_iters * args.global_batch_size + + # Rely on distributed-aware core datasets, temporary + is_distributed = getattr(build_train_valid_test_datasets_provider, "is_distributed", False) + + # Construct the data pipeline + if is_distributed or mpu.get_tensor_model_parallel_rank() == 0: + + # Build datasets. + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + build_train_valid_test_datasets_provider) + # Build dataloders. + train_dataloader = build_pretraining_data_loader( + train_ds, args.consumed_train_samples) + if args.skip_train: + valid_dataloader = build_pretraining_data_loader(valid_ds, 0) + else: + valid_dataloader = build_pretraining_data_loader( + valid_ds, args.consumed_valid_samples) + test_dataloader = build_pretraining_data_loader(test_ds, 0) + + # Flags to know if we need to do training/validation/testing. + do_train = train_dataloader is not None and args.train_iters > 0 + do_valid = valid_dataloader is not None and args.eval_iters > 0 + do_test = test_dataloader is not None and args.eval_iters > 0 + flags = torch.tensor( + [int(do_train), int(do_valid), int(do_test)], + dtype=torch.long, device='cuda') + else: + flags = torch.tensor([0, 0, 0], dtype=torch.long, device='cuda') + + torch.distributed.broadcast(flags, 0) + + args.do_train = getattr(args, "do_train", False) or flags[0].item() + args.do_valid = getattr(args, "do_valid", False) or flags[1].item() + args.do_test = getattr(args, "do_test", False) or flags[2].item() + + return train_dataloader, valid_dataloader, test_dataloader + + +def build_train_valid_test_data_iterators( + build_train_valid_test_datasets_provider): + """Build pretraining data iterators.""" + + args = get_args() + + # Build loaders. + train_dataloader, valid_dataloader, test_dataloader = \ + build_train_valid_test_data_loaders( + build_train_valid_test_datasets_provider) + + # Build iterators. + dl_type = args.dataloader_type + assert dl_type in ['single', 'cyclic', 'external'] + + def _get_iterator(dataloader_type, dataloader): + """Return dataset iterator.""" + if dataloader_type == "single": + return iter(dataloader) + elif dataloader_type == "cyclic": + return iter(cyclic_iter(dataloader)) + elif dataloader_type == "external": + # External dataloader is passed through. User is expected to define how to iterate. + return dataloader + else: + raise RuntimeError("unexpected dataloader type") + + if train_dataloader is not None: + train_data_iterator = _get_iterator(dl_type, train_dataloader) + else: + train_data_iterator = None + + if valid_dataloader is not None: + valid_data_iterator = _get_iterator(dl_type, valid_dataloader) + else: + valid_data_iterator = None + + if test_dataloader is not None: + test_data_iterator = _get_iterator(dl_type, test_dataloader) + else: + test_data_iterator = None + + return train_data_iterator, valid_data_iterator, test_data_iterator diff --git a/megatron/training/utils.py b/megatron/training/utils.py new file mode 100644 index 0000000..5965d78 --- /dev/null +++ b/megatron/training/utils.py @@ -0,0 +1,386 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""General utilities.""" +import os +import sys +from datetime import datetime + +import torch + +try: + from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_l2norm +except ImportError: + try: + from apex.multi_tensor_apply import multi_tensor_applier + except ImportError: + multi_tensor_applier = None + + try: + from amp_C import multi_tensor_l2norm + except ImportError: + import warnings + warnings.warn( + f'Transformer Engine and Apex are not installed. ' + 'Falling back to local implementations of ' + 'multi_tensor_applier and multi_tensor_l2norm' + ) + + from megatron.core.utils import ( + local_multi_tensor_l2_norm as multi_tensor_l2norm, + local_multi_tensor_applier as multi_tensor_applier, + ) + +from megatron.training import ( + get_args, + get_adlr_autoresume, +) +from megatron.core import DistributedDataParallel as DDP +from megatron.core import mpu +from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate +from megatron.legacy.model import Float16Module +from megatron.legacy.model.module import param_is_not_shared + + +ALL_MODULE_WRAPPER_CLASSNAMES = (DDP, Float16Module) + + +def unwrap_model(model, module_instances=ALL_MODULE_WRAPPER_CLASSNAMES): + return_list = True + if not isinstance(model, list): + model = [model] + return_list = False + unwrapped_model = [] + for model_module in model: + while isinstance(model_module, module_instances): + model_module = model_module.module + unwrapped_model.append(model_module) + if not return_list: + return unwrapped_model[0] + return unwrapped_model + + +def calc_params_l2_norm(model): + """Calculate l2 norm of parameters """ + args = get_args() + if not isinstance(model, list): + model = [model] + # Remove duplicate params. + params_data = [] + for model_ in model: + for param in model_.parameters(): + is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) + if mpu.get_expert_model_parallel_rank() > 0: + if not getattr(param, 'allreduce', True) and is_not_tp_duplicate: + assert param_is_not_shared(param) + params_data.append(param.data.float() if args.bf16 else param.data) + else: + is_not_shared = param_is_not_shared(param) + if is_not_shared and is_not_tp_duplicate: + params_data.append(param.data.float() if args.bf16 else param.data) + + # Calculate norm + dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') + norm, _ = multi_tensor_applier( + multi_tensor_l2norm, + dummy_overflow_buf, + [params_data], + False # no per-parameter norm + ) + norm_2 = norm * norm + if mpu.get_expert_model_parallel_world_size() == 1: + # Sum across all model-parallel GPUs(tensor + pipeline). + torch.distributed.all_reduce(norm_2, + op=torch.distributed.ReduceOp.SUM, + group=mpu.get_model_parallel_group()) + else: + # Sum across tensor, pipeline and expert model-parallel GPUs. + torch.distributed.all_reduce(norm_2, + op=torch.distributed.ReduceOp.SUM, + group=mpu.get_tensor_and_expert_parallel_group()) + torch.distributed.all_reduce(norm_2, + op=torch.distributed.ReduceOp.SUM, + group=mpu.get_pipeline_model_parallel_group()) + return norm_2.item() ** 0.5 + + +def average_losses_across_data_parallel_group(losses): + """Reduce a tensor of losses across all GPUs.""" + averaged_losses = torch.cat( + [loss.clone().detach().view(1) for loss in losses]) + torch.distributed.all_reduce(averaged_losses, + group=mpu.get_data_parallel_group()) + averaged_losses = averaged_losses / \ + torch.distributed.get_world_size(group=mpu.get_data_parallel_group()) + + return averaged_losses + + +def report_memory(name): + """Simple GPU memory report.""" + mega_bytes = 1024.0 * 1024.0 + string = name + ' memory (MB)' + string += ' | allocated: {}'.format( + torch.cuda.memory_allocated() / mega_bytes) + string += ' | max allocated: {}'.format( + torch.cuda.max_memory_allocated() / mega_bytes) + string += ' | reserved: {}'.format( + torch.cuda.memory_reserved() / mega_bytes) + string += ' | max reserved: {}'.format( + torch.cuda.max_memory_reserved() / mega_bytes) + if mpu.get_data_parallel_rank() == 0: + print("[Rank {}] {}".format(torch.distributed.get_rank(), string), + flush=True) + + +def print_params_min_max_norm(optimizer, iteration): + """Print min, max, and norm of all parameters.""" + index = 0 + rank = torch.distributed.get_rank() + string = 'iteration, rank, index, tensor-model-parallel, min, max, norm\n' + optimizer_ = optimizer.optimizer + for param_group in optimizer_.param_groups: + for param in param_group['params']: + index += 1 + min_ = param.data.min() + max_ = param.data.max() + norm = torch.linalg.norm(param.data) + string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format( + iteration, rank, index, int(param.tensor_model_parallel)) + string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm) + print(string, flush=True) + + +def check_adlr_autoresume_termination(iteration, model, + optimizer, opt_param_scheduler): + """Check for autoresume signal and exit if it is received.""" + from megatron.training.checkpointing import save_checkpoint + + args = get_args() + autoresume = get_adlr_autoresume() + # Add barrier to ensure consistnecy. + torch.distributed.barrier() + if autoresume.termination_requested(): + if args.save: + save_checkpoint(iteration, model, optimizer, opt_param_scheduler) + print_rank_0(">>> autoresume termination request found!") + if torch.distributed.get_rank() == 0: + autoresume.request_resume() + print_rank_0(">>> training terminated. Returning") + sys.exit(0) + + +def get_ltor_masks_and_position_ids(data, + eod_token, + reset_position_ids, + reset_attention_mask, + eod_mask_loss): + """Build masks and position id for left to right model.""" + + # Extract batch size and sequence length. + micro_batch_size, seq_length = data.size() + + # Attention mask (lower triangular). + if reset_attention_mask: + att_mask_batch = micro_batch_size + else: + att_mask_batch = 1 + attention_mask = torch.tril(torch.ones( + (att_mask_batch, seq_length, seq_length), device=data.device)).view( + att_mask_batch, 1, seq_length, seq_length) + + # Loss mask. + loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) + if eod_mask_loss: + loss_mask[data == eod_token] = 0.0 + + # Position ids. + position_ids = torch.arange(seq_length, dtype=torch.long, + device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + # We need to clone as the ids will be modifed based on batch index. + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(micro_batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, data[b] == eod_token] + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1):] -= (i + 1 - prev_index) + prev_index = i + 1 + + # Convert attention mask to binary: + attention_mask = (attention_mask < 0.5) + + return attention_mask, loss_mask, position_ids + + +def get_batch_on_this_cp_rank(batch): + """ Slice batch input along sequence dimension into multiple chunks, + which are parallelized across GPUs in a context parallel group. + """ + + # With causal masking, each token only attends to its prior tokens. Simply split + # sequence into CP chunks can result in severe load imbalance. That's to say, chunks + # at the end of sequence have bigger workload than others. To address this issue, + # we split sequence into 2*CP ranks. Assuming CP=2, we then get 4 chunks, chunk_0 + # and chunk_3 are assigned to GPU0, chunk_1 and chunk_2 are assigned to GPU1, so + # that we can get balanced workload among GPUs in a context parallel group. + args = get_args() + cp_size = args.context_parallel_size + if cp_size > 1: + cp_rank = mpu.get_context_parallel_rank() + for key, val in batch.items(): + if val is not None: + seq_dim = 1 if key != 'attention_mask' else 2 + val = val.view( + *val.shape[0:seq_dim], + 2 * cp_size, + val.shape[seq_dim] // (2 * cp_size), + *val.shape[(seq_dim + 1) :], + ) + index = torch.tensor([cp_rank, (2 * cp_size - cp_rank - 1)], + device="cpu", pin_memory=True).cuda(non_blocking=True) + val = val.index_select(seq_dim, index) + val = val.view(*val.shape[0:seq_dim], -1, *val.shape[(seq_dim + 2) :]) + batch[key] = val + + return batch + + +def print_rank_0(message): + """If distributed is initialized, print only on rank 0.""" + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + print(message, flush=True) + else: + print(message, flush=True) + +def is_last_rank(): + return torch.distributed.get_rank() == ( + torch.distributed.get_world_size() - 1) + +def print_rank_last(message): + """If distributed is initialized, print only on last rank.""" + if torch.distributed.is_initialized(): + if is_last_rank(): + print(message, flush=True) + else: + print(message, flush=True) + + +def append_to_progress_log(string, barrier=True): + """ Append given string to progress log. """ + args = get_args() + if args.save is None: + return + progress_log_filename = os.path.join(args.save, "progress.txt") + if barrier: + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + with open(progress_log_filename, 'a') as f: + job_id = os.getenv('SLURM_JOB_ID', '') + num_gpus = args.world_size + f.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\tJob ID: {job_id}\t" + f"# GPUs: {num_gpus}\t{string}\n") + + +def get_batch_on_this_tp_rank(data_iterator): + + args = get_args() + + def _broadcast(item): + if item is not None: + torch.distributed.broadcast(item, mpu.get_tensor_model_parallel_src_rank(), group=mpu.get_tensor_model_parallel_group()) + + if mpu.get_tensor_model_parallel_rank() == 0: + + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + + batch = { + 'tokens': data["tokens"].cuda(non_blocking = True), + 'labels': data["labels"].cuda(non_blocking = True), + 'loss_mask': data["loss_mask"].cuda(non_blocking = True), + 'attention_mask': None if "attention_mask" not in data else data["attention_mask"].cuda(non_blocking = True), + 'position_ids': data["position_ids"].cuda(non_blocking = True) + } + + if args.pipeline_model_parallel_size == 1: + _broadcast(batch['tokens']) + _broadcast(batch['labels']) + _broadcast(batch['loss_mask']) + _broadcast(batch['attention_mask']) + _broadcast(batch['position_ids']) + + elif mpu.is_pipeline_first_stage(): + _broadcast(batch['tokens']) + _broadcast(batch['attention_mask']) + _broadcast(batch['position_ids']) + + elif mpu.is_pipeline_last_stage(): + _broadcast(batch['labels']) + _broadcast(batch['loss_mask']) + _broadcast(batch['attention_mask']) + + else: + + tokens=torch.empty((args.micro_batch_size,args.seq_length), dtype = torch.int64 , device = torch.cuda.current_device()) + labels=torch.empty((args.micro_batch_size,args.seq_length), dtype = torch.int64 , device = torch.cuda.current_device()) + loss_mask=torch.empty((args.micro_batch_size,args.seq_length), dtype = torch.float32 , device = torch.cuda.current_device()) + if args.create_attention_mask_in_dataloader: + attention_mask=torch.empty( + (args.micro_batch_size,1,args.seq_length,args.seq_length), dtype = torch.bool , device = torch.cuda.current_device() + ) + else: + attention_mask=None + position_ids=torch.empty((args.micro_batch_size,args.seq_length), dtype = torch.int64 , device = torch.cuda.current_device()) + + if args.pipeline_model_parallel_size == 1: + _broadcast(tokens) + _broadcast(labels) + _broadcast(loss_mask) + _broadcast(attention_mask) + _broadcast(position_ids) + + elif mpu.is_pipeline_first_stage(): + labels=None + loss_mask=None + + _broadcast(tokens) + _broadcast(attention_mask) + _broadcast(position_ids) + + elif mpu.is_pipeline_last_stage(): + tokens=None + position_ids=None + + _broadcast(labels) + _broadcast(loss_mask) + _broadcast(attention_mask) + + batch = { + 'tokens': tokens, + 'labels': labels, + 'loss_mask': loss_mask, + 'attention_mask': attention_mask, + 'position_ids': position_ids + } + + return batch diff --git a/megatron/training/yaml_arguments.py b/megatron/training/yaml_arguments.py new file mode 100644 index 0000000..f81d4de --- /dev/null +++ b/megatron/training/yaml_arguments.py @@ -0,0 +1,456 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Megatron arguments.""" + +import argparse +import dataclasses +import json +import os +import torch +import types + +from itertools import chain, starmap +from types import SimpleNamespace +import yaml, re, os +from types import SimpleNamespace + +import torch.nn.functional as F + +from megatron.core.transformer import TransformerConfig + +# Taken from https://stackoverflow.com/questions/65414773/parse-environment-variable-from-yaml-with-pyyaml +# Allows for yaml to use environment variables +env_pattern = re.compile(r".*?\${(.*?)}.*?") +def env_constructor(loader, node): + value = loader.construct_scalar(node) + for group in env_pattern.findall(value): + assert os.environ.get(group) is not None, f"environment variable {group} in yaml not found" + value = value.replace(f"${{{group}}}", os.environ.get(group)) + return value +yaml.add_implicit_resolver("!pathex", env_pattern) +yaml.add_constructor("!pathex", env_constructor) + + +str_dtype_to_torch = { + "float32" : torch.float32, + "float16" : torch.float16, + "bfloat16" : torch.bfloat16 +} + +def validate_yaml(args, defaults={}): + + # This is for legacy script env var setting + if type(args.data_path) is str: + # If no white space its a single path + split_data_path = args.data_path.split() + if len(split_data_path) != 1: + args.data_path = split_data_path + + # Tensor model parallel size. + args.model_parallel.tensor_model_parallel_size = min( + args.model_parallel.tensor_model_parallel_size, args.world_size) + assert args.world_size % args.model_parallel.tensor_model_parallel_size == 0, 'world size'\ + ' ({}) is not divisible by tensor model parallel size ({})'.format( + args.world_size, args.model_parallel.tensor_model_parallel_size) + # Pipeline model parallel size. + args.model_parallel.pipeline_model_parallel_size = min( + args.model_parallel.pipeline_model_parallel_size, + (args.world_size // args.model_parallel.tensor_model_parallel_size)) + args.model_parallel.transformer_pipeline_model_parallel_size = ( + args.model_parallel.pipeline_model_parallel_size - 1 + if args.standalone_embedding_stage else + args.model_parallel.pipeline_model_parallel_size + ) + # Checks. + model_parallel_size = args.model_parallel.pipeline_model_parallel_size * \ + args.model_parallel.tensor_model_parallel_size + assert args.world_size % (model_parallel_size * args.model_parallel.context_parallel_size) == 0, \ + 'world size ({}) is not divisible by tensor parallel size ({}) times ' \ + 'pipeline parallel size ({}) times context parallel size ({})'.format( + args.world_size, args.model_parallel.tensor_model_parallel_size, + args.model_parallel.pipeline_model_parallel_size, args.model_parallel.context_parallel_size) + + # data_parallel_size is not in model parallel config + args.data_parallel_size = args.world_size // (model_parallel_size * args.model_parallel.context_parallel_size) + if args.rank == 0: + print('using world size: {}, data-parallel size: {}, ' + 'context-parallel size: {} ' + 'tensor-model-parallel size: {}, ' + 'pipeline-model-parallel size: {} '.format( + args.world_size, args.data_parallel_size, + args.model_parallel.context_parallel_size, + args.model_parallel.tensor_model_parallel_size, + args.model_parallel.pipeline_model_parallel_size), flush=True) + if args.model_parallel.pipeline_model_parallel_size > 1: + if args.model_parallel.pipeline_model_parallel_split_rank is not None: + assert args.model_parallel.pipeline_model_parallel_split_rank < \ + args.model_parallel.pipeline_model_parallel_size, 'split rank needs'\ + ' to be less than pipeline model parallel size ({})'.format( + args.model_parallel.pipeline_model_parallel_size) + + if args.model_parallel.tp_comm_overlap: + assert args.model_parallel.sequence_parallel == True, 'Tensor parallel communication/GEMM overlap can happen only when sequence parallelism is enabled' + + # Set input defaults. + for key in defaults: + # For default to be valid, it should not be provided in the + # arguments that are passed to the program. We check this by + # ensuring the arg is set to None. + if getattr(args, key, None) is not None: + if args.rank == 0: + print('WARNING: overriding default arguments for {key}:{v} \ + with {key}:{v2}'.format(key=key, v=defaults[key], + v2=getattr(args, key)), + flush=True) + else: + setattr(args, key, defaults[key]) + + # Batch size. + assert args.micro_batch_size is not None + assert args.micro_batch_size > 0 + if args.global_batch_size is None: + args.global_batch_size = args.micro_batch_size * args.data_parallel_size + if args.rank == 0: + print('setting global batch size to {}'.format( + args.global_batch_size), flush=True) + assert args.global_batch_size > 0 + + # num_layers_per_virtual_pipeline_stage is not insde model parallel for checkpointing + if args.num_layers_per_virtual_pipeline_stage is not None: + assert args.model_parallel.pipeline_model_parallel_size > 2, \ + 'pipeline-model-parallel size should be greater than 2 with ' \ + 'interleaved schedule' + assert args.language_model.num_layers % args.model_parallel.transformer_pipeline_model_parallel_size == 0, \ + 'number of layers should be divisible by the pipeline parallel size' + num_layers_per_pipeline_stage = args.language_model.num_layers // args.model_parallel.transformer_pipeline_model_parallel_size + assert num_layers_per_pipeline_stage % args.num_layers_per_virtual_pipeline_stage == 0, \ + 'number of layers per pipeline stage must be divisible number of layers per virtual pipeline stage' + args.model_parallel.virtual_pipeline_model_parallel_size = num_layers_per_pipeline_stage // \ + args.num_layers_per_virtual_pipeline_stage + else: + args.model_parallel.virtual_pipeline_model_parallel_size = None + # Overlap P2P communication is disabled if not using the interleaved schedule. + args.model_parallel.overlap_p2p_comm = False + if args.rank == 0: + print('WARNING: Setting args.overlap_p2p_comm to False since non-interleaved ' + 'schedule does not support overlapping p2p communication') + + if args.overlap_param_gather: + assert args.use_distributed_optimizer, \ + '--overlap-param-gather only supported with distributed optimizer' + assert args.overlap_grad_reduce, \ + '--overlap-grad-reduce should be turned on when using --overlap-param-gather' + + # Parameters dtype. + if args.model_parallel.fp16: + assert not args.model_parallel.bf16 + args.model_parallel.params_dtype = torch.half + if args.model_parallel.bf16: + assert not args.model_parallel.fp16 + args.model_parallel.params_dtype = torch.bfloat16 + # bfloat16 requires gradient accumulation and all-reduce to + # be done in fp32. + if not args.accumulate_allreduce_grads_in_fp32: + args.accumulate_allreduce_grads_in_fp32 = True + if args.rank == 0: + print('accumulate and all-reduce gradients in fp32 for ' + 'bfloat16 data type.', flush=True) + + if args.rank == 0: + print('using {} for parameters ...'.format(args.model_parallel.params_dtype), + flush=True) + + if args.dataloader_type is None: + args.dataloader_type = 'single' + + # Consumed tokens. + args.consumed_train_samples = 0 + args.consumed_valid_samples = 0 + + # Support for variable sequence lengths across batches/microbatches. + # set it if the dataloader supports generation of variable sequence lengths + # across batches/microbatches. Due to additional communication overhead + # during pipeline parallelism, it should not be set if sequence length + # is constant during training. + args.model_parallel.variable_seq_lengths = False + + # Iteration-based training. + if args.train_iters: + # If we use iteration-based training, make sure the + # sample-based options are off. + assert args.train_samples is None, \ + 'expected iteration-based training' + assert args.lr_decay_samples is None, \ + 'expected iteration-based learning rate decay' + assert args.lr_warmup_samples == 0, \ + 'expected iteration-based learning rate warmup' + assert args.rampup_batch_size is None, \ + 'expected no batch-size rampup for iteration-based training' + if args.lr_warmup_fraction is not None: + assert args.lr_warmup_iters == 0, \ + 'can only specify one of lr-warmup-fraction and lr-warmup-iters' + + # Sample-based training. + if args.train_samples: + # If we use sample-based training, make sure the + # iteration-based options are off. + assert args.train_iters is None, \ + 'expected sample-based training' + assert args.lr_decay_iters is None, \ + 'expected sample-based learning rate decay' + assert args.lr_warmup_iters == 0, \ + 'expected sample-based learnig rate warmup' + if args.lr_warmup_fraction is not None: + assert args.lr_warmup_samples == 0, \ + 'can only specify one of lr-warmup-fraction ' \ + 'and lr-warmup-samples' + + # How to handle this better + if args.language_model.num_layers is not None: + assert args.encoder_num_layers is None, \ + 'cannot have both num-layers and encoder-num-layers specified' + args.encoder_num_layers = args.language_model.num_layers + else: + assert args.encoder_num_layers is not None, \ + 'either num-layers or encoder-num-layers should be specified' + args.language_model.num_layers = args.encoder_num_layers + + # Check required arguments. + # removed max_position_embeddings from reqs + required_args = ['num_layers', 'hidden_size', 'num_attention_heads'] + for req_arg in required_args: + _check_arg_is_not_none(args.language_model, req_arg) + + # Checks. + if args.language_model.ffn_hidden_size is None: + if args.language_model.activation_func == "swiglu": + # reduce the dimnesion for MLP since projections happens on + # two linear layers. this keeps the number of paramters in + # the same ballpark as the counterpart with 4*h size + # we keep it a multiple of 64, which means the actual tensor size + # will be a multiple of 64 / tp_size + args.language_model.ffn_hidden_size = int((4 * args.language_model.hidden_size * 2 / 3) / 64) * 64 + else: + args.language_model.ffn_hidden_size = 4 * args.language_model.hidden_size + + if args.language_model.kv_channels is None: + assert args.language_model.hidden_size % args.language_model.num_attention_heads == 0 + args.language_model.kv_channels = args.language_model.hidden_size // args.language_model.num_attention_heads + + #TODO: Implement arguments for encoder-decoder + if args.seq_length is not None: + assert args.encoder_seq_length is None + args.encoder_seq_length = args.seq_length + else: + assert args.encoder_seq_length is not None + args.seq_length = args.encoder_seq_length + + if args.seq_length is not None: + assert args.max_position_embeddings >= args.seq_length + if args.decoder_seq_length is not None: + assert args.max_position_embeddings >= args.decoder_seq_length + if args.lr is not None: + assert args.min_lr <= args.lr + if args.save is not None: + assert args.save_interval is not None + # Mixed precision checks. + if args.fp16_lm_cross_entropy: + assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.' + if args.language_model.fp32_residual_connection: + assert args.model_parallel.fp16 or args.model_parallel.bf16, \ + 'residual connection in fp32 only supported when using fp16 or bf16.' + + if args.language_model.moe_grouped_gemm: + assert args.model_parallel.bf16, 'Currently GroupedGEMM for MoE only supports bf16 dtype.' + dc = torch.cuda.get_device_capability() + assert dc[0] >= 8, "Unsupported compute capability for GroupedGEMM kernels." + + if args.weight_decay_incr_style == 'constant': + assert args.start_weight_decay is None + assert args.end_weight_decay is None + args.start_weight_decay = args.weight_decay + args.end_weight_decay = args.weight_decay + else: + assert args.start_weight_decay is not None + assert args.end_weight_decay is not None + + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + # Persistent fused layer norm. + if TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 11): + args.language_model.persist_layer_norm = False + if args.rank == 0: + print('Persistent fused layer norm kernel is supported from ' + 'pytorch v1.11 (nvidia pytorch container paired with v1.11). ' + 'Defaulting to no_persist_layer_norm=True') + + # Activation recomputing. + if args.language_model.distribute_saved_activations: + assert args.model_parallel.tensor_model_parallel_size > 1, 'can distribute ' \ + 'recomputed activations only across tensor model ' \ + 'parallel groups' + assert args.language_model.recompute_granularity == 'full', \ + 'distributed recompute activations is only '\ + 'application to full recompute granularity' + assert args.language_model.recompute_method is not None, \ + 'for distributed recompute activations to work you '\ + 'need to use a recompute method ' + assert (TORCH_MAJOR, TORCH_MINOR) >= (1, 10), \ + 'distributed recompute activations are supported for pytorch ' \ + 'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \ + 'pytorch version is v%s.%s.' % (TORCH_MAJOR, TORCH_MINOR) + + if args.language_model.recompute_granularity == 'selective': + assert args.language_model.recompute_method is None, \ + 'recompute method is not yet supported for ' \ + 'selective recomputing granularity' + + # disable sequence parallelism when tp=1 + # to avoid change in numerics when + # sequence_parallelism is enabled. + if args.model_parallel.tensor_model_parallel_size == 1: + args.model_parallel.sequence_parallel = False + + # disable async_tensor_model_parallel_allreduce when + # model parallel memory optimization is enabled + if args.model_parallel.sequence_parallel: + args.model_parallel.async_tensor_model_parallel_allreduce = False + + if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": + if args.model_parallel.sequence_parallel: + raise RuntimeError( + "Using sequence parallelism requires setting the environment variable " + "CUDA_DEVICE_MAX_CONNECTIONS to 1") + if args.model_parallel.async_tensor_model_parallel_allreduce: + raise RuntimeError( + "Using async gradient all reduce requires setting the environment " + "variable CUDA_DEVICE_MAX_CONNECTIONS to 1") + + # Retro checks. + if getattr(args, 'retro_add_retriever', False): + raise Exception("Retro untested for yaml args. See arguments.py.") + + # Sequence parallelism unsupported. + assert not args.sequence_parallel, \ + "retro currently does not support sequence parallelism." + + # Pipeline parallelism unsupported. + assert args.pipeline_model_parallel_size == 1, \ + "retro currently does not support pipeline parallelism." + + #TODO: Retro args loading not tested + # Load retro args (used by both Retro & GPT). + if getattr(args, 'retro_project_dir', None) is not None: + raise Exception("Retro untested for yaml args. See arguments.py.") + + if args.language_model.rotary_interleaved and args.language_model.apply_rope_fusion: + raise RuntimeError('--rotary-interleaved does not work with rope_fusion.') + + # MoE Spec check + if args.language_model.num_moe_experts is not None: + assert args.spec is None, "Model Spec must be None when using MoEs" + if args.model_parallel.tensor_model_parallel_size > 1: + assert args.model_parallel.sequence_parallel, \ + "When using MoE and tensor parallelism, sequence parallelism must be used." + + # Expert parallelism check + if args.model_parallel.expert_model_parallel_size > 1: + assert args.language_model.num_moe_experts is not None, "num_experts must be non None to use expert model parallelism" + assert args.language_model.num_moe_experts % args.model_parallel.expert_model_parallel_size == 0, \ + "Number of experts should be a multiple of expert model parallel_size." + assert not args.model_parallel.fp16, \ + "Expert parallelism is not supported with fp16 training." + + # Print arguments. + _print_args("arguments", args) + + #TODO: Added as much of the global initialization requires the model parallel arguments + args = SimpleNamespace(**args.__dict__, **args.model_parallel.__dict__) + args = SimpleNamespace(**args.__dict__, **args.language_model.__dict__) + # For GPT Layer spec in pretrain_gpt + args.num_experts = args.language_model.num_moe_experts + + return args + +def _print_args(title, args): + """Print arguments.""" + if args.rank == 0: + print(f'------------------------ {title} ------------------------', + flush=True) + str_list = [] + for arg in vars(args): + dots = '.' * (48 - len(arg)) + str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg))) + for arg in sorted(str_list, key=lambda x: x.lower()): + print(arg, flush=True) + print(f'-------------------- end of {title} ---------------------', + flush=True) + +def core_config_from_args(args, dataclass=TransformerConfig): + """Builds core config object from namespace args from given dataclass + + Raises exception if argument missing in args + + Args: + args(SimpleNamespace, optional): Namespace to pull argument values from + dataclass (dataclass, optional): Core dataclass config to pull argument names from + + + Returns: + SimpleNamespace: The returned namespace to build core config from + """ + kw_args = {} + for f in dataclasses.fields(dataclass): + if hasattr(args, f.name): + kw_args[f.name] = getattr(args, f.name) + else: + raise Exception(f"Missing argument {f.name} for {str(dataclass)} config") + return kw_args + +def _check_arg_is_not_none(args, arg): + assert getattr(args, arg) is not None, '{} argument is None'.format(arg) + +def core_transformer_config_from_yaml(args, transfomer_key = "language_model"): + # Combine transfomer config with model parallel args + args = SimpleNamespace(**vars(getattr(args, transfomer_key)), **vars(args.model_parallel)) + # Translate args to core transformer configuration + kw_args = core_config_from_args(args, TransformerConfig) + + # Hardcoded + kw_args['deallocate_pipeline_outputs'] = True + kw_args['pipeline_dtype'] = kw_args['params_dtype'] + kw_args['batch_p2p_comm'] = not args.overlap_p2p_comm + + assert args.activation_func in ["swiglu","squaredrelu","gelu"], f"{args.activation_func} is not a supported activation function" + if args.activation_func == "swiglu": + kw_args['activation_func'] = F.silu + kw_args['gated_linear_unit'] = True + kw_args['bias_activation_fusion'] = args.bias_swiglu_fusion + elif args.activation_func == "squaredrelu": + def squared_relu(x): + return torch.pow(F.relu(x), 2) + kw_args['activation_func'] = squared_relu + elif args.activation_func == "gelu": + kw_args['activation_func'] = F.gelu + if args.add_bias_linear: + kw_args['bias_activation_fusion'] = False + else: + kw_args['bias_activation_fusion'] = args.bias_activation_fusion + + if args.init_method == "xavier_uniform": + kw_args['init_method'] = torch.nn.init.xavier_uniform_ + kw_args['scaled_init_method'] = torch.nn.init.xavier_uniform_ + + # Return Transformer config. + return TransformerConfig(**kw_args) + +def load_yaml(yaml_path): + print(f"warning using experimental yaml arguments feature, argparse arguments will be ignored") + with open(yaml_path, "r") as f: + config = yaml.load(f,Loader=yaml.FullLoader) + # Convert to nested namespace + config_namespace = json.loads(json.dumps(config), object_hook=lambda item: SimpleNamespace(**item)) + # Add config location to namespace + config_namespace.yaml_cfg = yaml_path + return config_namespace + diff --git a/pretrain_bert.py b/pretrain_bert.py new file mode 100644 index 0000000..f5c5530 --- /dev/null +++ b/pretrain_bert.py @@ -0,0 +1,192 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain BERT""" + +from functools import partial + +import torch +import torch.nn.functional as F + +from megatron.training import get_args +from megatron.training import get_tokenizer +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.core import tensor_parallel +from megatron.core.enums import ModelType +import megatron.legacy.model +from megatron.core.models.bert.bert_model import BertModel +from megatron.training import pretrain +from megatron.training.utils import average_losses_across_data_parallel_group +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.transformer.spec_utils import import_module +from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec, bert_layer_local_spec +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.bert_dataset import BERTMaskedWordPieceDataset, BERTMaskedWordPieceDatasetConfig +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core import mpu, tensor_parallel + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building BERT model ...') + + args = get_args() + config = core_transformer_config_from_args(args) + num_tokentypes = 2 if args.bert_binary_head else 0 + + if args.use_legacy_models: + model = megatron.legacy.model.BertModel( + config=config, + num_tokentypes=num_tokentypes, + add_binary_head=args.bert_binary_head, + parallel_output=True, + pre_process=pre_process, + post_process=post_process) + else: + if args.spec is None: + transformer_layer_spec = bert_layer_with_transformer_engine_spec #default spec + elif args.spec[0] == 'local': + print_rank_0('Using Local spec for transformer layers') + transformer_layer_spec = bert_layer_local_spec + else : + transformer_layer_spec = import_module(args.spec) + + model = BertModel( + config=config, + transformer_layer_spec=transformer_layer_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + num_tokentypes=num_tokentypes, + add_binary_head=args.bert_binary_head, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + parallel_output=True, + pre_process=pre_process, + post_process=post_process) + + return model + + +def get_batch(data_iterator): + """Build the batch.""" + + # Items and their type. + keys = ['text', 'types', 'labels', + 'is_random', 'loss_mask', 'padding_mask'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens = data_b['text'].long() + types = data_b['types'].long() + sentence_order = data_b['is_random'].long() + loss_mask = data_b['loss_mask'].float() + lm_labels = data_b['labels'].long() + padding_mask = data_b['padding_mask'].long() + + return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask + + +def loss_func(loss_mask, sentence_order, output_tensor): + lm_loss_, sop_logits = output_tensor + + lm_loss_ = lm_loss_.float() + loss_mask = loss_mask.float() + lm_loss = torch.sum( + lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum() + + if sop_logits is not None: + sop_loss = F.cross_entropy(sop_logits.view(-1, 2).float(), + sentence_order.view(-1), + ignore_index=-1) + sop_loss = sop_loss.float() + loss = lm_loss + sop_loss + averaged_losses = average_losses_across_data_parallel_group( + [lm_loss, sop_loss]) + return loss, {'lm loss': averaged_losses[0], + 'sop loss': averaged_losses[1]} + else: + loss = lm_loss + averaged_losses = average_losses_across_data_parallel_group( + [lm_loss]) + return loss, {'lm loss': averaged_losses[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator', log_level=2).start() + tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = get_batch( + data_iterator) + timers('batch-generator').stop() + + if not args.bert_binary_head: + types = None + + # Forward pass through the model. + output_tensor = model(tokens, padding_mask, + tokentype_ids=types, lm_labels=lm_labels) + + return output_tensor, partial(loss_func, loss_mask, sentence_order) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + tokenizer = get_tokenizer() + + config = BERTMaskedWordPieceDatasetConfig( + random_seed=args.seed, + sequence_length=args.seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + split=args.split, + path_to_cache=args.data_cache_path, + tokenizer=tokenizer, + masking_probability=args.mask_prob, + short_sequence_probability=args.short_seq_prob, + masking_max_ngram=3, + masking_do_full_word=True, + masking_do_permutation=False, + masking_use_longer_ngrams=False, + masking_use_geometric_distribution=False, + classification_head=args.bert_binary_head, + ) + + print_rank_0('> building train, validation, and test datasets ' + 'for BERT ...') + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + BERTMaskedWordPieceDataset, + train_val_test_num_samples, + lambda: mpu.get_tensor_model_parallel_rank() == 0, + config, + ).build() + + print_rank_0("> finished creating BERT datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + # Temporary for transition to core datasets + train_valid_test_datasets_provider.is_distributed = True + + pretrain(train_valid_test_datasets_provider, model_provider, + ModelType.encoder_or_decoder, + forward_step, args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) diff --git a/pretrain_gpt.py b/pretrain_gpt.py new file mode 100644 index 0000000..949f157 --- /dev/null +++ b/pretrain_gpt.py @@ -0,0 +1,251 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +"""Pretrain GPT.""" + +import os +import torch +from functools import partial + +from typing import Union +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.core import mpu +from megatron.core.enums import ModelType +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core.datasets.gpt_dataset import GPTDatasetConfig +from megatron.core.datasets.gpt_dataset import MockGPTDataset, GPTDataset +import megatron.legacy.model +from megatron.core.models.gpt import GPTModel +from megatron.training import pretrain +from megatron.core.utils import StragglerDetector +from megatron.core.transformer.spec_utils import import_module +from megatron.training.utils import ( + get_batch_on_this_cp_rank, + get_batch_on_this_tp_rank, +) +from megatron.training.arguments import core_transformer_config_from_args +from megatron.training.yaml_arguments import core_transformer_config_from_yaml +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) + + +stimer = StragglerDetector() + +def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: + """Builds the model. + + If you set the use_legacy_models to True, it will return the legacy GPT model and if not the mcore GPT model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + + + Returns: + Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model + """ + args = get_args() + use_te = args.transformer_impl == "transformer_engine" + + print_rank_0('building GPT model ...') + # Experimental loading arguments from yaml + if args.yaml_cfg is not None: + config = core_transformer_config_from_yaml(args, "language_model") + else: + config = core_transformer_config_from_args(args) + + if args.use_legacy_models: + model = megatron.legacy.model.GPTModel( + config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + ) + else: # using core models + if args.spec is not None: + transformer_layer_spec = import_module(args.spec) + else: + if use_te: + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm) + else: + transformer_layer_spec = get_gpt_layer_local_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm) + + model = GPTModel( + config=config, + transformer_layer_spec=transformer_layer_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + rotary_base=args.rotary_base + ) + + return model + + +def get_batch(data_iterator): + """Generate a batch.""" + + # TODO: this is pretty hacky, find a better way + if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): + return None, None, None, None, None + + # get batches based on the TP rank you are on + batch = get_batch_on_this_tp_rank(data_iterator) + + # slice batch along sequence dimension for context parallelism + batch = get_batch_on_this_cp_rank(batch) + + return batch.values() + + +def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor): + """Loss function. + + Args: + loss_mask (torch.Tensor): Used to mask out some portions of the loss + output_tensor (torch.Tensor): The tensor with the losses + + Returns: + the loss scalar for this micro-batch + the number of non-padded tokens in this microbatch + a dict containing reporting metrics on the loss and number of tokens across + the data parallel ranks + """ + args = get_args() + + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + total_tokens = loss_mask.sum() + loss = torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), total_tokens.view(1)]) + + if args.context_parallel_size > 1: + torch.distributed.all_reduce(loss, group=mpu.get_context_parallel_group()) + + # Check individual rank losses are not NaN prior to DP all-reduce. + if args.check_for_nan_in_loss_and_grad: + global_rank = torch.distributed.get_rank() + assert not loss[0].isnan(), ( + f'Rank {global_rank}: found NaN in local forward loss calculation. ' + f'Device: {torch.cuda.current_device()}, node: {os.uname()[1]}' + ) + + # Reduce loss for logging. + reporting_loss = loss.clone().detach() + torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group()) + + local_num_tokens = loss[1].clone().detach().to(torch.int) + return ( + loss[0] * args.context_parallel_size, + local_num_tokens, + {'lm loss': (reporting_loss[0], reporting_loss[1])}, + ) + + +def forward_step(data_iterator, model: GPTModel): + """Forward training step. + + Args: + data_iterator : Input data iterator + model (GPTModel): The GPT Model + """ + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator', log_level=2).start() + global stimer + with stimer(bdata=True): + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + with stimer: + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def is_dataset_built_on_rank(): + return ( + mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage() + ) and mpu.get_tensor_model_parallel_rank() == 0 + + +def core_gpt_dataset_config_from_args(args): + tokenizer = get_tokenizer() + + return GPTDatasetConfig( + random_seed=args.seed, + sequence_length=args.seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + split=args.split, + num_dataset_builder_threads=args.num_dataset_builder_threads, + path_to_cache=args.data_cache_path, + mmap_bin_files=args.mmap_bin_files, + tokenizer=tokenizer, + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + create_attention_mask=args.create_attention_mask_in_dataloader, + s3_cache_path = args.s3_cache_path + ) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build the train test and validation datasets. + + Args: + train_val_test_num_samples : A list containing the number of samples in train test and validation. + """ + args = get_args() + + config = core_gpt_dataset_config_from_args(args) + + if args.mock_data: + dataset_type = MockGPTDataset + else: + dataset_type = GPTDataset + + print_rank_0("> building train, validation, and test datasets for GPT ...") + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + dataset_type, + train_val_test_num_samples, + is_dataset_built_on_rank, + config + ).build() + + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + # Temporary for transition to core datasets + train_valid_test_datasets_provider.is_distributed = True + + pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + ) diff --git a/pretrain_ict.py b/pretrain_ict.py new file mode 100644 index 0000000..205588b --- /dev/null +++ b/pretrain_ict.py @@ -0,0 +1,166 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain BERT for Inverse Cloze Task""" + +from functools import partial +import math + +import torch +import torch.distributed as dist +import torch.nn.functional as F + +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.core import mpu +from megatron.core.enums import ModelType +from megatron.legacy.data.biencoder_dataset_utils import get_ict_batch +from megatron.legacy.data.dataset_utils import build_train_valid_test_datasets +from megatron.legacy.model.biencoder_model import biencoder_model_provider +from megatron.training import pretrain +from megatron.training.utils import average_losses_across_data_parallel_group + + +def pretrain_ict_model_provider(pre_process=True, post_process=True): + args = get_args() + + model = biencoder_model_provider( + only_context_model=False, + only_query_model=False, + biencoder_shared_query_context_model=\ + args.biencoder_shared_query_context_model, + pre_process=pre_process, post_process=post_process) + + return model + +def get_group_world_size_rank(): + + group = mpu.get_data_parallel_group() + rank = torch.distributed.get_rank(group=group) + world_size = torch.distributed.get_world_size(group=group) + + return group, rank, world_size + + +class AllgatherFromDataParallelRegion(torch.autograd.Function): + + @staticmethod + def forward(ctx, input_): + assert input_.dim() == 2 + group, rank, world_size = get_group_world_size_rank() + + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + torch.distributed.all_gather(tensor_list, input_, group=group) + + output = torch.cat(tensor_list, dim=0).contiguous() + + return output + + + @staticmethod + def backward(ctx, grad_output): + group, rank, world_size = get_group_world_size_rank() + + assert grad_output.shape[0] % world_size == 0 + dim_size = grad_output.shape[0] // world_size + output_list = torch.split(grad_output, dim_size, dim=0) + + # get chunk from this rank + output = output_list[rank].contiguous() + return output + +def loss_func(output_tensor): + args = get_args() + query_logits, context_logits = output_tensor + + micro_batch_size = query_logits.shape[0] + # recall we assert that tensor_model_parallel_size == 1 + assert mpu.get_tensor_model_parallel_world_size() == 1, \ + "Model parallel size > 1 not supported for ICT" + + global_batch_size = dist.get_world_size() * micro_batch_size + all_query_logits = AllgatherFromDataParallelRegion.apply(query_logits) + all_context_logits = AllgatherFromDataParallelRegion.apply(context_logits) + + # scores are inner products between query and context embeddings + retrieval_scores = torch.matmul(all_query_logits, + torch.transpose(all_context_logits, 0, 1)) + # scaling the retriever scores + if args.retriever_score_scaling: + retrieval_scores = retrieval_scores / math.sqrt(args.hidden_size) + + softmax_scores = F.log_softmax(retrieval_scores, dim=1) + sorted_vals, sorted_indices = torch.topk(softmax_scores, + k=softmax_scores.shape[1], sorted=True) + + def topk_accuracy(k): + return torch.cuda.FloatTensor([sum([int(i in sorted_indices[i, :k]) \ + for i in range(global_batch_size)]) / global_batch_size]) + + topk_accs = [topk_accuracy(int(k)) for k in args.retriever_report_topk_accuracies] + + labels = torch.arange(global_batch_size).long().cuda() + loss = F.nll_loss(softmax_scores, labels, reduction='mean') + reduced_losses = average_losses_across_data_parallel_group([loss, *topk_accs]) + + # Scale the retrieval loss + loss = loss * mpu.get_data_parallel_world_size() + + # create stats_dict with retrieval loss and all specified top-k accuracies + topk_acc_dict = {'top{}_acc'.format(k): v * 100 for k, v in \ + zip(args.retriever_report_topk_accuracies, reduced_losses[1:])} + stats_dict = dict(loss=reduced_losses[0], **topk_acc_dict) + return loss, stats_dict + + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator', log_level=2).start() + query_tokens, query_mask, \ + context_tokens, context_mask, context_indices = get_ict_batch(data_iterator) + timers('batch-generator').stop() + + # Query and Context Types + query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0) + context_types = torch.cuda.LongTensor(*context_tokens.shape).fill_(0) + + # Forward model. + output_tensor = model(query_tokens, query_mask, query_types, context_tokens, + context_mask, context_types) + + return output_tensor, partial(loss_func) + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid and test datasets.""" + args = get_args() + print_rank_0('> building train, validation, and test datasets ' + 'for BERT ICT...') + + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + max_seq_length=args.seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + seed=args.seed, + binary_head=False, + dataset_type='ict') + print_rank_0("> finished creating BERT ICT datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + print_rank_0("WARNING : This script is DEPRECATED. Will be removed in mcore release 0.9") + pretrain(train_valid_test_datasets_provider, + pretrain_ict_model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) diff --git a/pretrain_mamba.py b/pretrain_mamba.py new file mode 100644 index 0000000..f2dbb97 --- /dev/null +++ b/pretrain_mamba.py @@ -0,0 +1,239 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Pretrain Mamba.""" + +import os +import torch +from functools import partial + +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.core import mpu +# from megatron.core import parallel_state +from megatron.core.enums import ModelType +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core.datasets.gpt_dataset import GPTDatasetConfig +from megatron.core.datasets.gpt_dataset import MockGPTDataset, GPTDataset +from megatron.core.models.mamba import MambaModel +from megatron.training import pretrain +from megatron.core.utils import StragglerDetector +from megatron.core.transformer.spec_utils import import_module +from megatron.training.utils import ( + get_batch_on_this_cp_rank, + get_batch_on_this_tp_rank, +) +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + + +stimer = StragglerDetector() + +def count_parameters_in_layer(model, layer_name): + num_params = 0 + for name, param in model.named_parameters(): + if layer_name in name: + num_params += param.numel() + print_rank_0(f" - {name}: {param.numel()}") + return num_params + + +def model_provider(pre_process=True, post_process=True) -> MambaModel: + """Builds the model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + + + Returns: + MambaModel: The returned model + """ + args = get_args() + + print_rank_0('building Mamba model ...') + config = core_transformer_config_from_args(get_args()) + + assert args.use_legacy_models == False, "Mamba only supported in Mcore!" + + if args.spec is not None: + mamba_stack_spec = import_module(args.spec) + else: + raise("You must provide a valid Mamba layer spec!") + + model = MambaModel( + config=config, + mamba_stack_spec=mamba_stack_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + hybrid_attention_ratio=args.hybrid_attention_ratio, + hybrid_mlp_ratio=args.hybrid_mlp_ratio, + hybrid_override_pattern=args.hybrid_override_pattern, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type + ) + + for l in range(model.decoder.num_layers_per_pipeline_rank): + layer_params = count_parameters_in_layer(model, f'decoder.layers.{l}.') + print_rank_0(f" == params layer {l}: {layer_params}") + + return model + + +def get_batch(data_iterator): + """Generate a batch.""" + + # TODO: this is pretty hacky, find a better way + if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): + return None, None, None, None, None + + # get batches based on the TP rank you are on + batch = get_batch_on_this_tp_rank(data_iterator) + + # slice batch along sequence dimension for context parallelism + batch = get_batch_on_this_cp_rank(batch) + + return batch.values() + +def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor): + """Loss function. + + Args: + loss_mask (torch.Tensor): Used to mask out some portions of the loss + output_tensor (torch.Tensor): The tensor with the losses + + Returns: + the loss scalar for this micro-batch + the number of non-padded tokens in this microbatch + a dict containing reporting metrics on the loss and number of tokens across + the data parallel ranks + """ + args = get_args() + + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + total_tokens = loss_mask.sum() + loss = torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), total_tokens.view(1)]) + + if args.context_parallel_size > 1: + torch.distributed.all_reduce(loss, group=mpu.get_context_parallel_group()) + + # Check individual rank losses are not NaN prior to DP all-reduce. + if args.check_for_nan_in_loss_and_grad: + global_rank = torch.distributed.get_rank() + assert not loss[0].isnan(), ( + f'Rank {global_rank}: found NaN in local forward loss calculation. ' + f'Device: {torch.cuda.current_device()}, node: {os.uname()[1]}' + ) + + # Reduce loss for logging. + reporting_loss = loss.clone().detach() + torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group()) + + local_num_tokens = loss[1].clone().detach().to(torch.int) + return ( + loss[0] * args.context_parallel_size, + local_num_tokens, + {'lm loss': (reporting_loss[0], reporting_loss[1])}, + ) + + +def forward_step(data_iterator, model: MambaModel): + """Forward training step. + + Args: + data_iterator : Input data iterator + model (MambaModel): The GPT Model + """ + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator', log_level=2).start() + global stimer + with stimer(bdata=True): + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + with stimer: + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def is_dataset_built_on_rank(): + return ( + mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage() + ) and mpu.get_tensor_model_parallel_rank() == 0 + + +def core_gpt_dataset_config_from_args(args): + tokenizer = get_tokenizer() + + return GPTDatasetConfig( + random_seed=args.seed, + sequence_length=args.seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + split=args.split, + num_dataset_builder_threads=args.num_dataset_builder_threads, + path_to_cache=args.data_cache_path, + mmap_bin_files=args.mmap_bin_files, + tokenizer=tokenizer, + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + create_attention_mask=args.create_attention_mask_in_dataloader, + ) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build the train test and validation datasets. + + Args: + train_val_test_num_samples : A list containing the number of samples in train test and validation. + """ + args = get_args() + + config = core_gpt_dataset_config_from_args(args) + + if args.mock_data: + dataset_type = MockGPTDataset + else: + dataset_type = GPTDataset + + print_rank_0("> building train, validation, and test datasets for GPT ...") + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + dataset_type, + train_val_test_num_samples, + is_dataset_built_on_rank, + config + ).build() + + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + # Temporary for transition to core datasets + train_valid_test_datasets_provider.is_distributed = True + + pretrain(train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) diff --git a/pretrain_retro.py b/pretrain_retro.py new file mode 100644 index 0000000..a0d8f9d --- /dev/null +++ b/pretrain_retro.py @@ -0,0 +1,244 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain Retro.""" + +from functools import partial +import torch + +from megatron.training import get_args +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.training import print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core import tensor_parallel +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core.datasets.retro.query.retro_dataset import get_retro_datasets +from megatron.core.datasets.retro.query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig +from megatron.core.enums import ModelType +from megatron.core.models.retro import get_retro_decoder_block_spec, RetroConfig, RetroModel +from megatron.core.models.retro.utils import get_all_true_mask +from megatron.training import pretrain +from megatron.training.utils import get_ltor_masks_and_position_ids +from pretrain_gpt import ( + is_dataset_built_on_rank, + loss_func, + model_provider as default_model_provider, + train_valid_test_datasets_provider as gpt_train_valid_test_datasets_provider, +) + + +def get_retro_config(): + return core_transformer_config_from_args(get_args(), RetroConfig) + + +def core_model_provider(pre_process=True, post_process=True): + """Build the model using Megatron-Core.""" + + args = get_args() + config = get_retro_config() + + # NOTE: Experimental customization feature + if args.spec is not None: + block_spec = import_module(args.spec)() + else: + block_spec = get_retro_decoder_block_spec(config, use_transformer_engine=True) + + print_rank_0('building GPT model ...') + model = RetroModel( + config=config, + transformer_layer_spec=block_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent + ) + return model + + +def model_provider(pre_process=True, post_process=True): + """Build the model. + + Select between two different model classes: + 1. Default model (uses megatron.legacy.models/gpt_model.py). + 2. Core model (uses megatron/core/models/retro/model.py). + """ + + args = get_args() + if not args.use_legacy_models and args.retro_add_retriever: + provider = core_model_provider + else: + provider = default_model_provider + model = provider(pre_process=pre_process, post_process=post_process) + return model + + +def get_batch(data_iterator): + """Generate a batch""" + + args = get_args() + tokenizer = get_tokenizer() + config = get_retro_config() + + # Items and their type. + keys = ['text'] + if args.retro_add_retriever: + keys.append('neighbor_tokens') + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + + if args.retro_add_retriever: + # note: [bs * l * k, r] + # note: 2x == neighbor, continuation + neighbor_tokens = data_b['neighbor_tokens'] \ + .view(-1, config.retro_retrieved_length).long() + _, _, neighbor_position_ids = get_ltor_masks_and_position_ids( + neighbor_tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + neighbor_attention_mask = get_all_true_mask( + (1, 1, config.retro_retrieved_length, config.retro_retrieved_length), + neighbor_tokens.device) + return tokens, labels, loss_mask, attention_mask, position_ids, \ + neighbor_tokens, neighbor_attention_mask, neighbor_position_ids + + else: + return tokens, labels, loss_mask, attention_mask, position_ids + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator').start() + if args.retro_add_retriever: + tokens, labels, loss_mask, attention_mask, position_ids, \ + neighbor_tokens, neighbor_attention_mask, neighbor_position_ids = \ + get_batch(data_iterator) + else: + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + neighbor_tokens, neighbor_attention_mask, neighbor_position_ids = \ + None, None, None + timers('batch-generator').stop() + + # Model call. + if args.use_legacy_models: + forward_kwargs = { + "retriever_input_ids" : neighbor_tokens, + "retriever_position_ids" : neighbor_position_ids, + "retriever_attn_mask" : neighbor_attention_mask, + } + else: + if args.retro_add_retriever: + forward_kwargs = { + "context_input_ids" : neighbor_tokens, + "context_position_ids" : neighbor_position_ids, + "context_mask" : neighbor_attention_mask, + } + else: + forward_kwargs = {} + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels, **forward_kwargs) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_valid_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + # Dataset config. + retro_config = get_retro_config() + data_config = MultiSplitGPTDatasetConfig( + random_seed=args.seed, + sequence_length=args.seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + split=args.split, + split_preprocessing=retro_config.retro_split_preprocessing, + path_to_cache=args.data_cache_path, + return_document_ids=False, + tokenizer=get_tokenizer(), + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + ) + + # GPT datasets. + print_rank_0(" > multi-split gpt datasets.") + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + MultiSplitGPTDataset, + train_valid_test_num_samples, + is_dataset_built_on_rank, + data_config, + ).build() + + gpt_datasets = { + "train" : (train_ds, train_valid_test_num_samples[0]), + "valid" : (valid_ds, train_valid_test_num_samples[1]), + "test" : (test_ds, train_valid_test_num_samples[2]), + } + + # Retro datasets. + if args.retro_add_retriever: + return get_retro_datasets( + config=retro_config, + gpt_datasets=gpt_datasets, + sample_length=args.seq_length, + eod_token_id=get_tokenizer().eod, + ) + + # Multi-split GPT datasets. + else: + return ( + gpt_datasets["train"][0], + gpt_datasets["valid"][0], + gpt_datasets["test"][0], + ) + + +if __name__ == "__main__": + + # Temporary for transition to core datasets. + train_valid_test_datasets_provider.is_distributed = True + + pretrain(train_valid_test_datasets_provider, + model_provider, + ModelType.retro_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) diff --git a/pretrain_t5.py b/pretrain_t5.py new file mode 100644 index 0000000..e9702c3 --- /dev/null +++ b/pretrain_t5.py @@ -0,0 +1,263 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain T5""" + +from functools import partial +from typing import Union + +import torch + +from megatron.training import ( + get_args, + get_timers, + get_tokenizer, + print_rank_0 +) +from megatron.core import mpu, tensor_parallel +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.t5_dataset import ( + T5MaskedWordPieceDataset, + T5MaskedWordPieceDatasetConfig, +) +from megatron.core.enums import ModelType +from megatron.core.models.T5 import T5Model +from megatron.training import pretrain +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.t5_dataset import T5MaskedWordPieceDataset, T5MaskedWordPieceDatasetConfig +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core.models.T5.t5_spec import (get_t5_encoder_with_transformer_engine_block_spec, + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_local_block_spec, + get_t5_decoder_with_local_block_spec) +from megatron.legacy.model import T5Model as LegacyT5Model + +""" +Pipeline parallelism for T5 +(Caveat: currently, mcore T5 model has not supported pipeline-parallelism) +=========================== + +T5 is a model architecture with both encoder and decoder blocks. +Consequently, pipeline parallelism is implemented slightly differently +compared to architectures like GPT and BERT. + +In particular, when pipeline_model_parallel_world_size > 1, each stage +either executes an encoder block or a decoder block. The +--pipeline-model-parallel-split-rank argument controls the rank at which +the split happens: all ranks lower than this argument execute the +encoder block, and all ranks equal to or higher than this argument value +execute the decoder block. + +In the encoder section of the model, only one tensor is sent downstream: +the intermediate encoder_hidden_state. In the decoder section of the +model, two tensors are sent downstream in the forward pass: the fully +computed encoder_hidden_state, and the intermediate decoder_hidden_state. + +In particular, these are the shapes of the tensors sent between +different workers: + If rank is in decoder section: + intermediate decoder_hidden_state (pre-transpose), + complete encoder_hidden_state (post-transpose). + If rank is at boundary between encoder and decoder sections: + complete encoder_hidden_state (post-transpose). + If rank is in encoder section: + intermediate encoder_hidden_state (pre-transpose). + +Additionally, we have code in the backward_step function in schedules.py +to accumulate the encoder_hidden_state gradient across skip connections +(encoder_hidden_state fed in as input to each layer in the decoder). +""" + + +def model_provider( + pre_process=True, post_process=True, add_encoder=True, add_decoder=True +) -> Union[LegacyT5Model, T5Model]: + """Builds the model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + add_encoder (bool, optional): Defaults to True + add_decoder (bool, optional): Defaults to True + Returns: + T5Model: The returned T5 model + """ + + args = get_args() + config = core_transformer_config_from_args(args) + if args.use_legacy_models: + model = LegacyT5Model( + config=config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + add_encoder=add_encoder, + add_decoder=add_decoder, + ) + else: + if args.transformer_impl == "local": + en_block_spec = get_t5_encoder_with_local_block_spec(args.encoder_num_layers) + de_block_spec = get_t5_decoder_with_local_block_spec(args.decoder_num_layers) + elif args.transformer_impl == "transformer_engine": + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec( + args.encoder_num_layers + ) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec( + args.decoder_num_layers + ) + print_rank_0('building T5 model ...') + model = T5Model( + config=config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + ) + + return model + + +def get_batch(data_iterator): + """Build the batch.""" + + keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', 'enc_mask', 'dec_mask', 'enc_dec_mask'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_enc = data_b['text_enc'].long() + tokens_dec = data_b['text_dec'].long() + labels = data_b['labels'].long() + loss_mask = data_b['loss_mask'].float() + + enc_mask = data_b['enc_mask'] < 0.5 + dec_mask = data_b['dec_mask'] < 0.5 + enc_dec_mask = data_b['enc_dec_mask'] < 0.5 + + return tokens_enc, tokens_dec, loss_mask, labels, enc_mask, dec_mask, enc_dec_mask + + +def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor): + """Loss function. + + Args: + loss_mask (torch.Tensor): Used to mask out some portions of the loss + output_tensor (torch.Tensor): The tensor with the losses + + Returns: + the loss scalar for this micro-batch + the number of non-padded tokens in this microbatch + a dict containing reporting metrics on the loss and number of tokens across + the data parallel ranks + """ + lm_loss_ = output_tensor.float() + total_tokens = loss_mask.sum() + + lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) + lm_loss = torch.cat([lm_loss.view(1), total_tokens.view(1)]) + + reporting_loss = lm_loss.clone().detach() + torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group()) + + num_tokens = lm_loss[1].clone().detach().to(torch.int) + return lm_loss[0], num_tokens, {'lm loss': (reporting_loss[0], reporting_loss[1])} + + +def forward_step(data_iterator, model: T5Model): + """Forward training step. + + Args: + data_iterator : Input data iterator + model (T5Model): The T5 Model + """ + + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch generator', log_level=2).start() + tokens_enc, tokens_dec, loss_mask, lm_labels, enc_mask, dec_mask, enc_dec_mask = get_batch( + data_iterator + ) + timers('batch generator').stop() + + # Forward model lm_labels + output_tensor = model( + tokens_enc, tokens_dec, enc_mask, dec_mask, enc_dec_mask, lm_labels=lm_labels + ) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples: int): + """Build the train test and validation datasets. + + Args: + train_val_test_num_samples : A list containing the number of samples in train test and validation. + """ + args = get_args() + + tokenizer = get_tokenizer() + + config = T5MaskedWordPieceDatasetConfig( + random_seed=args.seed, + sequence_length=args.encoder_seq_length, + sequence_length_decoder=args.decoder_seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + split=args.split, + path_to_cache=args.data_cache_path, + tokenizer=tokenizer, + masking_probability=args.mask_prob, + short_sequence_probability=args.short_seq_prob, + masking_max_ngram=10, + masking_do_full_word=True, + masking_do_permutation=False, + masking_use_longer_ngrams=False, + masking_use_geometric_distribution=True, + ) + + print_rank_0('> building train, validation, and test datasets for T5 ...') + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + T5MaskedWordPieceDataset, + train_val_test_num_samples, + lambda: mpu.get_tensor_model_parallel_rank() == 0, + config, + ).build() + + print_rank_0("> finished creating T5 datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + # Temporary for transition to core datasets + train_valid_test_datasets_provider.is_distributed = True + + pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_and_decoder, + forward_step, + args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}, + ) diff --git a/pretrain_vision_classify.py b/pretrain_vision_classify.py new file mode 100644 index 0000000..8d9b28b --- /dev/null +++ b/pretrain_vision_classify.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain VIT""" + +import torch +import torch.nn.functional as F +from functools import partial +from megatron.training import get_args, get_timers, print_rank_0 +from megatron.core.enums import ModelType +from megatron.legacy.data.vit_dataset import build_train_valid_datasets +from megatron.legacy.model.vision.classification import VitClassificationModel +from megatron.legacy.model.vision.classification import MitClassificationModel +from megatron.training import pretrain +from megatron.training.utils import average_losses_across_data_parallel_group +from megatron.training.arguments import core_transformer_config_from_args + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + args = get_args() + config = core_transformer_config_from_args(args) + if args.vision_backbone_type == 'vit': + print_rank_0("building VIT model ...") + model = VitClassificationModel(config=config, + num_classes=args.num_classes, + pre_process=pre_process, + post_process=post_process) + elif args.vision_backbone_type == 'mit': + print_rank_0("building MIT model ...") + model = MitClassificationModel(num_classes=args.num_classes, + pre_process=pre_process, + post_process=post_process) + else: + raise Exception('{} vision backbone is not supported.'.format( + args.vision_backbone_type)) + return model + + +def get_batch(data_iterator): + """Build the batch.""" + data = next(data_iterator) + + # only data parallelism; no need for broadcast + images = data[0].cuda() + labels = data[1].cuda() + + return images, labels + + +def loss_func(labels, output_tensor): + logits = output_tensor.contiguous().float() + loss = F.cross_entropy(logits, labels) + + outputs = torch.argmax(logits, -1) + correct = (outputs == labels).float() + accuracy = torch.mean(correct) + + averaged_loss = average_losses_across_data_parallel_group([loss, accuracy]) + + return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]} + + +def forward_step(data_iterator, model): + """Forward step.""" + timers = get_timers() + + # Get the batch. + timers("batch-generator", log_level=2).start() + ( + images, + labels, + ) = get_batch(data_iterator) + timers("batch-generator").stop() + + # Forward model. lm_labels + output_tensor = model(images) + + return output_tensor, partial(loss_func, labels) + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0( + "> building train, validation, and test datasets " "for VIT ..." + ) + train_ds, valid_ds = build_train_valid_datasets( + data_path=args.data_path, + image_size=(args.img_h, args.img_w) + ) + print_rank_0("> finished creating VIT datasets ...") + + return train_ds, valid_ds, None + + +if __name__ == "__main__": + + pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True} + ) diff --git a/pretrain_vision_dino.py b/pretrain_vision_dino.py new file mode 100644 index 0000000..f75280c --- /dev/null +++ b/pretrain_vision_dino.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch +import torch.nn.functional as F +import torch.nn as nn +import numpy as np +import torch.distributed as dist +from functools import partial +from megatron.training import get_args, get_timers, print_rank_0 +from megatron.core.enums import ModelType +from megatron.legacy.data.vit_dataset import build_train_valid_datasets +from megatron.legacy.model.vision.dino import DINOPretrainModel +from megatron.legacy.model.vision.knn_monitor import knn_predict, get_feature_bank +from megatron.training import pretrain +from megatron.training.utils import average_losses_across_data_parallel_group, unwrap_model +from megatron.training.arguments import core_transformer_config_from_args + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + config = core_transformer_config_from_args(get_args()) + return DINOPretrainModel(config, pre_process=pre_process, post_process=post_process) + +def get_batch(data_iterator): + """Build the batch.""" + data = next(data_iterator) + + # only data parallelism; no need for broadcast + if isinstance(data[0], list): + images = [aug.cuda() for aug in data[0]] + else: + images = data[0].cuda() + labels = data[1].cuda() + + return images, labels + + +def loss_func(model, labels, output_tensor, collect_data=False): + args = get_args() + + model = unwrap_model(model) + if model.training: + student_output, teacher_output = output_tensor + loss = model.dino_loss(student_output, teacher_output, args.curr_iteration) + averaged_loss = average_losses_across_data_parallel_group([loss]) + return loss, {"loss": averaged_loss[0]} + else: + _, teacher_feature = output_tensor + feature_bank, feature_labels, classes = get_feature_bank() + feature = F.normalize(teacher_feature.float(), dim=1) + + knn_accs = [] + for k in [10, 20, 100, 200]: + pred_labels = knn_predict(feature, feature_bank, + feature_labels, classes, k, 0.07) + knn_acc = (pred_labels[:, 0] == labels).float().mean() + knn_accs.append(knn_acc) + + averaged_loss = average_losses_across_data_parallel_group(knn_accs) + return 0, {"knn_acc_10": averaged_loss[0], + "knn_acc_20": averaged_loss[1], + "knn_acc_100": averaged_loss[2], + "knn_acc_200": averaged_loss[3]} + + +def forward_step(data_iterator, model): + """Forward step.""" + timers = get_timers() + + # Get the batch. + timers("batch-generator", log_level=2).start() + ( + images, + labels, + ) = get_batch(data_iterator) + timers("batch-generator").stop() + + return model(images), partial(loss_func, model, labels) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0( + "> building train, validation, and test datasets " "for VIT ..." + ) + train_ds, valid_ds = build_train_valid_datasets( + data_path=args.data_path, + image_size=(args.img_h, args.img_w) + ) + print_rank_0("> finished creating VIT datasets ...") + + return train_ds, valid_ds, None + + +if __name__ == "__main__": + + pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True} + ) + diff --git a/pretrain_vision_inpaint.py b/pretrain_vision_inpaint.py new file mode 100644 index 0000000..8570baa --- /dev/null +++ b/pretrain_vision_inpaint.py @@ -0,0 +1,141 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain VIT""" + +import torch +import torch.nn.functional as F +from functools import partial +from megatron.training import get_args, get_timers, print_rank_0, print_rank_last +from megatron.core.enums import ModelType +from megatron.legacy.data.vit_dataset import build_train_valid_datasets +from megatron.legacy.model.vision.inpainting import VitInpaintingModel +from megatron.legacy.model.vision.inpainting import MitInpaintingModel +from megatron.training import pretrain +from megatron.training.utils import average_losses_across_data_parallel_group +from tasks.vision.segmentation.metrics import SSIM, PSNR +from megatron.training.arguments import core_transformer_config_from_args + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + args = get_args() + config = core_transformer_config_from_args(args) + if args.vision_backbone_type == 'vit': + model = VitInpaintingModel(config=config, + pre_process=pre_process, + post_process=post_process) + elif args.vision_backbone_type == 'mit': + model = MitInpaintingModel(config=config, + pre_process=pre_process, + post_process=post_process) + else: + raise Exception('{} vision backbone is not supported.'.format( + args.vision_backbone_type)) + return model + + +def get_batch(data_iterator): + """Build the batch.""" + data = next(data_iterator) + + # only data parallelism; no need for broadcast + images = data[0][0].cuda() + masks = data[0][1].cuda() + return images, masks + + +def loss_func(images, masks, masked_images, outputs, non_loss_data=False): + outputs = outputs.contiguous().float() + masks_flip = 1-masks + flip_masked_outputs = outputs.masked_fill(masks_flip.bool(), 0) + flip_masked_images = images.masked_fill(masks_flip.bool(), 0) + + ssim_fun = SSIM() + psnr_fun = PSNR() + + if not non_loss_data: + mask_count = torch.count_nonzero(masks) + loss = F.mse_loss( + flip_masked_outputs, + flip_masked_images.float(), + reduction="sum" + ) + loss = loss/mask_count + ssim = ssim_fun(flip_masked_outputs, flip_masked_images.float()) + psnr = psnr_fun(flip_masked_outputs, flip_masked_images.float()) + + averaged_loss = average_losses_across_data_parallel_group( + [loss, psnr, ssim] + ) + + return loss, {"loss": averaged_loss[0], + "psnr": averaged_loss[1], + 'ssim': averaged_loss[2]} + else: + synth_images = masked_images.float() + flip_masked_outputs + ssim = ssim_fun(synth_images, images.float()) + psnr = psnr_fun(synth_images, images.float()) + return torch.cat((images, masked_images, synth_images), dim=2), ssim, psnr + + +def forward_step(data_iterator, model): + """Forward step.""" + timers = get_timers() + + # Get the batch. + timers("batch-generator", log_level=2).start() + ( + images, + masks, + ) = get_batch(data_iterator) + timers("batch-generator").stop() + + masked_images = images.masked_fill(masks.bool(), 0) + outputs = model(masked_images) + + # Forward mode + return outputs, partial(loss_func, images, masks, masked_images) + + +def process_non_loss_data(data, iteration, writer): + psnr_sum = 0 + ssim_sum = 0 + for (output_tb, ssim, psnr) in data: + output_tb[output_tb < 0] = 0 + output_tb[output_tb > 1] = 1 + writer.add_images("gt-input-output-vald", output_tb, + global_step=iteration, walltime=None, + dataformats='NCHW') + psnr_sum = psnr_sum + psnr.item() + ssim_sum = ssim_sum + ssim.item() + psnr = psnr_sum/len(data) + ssim = ssim_sum/len(data) + writer.add_scalar('PSNR generate value-validation', psnr, iteration) + writer.add_scalar('SSIM generate value-validation', ssim, iteration) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0( + "> building train, validation, and test datasets " "for VIT ..." + ) + train_ds, valid_ds = build_train_valid_datasets( + data_path=args.data_path, + image_size=(args.img_h, args.img_w) + ) + print_rank_0("> finished creating VIT datasets ...") + + return train_ds, valid_ds, None + + +if __name__ == "__main__": + + pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + process_non_loss_data, + args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True} + ) diff --git a/pretrain_vlm.py b/pretrain_vlm.py new file mode 100644 index 0000000..2bee069 --- /dev/null +++ b/pretrain_vlm.py @@ -0,0 +1,221 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Pretrain vision language model.""" +from copy import deepcopy +from functools import partial +from types import SimpleNamespace + +import torch + +from megatron.core import tensor_parallel +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.gpt_dataset import MockGPTLowLevelDataset +from megatron.core.datasets.multimodal_dataset import MockMultimodalDataset, MultimodalDatasetConfig +from megatron.core.enums import ModelType +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.models.multimodal.llava_model import LLaVAModel +from megatron.core.models.vision.vit_layer_specs import get_vit_layer_with_transformer_engine_spec +from megatron.core.transformer.spec_utils import import_module +from megatron.training import get_args, get_timers, get_tokenizer, pretrain, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args +from pretrain_gpt import is_dataset_built_on_rank, loss_func + + +def model_provider(pre_process=True, post_process=True, parallel_output=True) -> LLaVAModel: + """Builds the model. + + Note: currently, only LLaVA model is supported. Follow-up changes will make this configurable. + + Args: + pre_process (bool): Enable preprocessing in the model. NOTE: Not used at the moment. + post_process (bool): Enable postprocessing in the model. NOTE: Not used at the moment. + parallel_output (bool): Enable model parallel output. + + Returns: + model (megatron.core.models.multimodal.llava_model.LLaVAModel): A multimodal model + """ + args = get_args() + + print_rank_0('building a multimodal model ...') + language_transformer_config = core_transformer_config_from_args(get_args()) + + if args.spec is not None: + language_transformer_layer_spec = import_module(args.spec) + else: + language_transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + args.num_experts, args.moe_grouped_gemm + ) + + vision_transformer_layer_spec = get_vit_layer_with_transformer_engine_spec() + + # TODO: Make these configurable via input .yaml config. + vision_transformer_config = deepcopy(language_transformer_config) + + vision_projection_type = "mlp" + vision_projection_config = deepcopy(language_transformer_config) + vision_projection_modules = deepcopy(language_transformer_layer_spec.submodules.mlp.submodules) + + model = LLaVAModel( + language_transformer_config=language_transformer_config, + language_transformer_layer_spec=language_transformer_layer_spec, + language_vocab_size=args.padded_vocab_size, + language_max_sequence_length=args.max_position_embeddings, + vision_transformer_config=vision_transformer_config, + vision_transformer_layer_spec=vision_transformer_layer_spec, + drop_vision_class_token=args.drop_vision_class_token, + vision_projection_config=vision_projection_config, + vision_projection_layer_spec=vision_projection_modules, + vision_projection_type=vision_projection_type, + parallel_output=parallel_output, + language_position_embedding_type=args.position_embedding_type, + language_rotary_percent=args.rotary_percent, + ) + + return model + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build the train test and validation datasets. + + Args: + train_val_test_num_samples : A list containing the number of samples in train, validation, and test sets. + + Returns: + train_ds, val_ds, test_ds (megatron.core.datasets.multimodal_dataset.MockMultimodalDataset): Train, validation, and test datasets, respectively. + """ + args = get_args() + + config = MultimodalDatasetConfig( + random_seed=args.seed, + split=args.split, + sequence_length=args.seq_length, + tokenizer=get_tokenizer(), + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + image_h=args.img_h, + image_w=args.img_w, + preprocess_func=_preprocess_data_for_llava, + ) + + print_rank_0("> building train, validation, and test datasets for multimodal ...") + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + MockMultimodalDataset, train_val_test_num_samples, is_dataset_built_on_rank, config + ).build() + + print_rank_0("> finished creating multimodal datasets ...") + + return train_ds, valid_ds, test_ds + + +def _preprocess_data_for_llava(data): + """Preprocess data sample to the format expected by a LLaVA model. + + Note: This doesn't support all the different modes in the official LLaVA repo yet. + + Args: + data (dict): Data sample with keys like 'image', 'tokens', etc. + + Returns: + data (dict): Processed data sample suitable for the model. + """ + args = get_args() + + # TODO: Move these to multimodal spec (added in a separate code change). + class_token_len = 1 + add_class_token = True + + num_patches_per_dim_h = args.img_h // args.patch_dim + num_patches_per_dim_w = args.img_w // args.patch_dim + num_patches = num_patches_per_dim_h * num_patches_per_dim_w + num_image_tokens = num_patches + (class_token_len if add_class_token else 0) + + data["loss_mask"] = torch.cat( + [torch.zeros(num_image_tokens, dtype=torch.float32), data["loss_mask"]] + ) + data["labels"] = torch.cat([torch.zeros(num_image_tokens, dtype=torch.int64), data["labels"]]) + + full_seq_length = len(data["labels"]) + attention_mask = torch.tril(torch.ones((1, full_seq_length, full_seq_length))) + attention_mask = attention_mask < 0.5 + attention_mask[:, num_image_tokens:, num_image_tokens:] = data["attention_mask"] + data["attention_mask"] = attention_mask + + return data + + +def get_batch(data_iterator): + """Generate a batch. + + Args: + data_iterator: Iterable dataset. + + Returns: + sample: A data sample with images, tokens, etc. + """ + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + + data_i = tensor_parallel.broadcast_data(["tokens", "position_ids", "labels"], data, torch.int64) + data_f = tensor_parallel.broadcast_data(["image", "loss_mask"], data, torch.float32) + data_b = tensor_parallel.broadcast_data(["attention_mask"], data, torch.bool) + + tokens = data_i["tokens"].long() + position_ids = data_i["position_ids"].long() + labels = data_i["labels"].long() + images = data_f["image"].float() + loss_mask = data_f["loss_mask"].float() + attention_mask = data_b["attention_mask"].bool() + + return tokens, position_ids, labels, images, loss_mask, attention_mask + + +def forward_step(data_iterator, model: LLaVAModel): + """Forward training step. + + Args: + data_iterator: Iterable dataset. + model (megatron.core.models.multimodal.llava_model.LLaVAModel): Multimodal model + + Returns: + output_tensor (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape [b, s, vocab_size]. + loss_func (callable): Loss function with a loss mask specified. + """ + timers = get_timers() + + # Get the batch. + timers('batch-generator', log_level=2).start() + tokens, position_ids, labels, images, loss_mask, attention_mask = get_batch(data_iterator) + timers('batch-generator').stop() + + output_tensor = model(images, tokens, position_ids, attention_mask, labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def add_vlm_extra_args(parser): + """Extra arguments.""" + group = parser.add_argument_group(title='vision language model specific arguments') + group.add_argument( + "--drop-vision-class-token", + action="store_true", + default=False, + help="Drop vision class token before input to the language model.", + ) + return parser + + +if __name__ == "__main__": + train_valid_test_datasets_provider.is_distributed = True + + pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + extra_args_provider=add_vlm_extra_args, + ) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..934745e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,24 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +[build-system] +requires = [ + "setuptools", + "pybind11", +] + +[tool.isort] +profile = "black" # black-compatible +line_length = 100 # should match black parameters +py_version = 38 # python 3.8 as a target version +known_first_party = ["megatron"] # FIRSTPARTY section +known_third_party = ["transformer_engine"] # THIRDPARTY section +sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] +default_section = "THIRDPARTY" +extend_skip = ["setup.py"] + +[tool.black] +line_length = 100 +skip_string_normalization = true +# recongized by future versions, disallows to reformat code with incompatible versions +# Matches NeMO version so people working on both codebases don't need two different version of black installed +required_version = "24" diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..adb0062 --- /dev/null +++ b/setup.py @@ -0,0 +1,109 @@ +"""Setup for pip package.""" + +import importlib.util +import subprocess + +import setuptools +from setuptools import Extension + +spec = importlib.util.spec_from_file_location('package_info', 'megatron/core/package_info.py') +package_info = importlib.util.module_from_spec(spec) +spec.loader.exec_module(package_info) + + +__contact_emails__ = package_info.__contact_emails__ +__contact_names__ = package_info.__contact_names__ +__description__ = package_info.__description__ +__download_url__ = package_info.__download_url__ +__homepage__ = package_info.__homepage__ +__keywords__ = package_info.__keywords__ +__license__ = package_info.__license__ +__package_name__ = package_info.__package_name__ +__repository_url__ = package_info.__repository_url__ +__version__ = package_info.__version__ + + +with open("megatron/core/README.md", "r", encoding='utf-8') as fh: + long_description = fh.read() +long_description_content_type = "text/markdown" + +############################################################################### +# Extension Making # +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # + +extra_compile_args = ( + subprocess.check_output(["python3", "-m", "pybind11", "--includes"]) + .decode("utf-8") + .strip() + .split() +) + +############################################################################### + +setuptools.setup( + name=__package_name__, + # Versions should comply with PEP440. For a discussion on single-sourcing + # the version across setup.py and the project code, see + # https://packaging.python.org/en/latest/single_source_version.html + version=__version__, + description=__description__, + long_description=long_description, + long_description_content_type=long_description_content_type, + # The project's main homepage. + url=__repository_url__, + download_url=__download_url__, + # Author details + author=__contact_names__, + author_email=__contact_emails__, + # maintainer Details + maintainer=__contact_names__, + maintainer_email=__contact_emails__, + # The licence under which the project is released + license=__license__, + classifiers=[ + # How mature is this project? Common values are + # 1 - Planning + # 2 - Pre-Alpha + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + # 6 - Mature + # 7 - Inactive + 'Development Status :: 5 - Production/Stable', + # Indicate who your project is intended for + 'Intended Audience :: Developers', + 'Intended Audience :: Science/Research', + 'Intended Audience :: Information Technology', + # Indicate what your project relates to + 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Mathematics', + 'Topic :: Scientific/Engineering :: Image Recognition', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Software Development :: Libraries', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: Utilities', + # Pick your license as you wish (should match "license" above) + 'License :: OSI Approved :: BSD License', + # Supported python versions + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + # Additional Setting + 'Environment :: Console', + 'Natural Language :: English', + 'Operating System :: OS Independent', + ], + packages=setuptools.find_namespace_packages(include=["megatron.core", "megatron.core.*"]), + ext_modules=[ + Extension( + "megatron.core.datasets.helpers", + sources=["megatron/core/datasets/helpers.cpp"], + language="c++", + extra_compile_args=extra_compile_args, + ) + ], + # Add in any packaged data. + include_package_data=True, + # PyPI package information. + keywords=__keywords__, +) diff --git a/tasks/data_utils.py b/tasks/data_utils.py new file mode 100644 index 0000000..914acf1 --- /dev/null +++ b/tasks/data_utils.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +""" Tasks data utility.""" + +import re +import numpy as np + + +def clean_text(text): + """Remove new lines and multiple spaces and adjust end of sentence dot.""" + + text = text.replace("\n", " ") + text = re.sub(r'\s+', ' ', text) + for _ in range(3): + text = text.replace(' . ', '. ') + + return text + + +def build_sample(ids, types, paddings, label, unique_id): + """Convert to numpy and return a sample consumed by the batch producer.""" + + ids_np = np.array(ids, dtype=np.int64) + types_np = np.array(types, dtype=np.int64) + paddings_np = np.array(paddings, dtype=np.int64) + sample = ({'text': ids_np, + 'types': types_np, + 'padding_mask': paddings_np, + 'label': int(label), + 'uid': int(unique_id)}) + + return sample + + +def build_tokens_types_paddings_from_text(text_a, text_b, + tokenizer, max_seq_length): + """Build token types and paddings, trim if needed, and pad if needed.""" + + text_a_ids = tokenizer.tokenize(text_a) + text_b_ids = None + if text_b is not None: + text_b_ids = tokenizer.tokenize(text_b) + + return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, + max_seq_length, tokenizer.cls, + tokenizer.sep, tokenizer.pad) + + +def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length, + cls_id, sep_id, pad_id): + """Build token types and paddings, trim if needed, and pad if needed.""" + + ids = [] + types = [] + paddings = [] + + # [CLS]. + ids.append(cls_id) + types.append(0) + paddings.append(1) + + # A. + len_text_a = len(text_a_ids) + ids.extend(text_a_ids) + types.extend([0] * len_text_a) + paddings.extend([1] * len_text_a) + + # [SEP]. + ids.append(sep_id) + types.append(0) + paddings.append(1) + + # B. + if text_b_ids is not None: + len_text_b = len(text_b_ids) + ids.extend(text_b_ids) + types.extend([1] * len_text_b) + paddings.extend([1] * len_text_b) + + # Cap the size. + trimmed = False + if len(ids) >= max_seq_length: + max_seq_length_m1 = max_seq_length - 1 + ids = ids[0:max_seq_length_m1] + types = types[0:max_seq_length_m1] + paddings = paddings[0:max_seq_length_m1] + trimmed = True + + # [SEP]. + if (text_b_ids is not None) or trimmed: + ids.append(sep_id) + if text_b_ids is None: + types.append(0) + else: + types.append(1) + paddings.append(1) + + # Padding. + padding_length = max_seq_length - len(ids) + if padding_length > 0: + ids.extend([pad_id] * padding_length) + types.extend([pad_id] * padding_length) + paddings.extend([0] * padding_length) + + return ids, types, paddings diff --git a/tasks/ensemble_classifier.py b/tasks/ensemble_classifier.py new file mode 100644 index 0000000..c2333b7 --- /dev/null +++ b/tasks/ensemble_classifier.py @@ -0,0 +1,149 @@ +import os +import argparse +import collections + +import numpy as np +import torch + + +def process_files(args): + all_predictions = collections.OrderedDict() + all_labels = collections.OrderedDict() + all_uid = collections.OrderedDict() + for path in args.paths: + path = os.path.join(path, args.prediction_name) + try: + data = torch.load(path) + for dataset in data: + name, d = dataset + predictions, labels, uid = d + if name not in all_predictions: + all_predictions[name] = np.array(predictions) + if args.labels is None: + args.labels = [i for i in range(all_predictions[name].shape[1])] + if args.eval: + all_labels[name] = np.array(labels) + all_uid[name] = np.array(uid) + else: + all_predictions[name] += np.array(predictions) + assert np.allclose(all_uid[name], np.array(uid)) + except Exception as e: + print(e) + continue + return all_predictions, all_labels, all_uid + + +def get_threshold(all_predictions, all_labels, one_threshold=False): + if one_threshold: + all_predictons = {'combined': np.concatenate(list(all_predictions.values()))} + all_labels = {'combined': np.concatenate(list(all_predictions.labels()))} + out_thresh = [] + for dataset in all_predictions: + preds = all_predictions[dataset] + labels = all_labels[dataset] + out_thresh.append(calc_threshold(preds, labels)) + return out_thresh + + +def calc_threshold(p, l): + trials = [(i) * (1. / 100.) for i in range(100)] + best_acc = float('-inf') + best_thresh = 0 + for t in trials: + acc = ((apply_threshold(p, t).argmax(-1) == l).astype(float)).mean() + if acc > best_acc: + best_acc = acc + best_thresh = t + return best_thresh + + +def apply_threshold(preds, t): + assert (np.allclose(preds.sum(-1), np.ones(preds.shape[0]))) + prob = preds[:, -1] + thresholded = (prob >= t).astype(int) + preds = np.zeros_like(preds) + preds[np.arange(len(thresholded)), thresholded.reshape(-1)] = 1 + return preds + + +def threshold_predictions(all_predictions, threshold): + if len(threshold) != len(all_predictions): + threshold = [threshold[-1]] * (len(all_predictions) - len(threshold)) + for i, dataset in enumerate(all_predictions): + thresh = threshold[i] + preds = all_predictions[dataset] + all_predictions[dataset] = apply_threshold(preds, thresh) + return all_predictions + + +def postprocess_predictions(all_predictions, all_labels, args): + for d in all_predictions: + all_predictions[d] = all_predictions[d] / len(args.paths) + + if args.calc_threshold: + args.threshold = get_threshold(all_predictions, all_labels, args.one_threshold) + print('threshold', args.threshold) + + if args.threshold is not None: + all_predictions = threshold_predictions(all_predictions, args.threshold) + + return all_predictions, all_labels + + +def write_predictions(all_predictions, all_labels, all_uid, args): + all_correct = 0 + count = 0 + for dataset in all_predictions: + preds = all_predictions[dataset] + preds = np.argmax(preds, -1) + if args.eval: + correct = (preds == all_labels[dataset]).sum() + num = len(all_labels[dataset]) + accuracy = correct / num + count += num + all_correct += correct + accuracy = (preds == all_labels[dataset]).mean() + print(accuracy) + if not os.path.exists(os.path.join(args.outdir, dataset)): + os.makedirs(os.path.join(args.outdir, dataset)) + outpath = os.path.join( + args.outdir, dataset, os.path.splitext( + args.prediction_name)[0] + '.tsv') + with open(outpath, 'w') as f: + f.write('id\tlabel\n') + f.write('\n'.join(str(uid) + '\t' + str(args.labels[p]) + for uid, p in zip(all_uid[dataset], preds.tolist()))) + if args.eval: + print(all_correct / count) + + +def ensemble_predictions(args): + all_predictions, all_labels, all_uid = process_files(args) + all_predictions, all_labels = postprocess_predictions(all_predictions, all_labels, args) + write_predictions(all_predictions, all_labels, all_uid, args) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--paths', required=True, nargs='+', + help='paths to checkpoint directories used in ensemble') + parser.add_argument('--eval', action='store_true', + help='compute accuracy metrics against labels (dev set)') + parser.add_argument('--outdir', + help='directory to place ensembled predictions in') + parser.add_argument('--prediction-name', default='test_predictions.pt', + help='name of predictions in checkpoint directories') + parser.add_argument('--calc-threshold', action='store_true', + help='calculate threshold classification') + parser.add_argument('--one-threshold', action='store_true', + help='use on threshold for all subdatasets') + parser.add_argument('--threshold', nargs='+', default=None, type=float, + help='user supplied threshold for classification') + parser.add_argument('--labels', nargs='+', default=None, + help='whitespace separated list of label names') + args = parser.parse_args() + ensemble_predictions(args) + + +if __name__ == '__main__': + main() diff --git a/tasks/eval_utils.py b/tasks/eval_utils.py new file mode 100644 index 0000000..be29b93 --- /dev/null +++ b/tasks/eval_utils.py @@ -0,0 +1,182 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Evaluation utilities.""" + +import os +import time +from functools import partial + +import torch + +from megatron.training import get_args +from megatron.training import print_rank_last, is_last_rank +from megatron.core import mpu +from megatron.schedules import get_forward_backward_func +from tasks.finetune_utils import build_data_loader +from tasks.finetune_utils import process_batch + + +def accuracy_func_provider(single_dataset_provider): + """Provide function that calculates accuracies.""" + args = get_args() + + # Build dataloaders. + datapaths = args.valid_data + dataloaders = [] + for datapath in datapaths: + dataset = single_dataset_provider(datapath) + dataloader = build_data_loader( + dataset, args.orig_micro_batch_size, num_workers=args.num_workers, + drop_last=(mpu.get_data_parallel_world_size() > 1)) + dataloaders.append((dataset.dataset_name, dataloader)) + + def metrics_func(model, epoch, output_predictions=False): + print_rank_last('calculating metrics ...') + correct = 0 + total = 0 + if output_predictions: + assert mpu.get_data_parallel_world_size() == 1 + named_predictions = [] + names = 'predictions' + for name, dataloader in dataloaders: + output = calculate_correct_answers(name, model, dataloader, + epoch, output_predictions) + if not output_predictions: + correct_ans, total_count = output + else: + correct_ans, total_count, predictions = output + named_predictions.append((name, predictions)) + names += '_' + name + correct += correct_ans + total += total_count + if is_last_rank(): + percent = float(correct) * 100.0 / float(total) + print(' >> |epoch: {}| overall: correct / total = {} / {} = ' + '{:.4f} %'.format(epoch, correct, total, percent)) + + if output_predictions and is_last_rank(): + assert args.load is not None + filename = os.path.join(args.load, names + '.pt') + torch.save(named_predictions, filename) + + return metrics_func + + +def calculate_correct_answers(name, model, dataloader, + epoch, output_predictions): + """Calculate correct over total answers and return prediction if the + `output_predictions` is true.""" + args = get_args() + forward_backward_func = get_forward_backward_func() + start_time = time.time() + for m in model: + m.eval() + saved_micro_batch_size = args.micro_batch_size + saved_global_batch_size = args.global_batch_size + + ds = dataloader.dataset + if hasattr(ds, 'sample_multiplier'): + # If our dataset as a sample_multiplier attribute that means + # each "sample" from the dataset actually has multiple samples + # that will collapse into the batch dimension (for example in + # the RACE dataset that has several options), we need to + # account for that when setting the micro batch size. + sample_multiplier = ds.sample_multiplier + else: + sample_multiplier = 1 + micro_batch_size_times_data_parallel = args.orig_micro_batch_size * args.data_parallel_size + num_micro_batches = args.orig_global_batch_size // micro_batch_size_times_data_parallel + + def loss_func(output_predictions, labels, output_tensor): + logits = output_tensor + + loss_dict = {} + # Add output predictions. + if output_predictions: + assert False + loss_dict['softmaxes'] = torch.nn.Softmax(dim=-1)( + logits.float()).data.cpu().numpy().tolist() + loss_dict['labels'] = labels.data.cpu().numpy().tolist() + loss_dict['ids'] = batch['uid'].cpu().numpy().tolist() + # Compute the correct answers. + predicted = torch.argmax(logits, dim=-1) + corrects = (predicted == labels) + # Add to the counters. + loss_dict['total'] = labels.size(0) + loss_dict['correct'] = corrects.sum().item() + + return 0, loss_dict + + # defined inside to capture output_predictions + def correct_answers_forward_step(batch, model): + try: + batch_ = next(batch) + except BaseException: + batch_ = batch + tokens, types, labels, attention_mask = process_batch(batch_) + + # Forward model. + args = get_args() + output_tensor = model(tokens, attention_mask, tokentype_ids=types) + + return output_tensor, partial(loss_func, output_predictions, labels) + + with torch.no_grad(): + # For all the batches in the dataset. + total = 0 + correct = 0 + if output_predictions: + # This option is only possible when data parallel size is 1. + assert mpu.get_data_parallel_world_size() == 1 + softmaxes = [] + labels = [] + ids = [] + for _, batch in enumerate(dataloader): + # For evaluation only mode we use drop_last = False to get all the + # samples, which means we might not have a full batch, so we + # adjust batch_size here to actual batch size of data + actual_batch_size = len(batch['label']) + # ... applying sample_multiplier if necessary + args.micro_batch_size = actual_batch_size * sample_multiplier + args.global_batch_size = actual_batch_size * sample_multiplier * num_micro_batches + + loss_dicts = forward_backward_func(correct_answers_forward_step, batch, model, + optimizer=None, timers=None, forward_only=True) + + for loss_dict in loss_dicts: + if output_predictions: + softmaxes.extend(loss_dict['softmaxes']) + labels.extend(loss_dict['labels']) + ids.extend(loss_dict['ids']) + total += loss_dict['total'] + correct += loss_dict['correct'] + + + for m in model: + m.train() + args.micro_batch_size = saved_micro_batch_size + args.global_batch_size = saved_global_batch_size + + # Reduce. + if mpu.is_pipeline_last_stage(): + unreduced = torch.tensor([correct, total], dtype=torch.long, device='cuda') + torch.distributed.all_reduce(unreduced, + group=mpu.get_data_parallel_group()) + + # Print on screen. + + correct_ans = unreduced[0].item() + total_count = unreduced[1].item() + percent = float(correct_ans) * 100.0 / float(total_count) + elapsed_time = time.time() - start_time + print_rank_last(' > |epoch: {}| metrics for {}: correct / total ' + '= {} / {} = {:.4f} %, elapsed time (sec): {:.3f}'.format( + epoch, name, correct_ans, total_count, + percent, elapsed_time)) + + if output_predictions: + return correct_ans, total_count, (softmaxes, labels, ids) + return correct_ans, total_count + if output_predictions: + return 0, 0, () + return 0, 0 diff --git a/tasks/finetune_utils.py b/tasks/finetune_utils.py new file mode 100644 index 0000000..cd335c2 --- /dev/null +++ b/tasks/finetune_utils.py @@ -0,0 +1,304 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Finetune utilities.""" + +from functools import partial +import sys +import torch + +from megatron.training import get_args, get_num_microbatches +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.core import mpu +from megatron.core.enums import ModelType +from megatron.training.checkpointing import load_checkpoint +from megatron.training.checkpointing import save_checkpoint +from megatron.training.training import evaluate_and_print_results +from megatron.training.training import setup_model_and_optimizer +from megatron.training.training import train_step +from megatron.training.training import training_log +from megatron.training.utils import average_losses_across_data_parallel_group +from megatron.training.utils import calc_params_l2_norm +from megatron.training.utils import check_adlr_autoresume_termination + + +def process_batch(batch): + """Process batch and produce inputs for the model.""" + args = get_args() + + tokens = batch['text'].long().cuda().contiguous() + types = batch['types'].long().cuda().contiguous() + labels = batch['label'].long().cuda().contiguous() + attention_mask = batch['padding_mask'].float().cuda().contiguous() + if args.fp16: + attention_mask = attention_mask.half() + + return tokens, types, labels, attention_mask + + +def cross_entropy_loss_func(labels, output_tensor): + logits = output_tensor + + # Cross-entropy loss. + loss_func = torch.nn.CrossEntropyLoss() + loss = loss_func(logits.contiguous().float(), labels) + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def _cross_entropy_forward_step(batch, model): + """Simple forward step with cross-entropy loss.""" + timers = get_timers() + + # Get the batch. + timers('batch-generator', log_level=2).start() + try: + batch_ = next(batch) + except BaseException: + batch_ = batch + tokens, types, labels, attention_mask = process_batch(batch_) + timers('batch-generator').stop() + + # Forward model. + output_tensor = model(tokens, attention_mask, tokentype_ids=types) + + return output_tensor, partial(cross_entropy_loss_func, labels) + + +def build_data_loader(dataset, micro_batch_size, num_workers, drop_last, + task_collate_fn=None): + """Data loader. Note that batch-size is the local (per GPU) batch-size.""" + + # Sampler. + world_size = mpu.get_data_parallel_world_size() + rank = mpu.get_data_parallel_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank) + + # Data loader. Note that batch size is the per GPU batch size. + data_loader = torch.utils.data.DataLoader(dataset, + batch_size=micro_batch_size, + sampler=sampler, + shuffle=False, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=True, + collate_fn=task_collate_fn) + + return data_loader + + +def _build_infinite_size_dataloader(dataloader): + """Build a looped dataloader with infinite size.""" + + iterator = dataloader.__iter__() + while True: + try: + yield iterator.__next__() + except StopIteration: + iterator = dataloader.__iter__() + + +def _build_train_valid_dataloaders(train_dataset, valid_dataset, + task_collate_fn=None): + """Traing and validation dataloaders.""" + args = get_args() + + print_rank_0('building train and validation dataloaders ...') + # Training dataset. + train_dataloader = build_data_loader(train_dataset, args.micro_batch_size, + args.num_workers, not args.keep_last, + task_collate_fn) + # Set the training iterations. + args.train_iters_per_epoch = len(train_dataloader) + args.train_iters = args.epochs * args.train_iters_per_epoch + # Validation dataset. For this dataset, we do not need to set up + # shuffling so we can just use a simple infinite loop. + valid_dataloader_ = build_data_loader(valid_dataset, args.micro_batch_size, + args.num_workers, not args.keep_last, + task_collate_fn) + valid_dataloader = _build_infinite_size_dataloader(valid_dataloader_) + + # Now that we've built the data loaders, set batch_size arguments + # to the actual batch size the model will see for this dataset. + # This is necessary so pipeline transfers know what size they are + # and the LR schedule, which is based on samples seen, gets set + # correctly. + args.orig_micro_batch_size = args.micro_batch_size + args.orig_global_batch_size = args.global_batch_size + if hasattr(train_dataset, 'sample_multiplier'): + # If our dataset as a sample_multiplier attribute that means + # each "sample" from the dataset actually has multiple samples + # that will collapse into the batch dimension (for example in + # the RACE dataset that has several options), we need to + # account for that when setting the micro batch size. + args.micro_batch_size *= train_dataset.sample_multiplier + args.global_batch_size *= train_dataset.sample_multiplier + + return train_dataloader, valid_dataloader + + +def _train(model, optimizer, opt_param_scheduler, forward_step, + train_dataloader, valid_dataloader, end_of_epoch_callback): + """Train the model.""" + args = get_args() + timers = get_timers() + + assert get_num_microbatches() == 1, "finetuning with gradient accumulation doesn't currently work" + + # Turn on training mode which enables dropout. + for m in model: + m.train() + + # Tracking loss. + losses_dict_sum = {} + + # Starting epoch and iteration + start_epoch = args.iteration // args.train_iters_per_epoch + start_iteration = args.iteration % args.train_iters_per_epoch + iteration = args.iteration + + # Memory reporting flag. + report_memory_flag = True + + # For each remaining epoch + timers('interval-time', log_level=0).start(barrier=True) + for epoch in range(start_epoch, args.epochs): + print_rank_0('working on epoch {} ...'.format(epoch + 1)) + + # Set the data loader epoch to shuffle the index iterator. + train_dataloader.sampler.set_epoch(args.seed + epoch) + + # For all the batches in the dataset. + for iteration_, batch in enumerate(train_dataloader): + + # Ignore the iterations before starting value + if iteration_ < start_iteration: + continue + # Set to zero so the next epoch does not skip any batches. + start_iteration = 0 + + # Train for one step. + out = train_step(forward_step, batch, model, optimizer, opt_param_scheduler) + + losses_dict, skipped_iter, grad_norm, num_zeros_in_grad = out + iteration += 1 + + # Logging. + params_norm = None + if args.log_params_norm: + params_norm = calc_params_l2_norm(model) + report_memory_flag = training_log(losses_dict, losses_dict_sum, + optimizer.param_groups[0]['lr'], + iteration, + optimizer.get_loss_scale().item(), + report_memory_flag, skipped_iter, + grad_norm, params_norm, num_zeros_in_grad) + + # Autoresume + if args.adlr_autoresume and \ + (iteration % args.adlr_autoresume_interval == 0): + check_adlr_autoresume_termination(iteration, model, + optimizer, opt_param_scheduler) + + # Checkpointing + saved_checkpoint = False + if args.save and args.save_interval and \ + iteration % args.save_interval == 0: + save_checkpoint(iteration, model, optimizer, opt_param_scheduler) + saved_checkpoint = True + + # Evaluation + if args.eval_interval and iteration % args.eval_interval == 0: + prefix = 'iteration {}'.format(iteration) + evaluate_and_print_results(prefix, forward_step, + valid_dataloader, model, + iteration, None, False) + + # Exiting based on iterations + if args.exit_interval and iteration % args.exit_interval == 0: + if not saved_checkpoint: + save_checkpoint(iteration, model, optimizer, opt_param_scheduler) + torch.distributed.barrier() + print_rank_0('exiting program at iteration {}'.format(iteration)) + sys.exit() + + # Checkpointing at the end of each epoch. + if args.save: + save_checkpoint(iteration, model, optimizer, opt_param_scheduler) + + # Callback at the end of each epoch. + if end_of_epoch_callback is not None: + end_of_epoch_callback(model, epoch) + + +def finetune(train_valid_datasets_provider, model_provider, + model_type=ModelType.encoder_or_decoder, + forward_step=_cross_entropy_forward_step, + end_of_epoch_callback_provider=None, + task_collate_fn=None): + """Main finetune function used across all tasks.""" + args = get_args() + timers = get_timers() + + assert args.rampup_batch_size is None, \ + 'batch size scaling is not supported for finetuning' + + # Train and validation data loaders. + timers('train/valid/test dataset/dataloder', log_level=0).start() + if args.epochs > 0: + train_dataset, valid_dataset = train_valid_datasets_provider() + train_dataloader, valid_dataloader = _build_train_valid_dataloaders( + train_dataset, valid_dataset, task_collate_fn) + else: + args.train_iters = 0 + timers('train/valid/test dataset/dataloder').stop() + + # Build calback function. + timers('callback function', log_level=0).start() + end_of_epoch_callback = None + if end_of_epoch_callback_provider is not None: + end_of_epoch_callback = end_of_epoch_callback_provider() + timers('callback function').stop() + + # Build model, optimizer and learning rate scheduler. + timers('model and optimizer', log_level=0).start() + model, optimizer, opt_param_scheduler = setup_model_and_optimizer(model_provider, model_type) + timers('model and optimizer').stop() + + # If pretrained checkpoint is provided and we have not trained for + # any iteration (i.e., iteration is zero), then load the pretrained + # checkpoint. + timers('pretrained checkpoint', log_level=0).start(barrier=True) + if args.iteration == 0 and args.pretrained_checkpoint is not None: + original_load = args.load + args.load = args.pretrained_checkpoint + original_rng = args.no_load_rng + args.no_load_rng = True + _ = load_checkpoint(model, None, None) + args.load = original_load + args.no_load_rng = original_rng + # This is critical when only model is loaded. We should make sure + # main parameters are also updated. + optimizer.reload_model_params() + timers('pretrained checkpoint').stop() + + # Print setup timing. + print_rank_0('done with setups ...') + timers.log(['train/valid/test dataset/dataloder', 'callback function', + 'model and optimizer', 'pretrained checkpoint'], barrier=True) + print_rank_0('training ...') + + # Finetune the model. + if args.epochs > 0: + _train(model, optimizer, opt_param_scheduler, forward_step, + train_dataloader, valid_dataloader, end_of_epoch_callback) + # Or just evaluate. + else: + if end_of_epoch_callback is not None: + print_rank_0('evaluation only mode, setting epoch to -1') + end_of_epoch_callback(model, epoch=-1, output_predictions=True) + print_rank_0('done :-)') diff --git a/tasks/glue/data.py b/tasks/glue/data.py new file mode 100644 index 0000000..3e2eeaa --- /dev/null +++ b/tasks/glue/data.py @@ -0,0 +1,56 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""GLUE dataset.""" + +from abc import ABC +from abc import abstractmethod + +from torch.utils.data import Dataset + +from megatron.training import print_rank_0 +from tasks.data_utils import build_sample +from tasks.data_utils import build_tokens_types_paddings_from_text + + +class GLUEAbstractDataset(ABC, Dataset): + """GLUE base dataset class.""" + + def __init__(self, task_name, dataset_name, datapaths, + tokenizer, max_seq_length): + # Store inputs. + self.task_name = task_name + self.dataset_name = dataset_name + self.tokenizer = tokenizer + self.max_seq_length = max_seq_length + print_rank_0(' > building {} dataset for {}:'.format(self.task_name, + self.dataset_name)) + # Process the files. + string = ' > paths:' + for path in datapaths: + string += ' ' + path + print_rank_0(string) + self.samples = [] + for datapath in datapaths: + self.samples.extend(self.process_samples_from_single_path(datapath)) + print_rank_0(' >> total number of samples: {}'.format( + len(self.samples))) + + def __len__(self): + return len(self.samples) + + def __getitem__(self, idx): + raw_sample = self.samples[idx] + ids, types, paddings = build_tokens_types_paddings_from_text( + raw_sample['text_a'], raw_sample['text_b'], + self.tokenizer, self.max_seq_length) + sample = build_sample(ids, types, paddings, + raw_sample['label'], raw_sample['uid']) + return sample + + @abstractmethod + def process_samples_from_single_path(self, datapath): + """Abstract method that takes a single path / filename and + returns a list of dataset samples, each sample being a dict of + {'text_a': string, 'text_b': string, 'label': int, 'uid': int} + """ + pass diff --git a/tasks/glue/finetune.py b/tasks/glue/finetune.py new file mode 100644 index 0000000..7e89453 --- /dev/null +++ b/tasks/glue/finetune.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""GLUE finetuning/evaluation.""" + +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer +from megatron.legacy.model.classification import Classification +from tasks.eval_utils import accuracy_func_provider +from tasks.finetune_utils import finetune +from megatron.training.arguments import core_transformer_config_from_args + + +def glue_classification(num_classes, Dataset, + name_from_datapath_func): + + def train_valid_datasets_provider(): + """Build train and validation dataset.""" + args = get_args() + tokenizer = get_tokenizer() + + train_dataset = Dataset('training', args.train_data, + tokenizer, args.seq_length) + valid_dataset = Dataset('validation', args.valid_data, + tokenizer, args.seq_length) + + return train_dataset, valid_dataset + + def model_provider(pre_process=True, post_process=True): + """Build the model.""" + args = get_args() + config = core_transformer_config_from_args() + + print_rank_0('building classification model for {} ...'.format( + args.task)) + model = Classification(config=config, num_classes=num_classes, num_tokentypes=2, + pre_process=pre_process, post_process=post_process) + + return model + + def metrics_func_provider(): + """Privde metrics callback function.""" + def single_dataset_provider(datapath): + args = get_args() + tokenizer = get_tokenizer() + + name = name_from_datapath_func(datapath) + return Dataset(name, [datapath], tokenizer, args.seq_length) + return accuracy_func_provider(single_dataset_provider) + + """Finetune/evaluate.""" + finetune(train_valid_datasets_provider, model_provider, + end_of_epoch_callback_provider=metrics_func_provider) + + +def main(): + args = get_args() + + if args.task == 'MNLI': + + num_classes = 3 + from tasks.glue.mnli import MNLIDataset as Dataset + + def name_from_datapath(datapath): + return datapath.split('MNLI')[-1].strip( + '.tsv').strip('/').replace('_', '-') + + elif args.task == 'QQP': + + num_classes = 2 + from tasks.glue.qqp import QQPDataset as Dataset + + def name_from_datapath(datapath): + return datapath.split('QQP')[-1].strip( + '.tsv').strip('/').replace('_', '-') + + else: + raise NotImplementedError('GLUE task {} is not implemented.'.format( + args.task)) + + glue_classification(num_classes, Dataset, name_from_datapath) diff --git a/tasks/glue/mnli.py b/tasks/glue/mnli.py new file mode 100644 index 0000000..cd4b2d6 --- /dev/null +++ b/tasks/glue/mnli.py @@ -0,0 +1,71 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""MNLI dataset.""" + +from megatron.training import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2} + + +class MNLIDataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label='contradiction'): + self.test_label = test_label + super().__init__('MNLI', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if len(row) == 10: + is_test = True + print_rank_0( + ' reading {}, {} and {} columns and setting ' + 'labels to {}'.format( + row[0].strip(), row[8].strip(), + row[9].strip(), self.test_label)) + else: + print_rank_0(' reading {} , {}, {}, and {} columns ' + '...'.format( + row[0].strip(), row[8].strip(), + row[9].strip(), row[-1].strip())) + continue + + text_a = clean_text(row[8].strip()) + text_b = clean_text(row[9].strip()) + unique_id = int(row[0].strip()) + label = row[-1].strip() + if is_test: + label = self.test_label + + assert len(text_a) > 0 + assert len(text_b) > 0 + assert label in LABELS + assert unique_id >= 0 + + sample = {'text_a': text_a, + 'text_b': text_b, + 'label': LABELS[label], + 'uid': unique_id} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/glue/qqp.py b/tasks/glue/qqp.py new file mode 100644 index 0000000..f8a0e06 --- /dev/null +++ b/tasks/glue/qqp.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""QQP dataset.""" + +from megatron.training import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = [0, 1] + + +class QQPDataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label=0): + self.test_label = test_label + super().__init__('QQP', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if len(row) == 3: + is_test = True + print_rank_0(' reading {}, {}, and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[1].strip(), + row[2].strip(), self.test_label)) + else: + assert len(row) == 6 + print_rank_0(' reading {}, {}, {}, and {} columns' + ' ...'.format( + row[0].strip(), row[3].strip(), + row[4].strip(), row[5].strip())) + continue + + if is_test: + assert len(row) == 3, 'expected length 3: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = clean_text(row[2].strip()) + label = self.test_label + assert len(text_a) > 0 + assert len(text_b) > 0 + else: + if len(row) == 6: + uid = int(row[0].strip()) + text_a = clean_text(row[3].strip()) + text_b = clean_text(row[4].strip()) + label = int(row[5].strip()) + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + if len(text_b) == 0: + print_rank_0('***WARNING*** zero length b, ' + 'skipping: {}'.format(row)) + continue + assert label in LABELS + assert uid >= 0 + + sample = {'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': label} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/main.py b/tasks/main.py new file mode 100644 index 0000000..da8c4b9 --- /dev/null +++ b/tasks/main.py @@ -0,0 +1,100 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Main tasks functionality.""" + +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir))) + +from megatron.training import get_args +from megatron.training.initialize import initialize_megatron + + +def get_tasks_args(parser): + """Provide extra arguments required for tasks.""" + group = parser.add_argument_group(title='tasks') + + group.add_argument('--task', type=str, required=True, + help='Task name.') + group.add_argument('--epochs', type=int, default=None, + help='Number of finetunning epochs. Zero results in ' + 'evaluation only.') + group.add_argument('--keep-last', action='store_true', + help='Keep the last batch (maybe incomplete) in' + 'the data loader') + group.add_argument('--train-data', nargs='+', default=None, + help='Whitespace separated paths or corpora names ' + 'for training.') + group.add_argument('--valid-data', nargs='*', default=None, + help='path(s) to the validation data.') + group.add_argument('--overlapping-eval', type=int, default=32, + help='Sliding window for overlapping evaluation.') + group.add_argument('--strict-lambada', action='store_true', + help='Use more difficult formulation of lambada.') + # Retriever args + group.add_argument('--qa-data-dev', type=str, default=None, + help='Path to the QA dataset dev file.') + group.add_argument('--qa-data-test', type=str, default=None, + help='Path to the QA dataset test file.') + + # Faiss arguments for retriever + group.add_argument('--faiss-use-gpu', action='store_true', + help='Whether create the FaissMIPSIndex on GPU') + group.add_argument('--faiss-match', type=str, default='string', \ + choices=['regex', 'string'], help="Answer matching '\ + 'logic type") + group.add_argument('--faiss-topk-retrievals', type=int, default=100, + help='Number of blocks to use as top-k during retrieval') + + # finetune for retriever + group.add_argument('--eval-micro-batch-size', type=int, default=None, + help='Eval Batch size per model instance (local batch ' + 'size). Global batch size is local batch size ' + 'times data parallel size.') + group.add_argument('--train-with-neg', action='store_true', + help='Whether to use negative examples during model ' + 'training') + group.add_argument('--train-hard-neg', type=int, default=0, + help='Number of hard negative exmaples to use during ' + 'training') + + + # parameters for Av.rank validation method + # Following options/arguments have been taken directly from DPR codebase + group.add_argument('--val-av-rank-hard-neg', type=int, default=30, + help='Av.rank validation: how many hard negatives to' + ' take from each question pool') + group.add_argument('--val-av-rank-other-neg', type=int, default=30, + help='Av.rank validation: how many other negatives to' + ' take from each question pool') + + + return parser + + +if __name__ == '__main__': + + initialize_megatron(extra_args_provider=get_tasks_args) + + args = get_args() + + if args.num_layers_per_virtual_pipeline_stage is not None: + print("Interleaved pipeline schedule is not yet supported for downstream tasks.") + exit() + + if args.task == 'RACE': + from race.finetune import main + elif args.task in ['MNLI', 'QQP']: + from glue.finetune import main + elif args.task in ['LAMBADA', 'WIKITEXT103']: + from zeroshot_gpt.evaluate import main + elif args.task in ['ICT-ZEROSHOT-NQ', 'RETRIEVER-EVAL']: + from orqa.evaluate_orqa import main + elif args.task in ['RET-FINETUNE-NQ']: + from orqa.supervised.finetune import main + else: + raise NotImplementedError('Task {} is not implemented.'.format( + args.task)) + + main() diff --git a/tasks/msdp/README.md b/tasks/msdp/README.md new file mode 100644 index 0000000..e606e7e --- /dev/null +++ b/tasks/msdp/README.md @@ -0,0 +1,19 @@ + +# Multi-Stage Prompting for Knowledgeable Dialogue Generation + +Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework. + +## Multi-Stage Dialogue Prompting + +### Data Preparation +1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/) +2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets. + +### Stage-1: Prompting for Knowledge Generation +1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation. +2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation. + +### Stage-2: Prompting for Response Generation +1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file). +2. We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation. +3. We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation. diff --git a/tasks/msdp/evaluate.py b/tasks/msdp/evaluate.py new file mode 100644 index 0000000..87cfbdb --- /dev/null +++ b/tasks/msdp/evaluate.py @@ -0,0 +1,45 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Model evaluation""" + +from megatron.training import get_args +from megatron.training import print_rank_0 +from tasks.msdp.metrics import F1Metric +from tqdm import tqdm + + +def evaluate_f1(guess_file, answer_file): + """Evaluating F1 Score""" + + guess_list = [] + print_rank_0('reading %s' % guess_file) + with open(guess_file, "r") as f: + for i, line in enumerate(tqdm(f)): + line = line.strip() + if "<|endoftext|>" in line: + line = line.replace("<|endoftext|>", "") + guess_list.append(line) + + answer_list = [] + print_rank_0('reading %s' % answer_file) + with open(answer_file, "r") as f: + for i, line in enumerate(tqdm(f)): + line = line.strip() + if line == "no_passages_used": + line = "" + answer_list.append(line) + + assert len(guess_list) == len(answer_list), \ + "lengths of guess and answer are different!" + + precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list) + print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1)) + + print_rank_0('done :-)') + + +def main(): + args = get_args() + + evaluate_f1(args.guess_file, args.answer_file) + diff --git a/tasks/msdp/main.py b/tasks/msdp/main.py new file mode 100644 index 0000000..a0068c7 --- /dev/null +++ b/tasks/msdp/main.py @@ -0,0 +1,66 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Run multi-stage dialogue prompting (MSDP).""" + +import os +import sys +sys.path.append(os.path.abspath(os.path.join( + os.path.join(os.path.dirname(__file__), os.path.pardir), os.path.pardir))) +from megatron.training import get_args +from megatron.training.initialize import initialize_megatron + + +def get_tasks_args(parser): + """Provide extra arguments required for tasks.""" + group = parser.add_argument_group(title='tasks') + + # parameters for the knowledgeable dialogue generation + group.add_argument('--task', type=str, required=True, + help='Task name.') + group.add_argument("--sample-input-file", type=str, default=None, + help='Get input from file instead of interactive mode, ' + 'each line is an input.') + group.add_argument("--sample-output-file", type=str, default=None, + help='Output file got from --sample-input-file') + group.add_argument('--prompt-file', type=str, default=None, + help='prompting file') + group.add_argument('--prompt-type', type=str, default=None, + choices=['knowledge', 'response'], + help='prompt type (knowledge or response)') + group.add_argument('--num-prompt-examples', type=int, default=10, + help='number of prompt examples') + group.add_argument('--guess-file', type=str, default=None, + help='datapath for generated sentences') + group.add_argument('--answer-file', type=str, default=None, + help='datapath for golden sentences') + group.add_argument('--out-seq-length', type=int, default=100, + help='output sequence length') + group.add_argument('--api-prompt', default=False, action="store_true", + help='setup model api for prompting') + group.add_argument('--megatron-api-url', type=str, default=None, + help='url of the megatron api') + + return parser + + +if __name__ == '__main__': + + initialize_megatron(extra_args_provider=get_tasks_args) + + args = get_args() + + if args.num_layers_per_virtual_pipeline_stage is not None: + print("Interleaved pipeline schedule is not yet supported for downstream tasks.") + exit() + + if args.task == 'MSDP-PROMPT': + from tasks.msdp.prompt import main + + elif args.task == 'MSDP-EVAL-F1': + from tasks.msdp.evaluate import main + + else: + raise NotImplementedError('Task {} is not implemented.'.format( + args.task)) + + main() diff --git a/tasks/msdp/metrics.py b/tasks/msdp/metrics.py new file mode 100644 index 0000000..fc7ce5d --- /dev/null +++ b/tasks/msdp/metrics.py @@ -0,0 +1,77 @@ + +# The following code is adapted from +# https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, +# which is licensed under the MIT license. More details on the license can be +# found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE. + +"""Provides standard metric evaluations for dialog.""" + +from collections import Counter +from typing import List +import numpy as np +import re + +re_art = re.compile(r'\b(a|an|the)\b') +re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') + + +def normalize_answer(s): + """ + Lower text and remove punctuation, articles and extra whitespace. + """ + s = s.lower() + s = re_punc.sub(' ', s) + s = re_art.sub(' ', s) + s = ' '.join(s.split()) + return s + + +class F1Metric: + """ + Helper class which computes token-level F1. + """ + + @staticmethod + def _prec_recall_f1_score(pred_items, gold_items): + """ + Compute precision, recall and f1 given a set of gold and prediction items. + :param pred_items: iterable of predicted values + :param gold_items: iterable of gold values + :return: tuple (p, r, f1) for precision, recall, f1 + """ + common = Counter(gold_items) & Counter(pred_items) + num_same = sum(common.values()) + if num_same == 0: + return 0, 0, 0 + precision = 1.0 * num_same / len(pred_items) + recall = 1.0 * num_same / len(gold_items) + f1 = (2 * precision * recall) / (precision + recall) + return precision, recall, f1 + + @staticmethod + def compute_each_pair(guess: str, answer: str): + if answer == "": + return None, None, None + if guess == "": + return 0, 0, 0 + g_tokens = normalize_answer(guess).split() + a_tokens = normalize_answer(answer).split() + + precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens) + return precision, recall, f1 + + @staticmethod + def compute_all_pairs(guesses: List[str], answers: List[str]): + # additional augment: + assert len(guesses) == len(answers) + + precision_list, recall_list, f1_list = [], [], [] + for guess, answer in zip(guesses, answers): + precision, recall, f1 = F1Metric.compute_each_pair(guess, answer) + if precision is None or recall is None or f1 is None: + continue + precision_list.append(precision) + recall_list.append(recall) + f1_list.append(f1) + + return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list) diff --git a/tasks/msdp/preprocessing.py b/tasks/msdp/preprocessing.py new file mode 100644 index 0000000..d904c9d --- /dev/null +++ b/tasks/msdp/preprocessing.py @@ -0,0 +1,582 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Preprocessing for Wizard of Wikipedia and Wizard of Internet datasets""" + +import torch +import argparse +from nltk import word_tokenize +from tqdm import tqdm +import numpy as np +import json + +def get_args(): + parser = argparse.ArgumentParser(description="Preprocessing") + + parser.add_argument("--func", type=str, default=None, + help="choose to run which function") + parser.add_argument("--raw_file", type=str, default=None, + help="path of the input file") + parser.add_argument("--processed_file", type=str, default=None, + help="path of the output file") + parser.add_argument("--knwl_ref_file", type=str, default=None, + help="path of the knowledge reference file") + parser.add_argument("--resp_ref_file", type=str, default=None, + help="path of the knowledge reference file") + parser.add_argument("--knwl_gen_file", type=str, default=None, + help="path of the generated knowledge file") + parser.add_argument("--test_file", type=str, default=None, + help="path of the test file") + parser.add_argument("--train_file", type=str, default=None, + help="path of the train file") + parser.add_argument("--model_file", type=str, default=None, + help="path of the model file") + parser.add_argument("--data_type", type=str, default=None, + help="data types, choose one out of three types: \ + wow_seen, wow_unseen, and woi") + parser.add_argument("--seed", type=int, default=1234, + help="random seed") + + args = parser.parse_args() + return args + + +def process_wow_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file): + """ + This is a function used for processing the wizard of wikipedia (wow) dataset + Expected processed format: + topic \t dialogue context \t golden knowledge \t golden response + """ + + # loading the raw data + print("> Loading data from %s" % raw_file) + with open(raw_file, "r") as fr: + dialog_data = json.load(fr) + + print("> Processing data ...") + fproc = open(processed_file, "w") + fknwl = open(knwl_ref_file, "w") if knwl_ref_file else None + fresp = open(resp_ref_file, "w") if resp_ref_file else None + + for i, sample in enumerate(tqdm(dialog_data)): + # get all the dialog data for a single dialog sample + dialog = sample["dialog"] + + turn_list = [] # collect the dialog history + # processing for each single dialog sample + for j, turn in enumerate(dialog): + # text of each turn + text = turn["text"] + if not (text.endswith("?") or text.endswith(".") or text.endswith("!")): + text = text + "." + + if j == 0: + # first turn + turn_list.append(text) + continue + + speaker = turn["speaker"].lower() + if "wizard" in speaker: + checked_sentence = list(turn["checked_sentence"].values()) # knowledge + checked_passage = list(turn["checked_passage"].values()) # topic + + assert len(checked_sentence) <= 1 + + # get the ground truth knowledge + if len(checked_sentence) > 0: + checked_sentence = checked_sentence[0] + else: + checked_sentence = "no_passages_used" + + if len(checked_passage) == 1: + checked_passage = checked_passage[0] + else: + checked_passage = "no_passages_used" + + # get the topic + if checked_passage != "no_passages_used": + topic = checked_passage + else: + topic = sample["chosen_topic"] + + dialog_context = " [SEP] ".join(turn_list) + knowledge = checked_sentence + response = text + # add the response into the dialog history + turn_list.append(response) + + # write to the output files + fproc.write(topic + "\t" + dialog_context + "\t" + \ + knowledge + "\t" + response + "\n") + + if fknwl: + fknwl.write(knowledge + "\n") + if fresp: + # tokenize for evaluation + response = " ".join(word_tokenize(response)) + fresp.write(response + "\n") + + else: + assert "apprentice" in speaker + turn_list.append(text) + + fproc.close() + if fknwl: + fknwl.close() + if fresp: + fresp.close() + + +def process_woi_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file): + """ + This is a function used for processing the wizard of internet (woi) dataset + Expected processed format: + topic \t dialogue context \t golden knowledge \t golden response + """ + + print("> Processing %s" % raw_file) + fproc = open(processed_file, "w") + fknwl = open(knwl_ref_file, "w") if knwl_ref_file else None + fresp = open(resp_ref_file, "w") if resp_ref_file else None + + with open(raw_file, "r") as fr: + for i, line in tqdm(enumerate(fr)): + # read line by line, each line uses json format + line = line.strip() + item_dict = json.loads(line) + + # item_dict is a dictionary + # its key is the data id, and its value contains all the data content + item_dict = item_dict.values() + item_dict = list(item_dict)[0] # len(item_dict) == 1 + + # get the whole dialog data for a single dialog sample + dialog_data = item_dict['dialog_history'] + length = len(dialog_data) + + turn_list = [] # collect the dialog history + search_text = "" + for i in range(length): + item = dialog_data[i] + action = item['action'] + + if action == "Wizard => SearchAgent": + search_text = item['text'] + + elif action == "Wizard => Apprentice": + if len(turn_list) == 0: + # first turn + turn = item['text'] + turn_list.append(turn) + continue + + # get the relevant content + contents = item["context"]["contents"] + selects = item["context"]["selected_contents"] + flag = selects[0][0] + selects = selects[1:] + assert len(selects) == len(contents) + + # get the topic + if flag: + # no knowledge sentence is used for the response + topic = "no_topic" + knwl_sent = "no_passages_used" + else: + # we consider the search text as the topic + topic = search_text + # get the knowledge sentence + knwl_sent = "" + for content, select in zip(contents, selects): + content = content['content'] + assert len(content) == len(select) + for c, s in zip(content, select): + if s: + knwl_sent = c + break + + if knwl_sent == "": + # no knowledge is used for the response + topic = "no_topic" + knwl_sent = "no_passages_used" + + # get dialogue context, knowledge, and response + dialog_context = " [SEP] ".join(turn_list) + response = item['text'] + + # processing + topic = topic.replace("\n", "").replace("\r", \ + "").replace("\t", "") + dialog_context = dialog_context.replace("\n", "").replace("\r", \ + "").replace("\t", "") + knwl_sent = knwl_sent.replace("\n", "").replace("\r", \ + "").replace("\t", "") + response = response.replace("\n", "").replace("\r", \ + "").replace("\t", "") + + if topic != "no_topic": + # write to the ouput files + fproc.write(topic + "\t" + dialog_context + "\t" + \ + knwl_sent + "\t" + response + "\n") + if fknwl: + fknwl.write(knwl_sent + "\n") + if fresp: + # tokenize for evaluation + response = " ".join(word_tokenize(response)) + fresp.write(response + "\n") + + turn_list.append(response) + + elif action == "Apprentice => Wizard": + turn = item['text'] + turn_list.append(turn) + + else: + assert action == "SearchAgent => Wizard", \ + "Please check whether you have used the correct data!" + + fproc.close() + if fknwl: + fknwl.close() + if fresp: + fresp.close() + + +def get_database(test_datapath, train_datapath, data_type): + """Get the database by topics""" + + assert data_type in ["wow_seen", "wow_unseen", "woi"], \ + "Please input a correct data type!!" + + # get test data topic dictionary + print("> reading test data from %s" % test_datapath) + test_topics = {} + with open(test_datapath, "r") as f: + for i, line in enumerate(f): + line = line.strip() + splits = line.split("\t") + topic = splits[0] + test_topics[topic] = True + + print("> reading data from %s" % train_datapath) + train_data_by_topic = {} + dialog_data_by_topic = {} + dialog_examples = [] + with open(train_datapath, "r") as f: + for i, line in enumerate(f): + line = line.strip() + splits = line.split("\t") + topic = splits[0] + turns = splits[1].split(" [SEP] ")[-3:] + knowledge = splits[2] + response = splits[3] + # filtering data samples + if knowledge == "no_passages_used": + # when no knowledge is used + continue + if data_type != "wow_seen" and ("(" in knowledge or ")" in knowledge): + # when bracket exists in the knowledge + continue + if data_type != "wow_seen" and topic not in knowledge: + # when topic does not exist in the knowledge + continue + + # get the instance + last_turn = turns[-1] + instance = "( " + last_turn + " ) " + topic + " => " + knowledge + + # construct dialog example + dialog_example = "" + if data_type != "wow_seen": + dialog_example += "( " + topic + " ) " + for i, turn in enumerate(turns): + if i != 0: + dialog_example += " " + dialog_example += turn + + # check overlaps + if topic in test_topics: + if topic not in train_data_by_topic: + train_data_by_topic[topic] = [instance] + else: + train_data_by_topic[topic].append(instance) + + if topic not in dialog_data_by_topic: + dialog_data_by_topic[topic] = [dialog_example] + else: + dialog_data_by_topic[topic].append(dialog_example) + + else: + # filtering data samples + if len(knowledge.split()) > 20: + # knowledge is too long + continue + if knowledge.startswith("It") or knowledge.startswith("it") or \ + knowledge.startswith("This") or knowledge.startswith("this"): + continue + + # append all the data into dialogue examples list + dialog_examples.append((topic, dialog_example, instance)) + + return train_data_by_topic, dialog_data_by_topic, dialog_examples + + +emb_dict = {} +def select_prompts_based_on_similarity( + query, dialog_list, prompt_list, topic, tokenizer, encoder, topk): + """Select samples based on the similarity""" + + with torch.no_grad(): + # get the query embeddings + query_ids = tokenizer.encode(query) + query_ids = torch.LongTensor([query_ids]).cuda() + query_emb = encoder(input_ids=query_ids).pooler_output + query_emb = query_emb[0] + + # calculate embeddings for the samples in the database + if topic in emb_dict: + example_embeddings = emb_dict[topic] + example_embeddings = example_embeddings.cuda() + else: + for idx, example in enumerate(dialog_list): + example_ids = tokenizer.encode(example) + example_ids = torch.LongTensor([example_ids]).cuda() + example_emb = encoder(input_ids=example_ids).pooler_output + if idx == 0: + example_embeddings = example_emb + else: + example_embeddings = torch.cat( + (example_embeddings, example_emb), dim=0) + emb_dict[topic] = example_embeddings.cpu() + + # compare the similarity and select the topk samples + similarity_list = example_embeddings.matmul(query_emb) + _, indices = torch.topk(similarity_list, k=topk) + + indices = indices.tolist() + indices = indices[::-1] # reverse the order + selected_prompts = [] + for index in indices: + # index = index.item() + selected_prompts.append(prompt_list[index]) + + return selected_prompts + + +def prompt_selection_for_knowledge_generation( + test_datapath, train_datapath, model_path, output_prompt_path, data_type): + """Selecting prompts for the knowledge generation""" + + print("> Selecting prompts for the knowledge generation") + + train_data_by_topic, dialog_data_by_topic, dialog_examples = \ + get_database(test_datapath, train_datapath, data_type) + + from transformers import DPRQuestionEncoderTokenizer + print("> loading tokenizer and encoder") + tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + 'facebook/dpr-question_encoder-single-nq-base') + encoder = torch.load(model_path).cuda() + + print("> getting dialog embeddings") + with torch.no_grad(): + for idx, example in tqdm(enumerate(dialog_examples)): + dialog = example[1] + dialog_ids = tokenizer.encode(dialog) + dialog_ids = torch.LongTensor([dialog_ids]).cuda() + dialog_emb = encoder(input_ids=dialog_ids).pooler_output + + if idx == 0: + dialog_embeddings = dialog_emb + else: + dialog_embeddings = torch.cat((dialog_embeddings, dialog_emb), dim=0) + + print("> reading test data from %s" % test_datapath) + prompt_list_for_each_sample = [] + with open(test_datapath, "r") as f: + for i, line in tqdm(enumerate(f)): + line = line.strip() + + splits = line.split("\t") + topic = splits[0] + turns = splits[1].split(" [SEP] ")[-3:] + + # get the query sentence + query_sent = "" + if data_type != "seen": + query_sent += "( " + topic + " ) " + for i, turn in enumerate(turns): + if i != 0: + query_sent += " " + query_sent += turn + + if topic not in train_data_by_topic: + # get the query embedding + query_ids = tokenizer.encode(query_sent) + query_ids = torch.LongTensor([query_ids]).cuda() + query_emb = encoder(input_ids=query_ids).pooler_output + query_emb = query_emb[0] + + # calculate the similarity + similarity_list = dialog_embeddings.matmul(query_emb) + _, indices = torch.sort(similarity_list) + indices = indices.tolist() + selected_topics = {} + selected_prompts = [] + num_prompt = 0 + for index in indices: + example = dialog_examples[index] + topic_temp = example[0] + if topic_temp not in selected_topics: + selected_topics[topic_temp] = True + selected_prompts.append(example[2]) + num_prompt += 1 + if num_prompt == 10: + break + + # get the selected samples + example_list = selected_prompts[::-1] + key = topic + " " + turns[-1] + prompt_list_for_each_sample.append({key: example_list}) + + else: + num_data_sample = min(len(train_data_by_topic[topic]), 10) + total_example_list = train_data_by_topic[topic] + + dialog_list = dialog_data_by_topic[topic] + assert len(dialog_list) == len(train_data_by_topic[topic]) + + # calculate the similarity + example_list = select_prompts_based_on_similarity( + query_sent, dialog_list, total_example_list, + topic, tokenizer, encoder, topk=num_data_sample) + + key = topic + " " + turns[-1] + prompt_list_for_each_sample.append({key: example_list}) + + print("writing to %s" % output_prompt_path) + with open(output_prompt_path, "w") as f: + for instance in tqdm(prompt_list_for_each_sample): + json.dump(instance, f) + f.write("\n") + + +def prompt_selection_for_response_generation(input_path, output_path, seed): + """Selecting prompts for the response generation""" + + print("> Selecting prompts for the response generation") + print("> set random seed") + np.random.seed(seed) + + prompt_example_list = [] + print("> reading data from %s" % input_path) + with open(input_path, "r") as f: + for i, line in tqdm(enumerate(f)): + line = line.strip() + splits = line.split("\t") + + # get the topic, context, knowledge and response + topic = splits[0] + dialog_context = splits[1] + knowledge = splits[2] + response = splits[3] + turns = dialog_context.split(" [SEP] ")[-3:] + if knowledge == "no_passages_used": + continue + + # calculate the overlap ratio + from nltk import word_tokenize + knowledge_sent_token_list = word_tokenize(knowledge) + knowledge_sent_token_dict = {token: True for token in knowledge_sent_token_list} + knowledge_len = len(knowledge_sent_token_list) + response_token_list = word_tokenize(response) + response_len = len(response_token_list) + num_overlap_token = 0 + accumulator = 0 + for token in response_token_list: + if token in knowledge_sent_token_dict: + accumulator += 1 + else: + if accumulator >= 10: + num_overlap_token += accumulator + accumulator = 0 + if accumulator >= 10: + num_overlap_token += accumulator + + # filtering the data based on the ratio + if num_overlap_token > response_len * 0.9 or num_overlap_token < response_len * 0.6: + continue + if num_overlap_token < knowledge_len * 0.8: + continue + + last_turn = " ".join(word_tokenize(turns[-1])) + knowledge = " ".join(word_tokenize(knowledge)) + response = " ".join(word_tokenize(response)) + prompt_example = "" + # add dialog context + prompt_example += "Topic: " + topic + ". " + prompt_example += "User says: " + last_turn + " " + prompt_example += "We know that: " + knowledge + " " + prompt_example += "System replies: " + response + + prompt_example_list.append(prompt_example) + + # shuffle the prompt examples + np.random.shuffle(prompt_example_list) + + print("> writing to %s" % output_path) + with open(output_path, "w") as f: + # f.write("Generate the System's response based on the knowledge sentence:\n") + for i in tqdm(range(20)): + example = prompt_example_list[i] + f.write(example + "\n") + + +def prepare_input_for_response_generation(test_file, knwl_gen_file, processed_file): + """Preparing inputs for the response generation""" + + print("> Reading knowledge file from %s" % knwl_gen_file) + # get the knowledge list + with open(knwl_gen_file, "r") as f: + knowledge_list = f.readlines() + + print("> Processing ...") + with open(test_file, "r") as fr: + with open(processed_file, "w") as fw: + for line_num, line in enumerate(tqdm(fr)): + line = line.strip() + splits = line.split("\t") + # prepare topic, context, knowledge and response + topic = splits[0] + dialog_context = splits[1] + response = splits[3] + knowledge = knowledge_list[line_num] + knowledge = knowledge.strip() + if "<|endoftext|>" in knowledge: + knowledge = knowledge.replace("<|endoftext|>", "") + + # write to the output file + fw.write(topic + "\t" + dialog_context + "\t" \ + + knowledge + "\t" + response + "\n") + + +if __name__ == "__main__": + + args = get_args() + if args.func == "process_wow_dataset": + process_wow_dataset(args.raw_file, args.processed_file, args.knwl_ref_file, args.resp_ref_file) + + elif args.func == "process_woi_dataset": + process_woi_dataset(args.raw_file, args.processed_file, args.knwl_ref_file, args.resp_ref_file) + + elif args.func == "get_knwl_gen_prompts": + prompt_selection_for_knowledge_generation( + args.test_file, args.train_file, args.model_file, + args.processed_file, args.data_type) + + elif args.func == "get_resp_gen_prompts": + prompt_selection_for_response_generation( + args.train_file, args.processed_file, args.seed) + + elif args.func == "prepare_input": + prepare_input_for_response_generation( + args.test_file, args.knwl_gen_file, args.processed_file) diff --git a/tasks/msdp/prompt.py b/tasks/msdp/prompt.py new file mode 100644 index 0000000..c1d1651 --- /dev/null +++ b/tasks/msdp/prompt.py @@ -0,0 +1,309 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Prompting the pretrained language model to generate knowledge/response""" + +import json +import torch +import requests +from nltk import word_tokenize +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer +from megatron.core import mpu +from megatron.legacy.model import GPTModel +from megatron.training import get_model +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from megatron.inference.text_generation import generate_and_post_process + + +def call_model_api(inputs, tokens_to_generate): + """Calling the model api to get the output generations""" + + args = get_args() + + # The following is an example of using the Megatron API + # You can also implement your own API function to place this part + headers = {'Content-Type': 'application/json; charset=UTF-8'} + data = {"prompts": [inputs], "tokens_to_generate": tokens_to_generate, "top_k": 1} + data_json = json.dumps(data) + outputs = requests.put(args.megatron_api_url, headers=headers, data=data_json).json()["text"][0] + + input_len = len(inputs) + outputs = outputs[input_len:] + outputs = outputs.split("\n")[0].strip() + + return outputs + + +def read_prompts(prompt_path, prompt_type, n_example): + """Read prompt data""" + + if prompt_type == "knowledge": + # prompts for the knowledge generation + prompt_examples_dict = {} + # read prompt_path + with open(prompt_path, "r") as f: + for i, line in enumerate(f): + line = line.strip() + line_dict = json.loads(line) + key = list(line_dict.keys())[0] + + if key not in prompt_examples_dict: + prompt_examples = line_dict[key] + prompt = "" + for instance in prompt_examples: + instance = instance.strip() + prompt += instance + " \n" + prompt_examples_dict[key] = prompt + + return prompt_examples_dict + + else: + # prompts for the response generation + # read prompt_path + prompt = "" + with open(prompt_path, "r") as f: + prompt_examples = f.readlines() + prompt_examples = prompt_examples[:n_example] + for instance in prompt_examples: + instance = instance.strip() + prompt += instance + " \n" + + return prompt + + +def generate_samples_by_calling_api(): + """ Generate outputs by calling""" + args = get_args() + assert args.prompt_type in ["knowledge", "response"], \ + "Please input a correct prompt type!" + + if args.prompt_type == "knowledge": + # read knowledge generation prompts + knwl_gen_prompt_dict = read_prompts( + args.prompt_file, args.prompt_type, args.num_prompt_examples) + + else: + resp_gen_prompt = read_prompts( + args.prompt_file, args.prompt_type, args.num_prompt_examples) + + # read the test data + fname = open(args.sample_input_file, "r") + test_sample_list = fname.readlines() + # create output file + fname_out = open(args.sample_output_file, "w") + + # call the api to get the output generations + for test_sample in test_sample_list: + test_sample = test_sample.strip() + splits = test_sample.split("\t") + topic = splits[0] + + # prepare the inputs for the api + if args.prompt_type == "knowledge": + ## inputs = prompt + current test + # get the prompt + turns = splits[1].split(" [SEP] ") + last_turn = turns[-1] + key = topic + " " + last_turn + inputs = knwl_gen_prompt_dict[key] + + # add current test + inputs += "( " + last_turn + " ) " + topic + " =>" + + else: + # inputs = prompt + current test + # get the prompt + inputs = resp_gen_prompt + + # add current test + turns = splits[1].split(" [SEP] ") + knowledge = splits[2] + last_turn = turns[-1] + last_turn = " ".join(word_tokenize(last_turn)) + knowledge = " ".join(word_tokenize(knowledge)) + knowledge = knowledge.strip() + last_turn = last_turn.strip() + inputs += "Topic: " + topic + ". " + inputs += "User says: " + last_turn + " " + inputs += "We know that: " + knowledge + " " + inputs += "System replies:" + + # get the output generations from the api, + # and write to the output file + generations = call_model_api(inputs, args.out_seq_length) + fname_out.write(generations) + fname_out.write("\n") + + fname.close() + fname_out.close() + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process + ) + return model + + +def generate_samples_by_prompting_input_from_file(model): + """Prompt a pretrained language model to generate knowledge/response""" + + # get tokenizer + args = get_args() + tokenizer = get_tokenizer() + + # Read the sample file and open the output file. + assert args.sample_input_file is not None, \ + 'sample input file is not provided.' + if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: + fname = open(args.sample_input_file, "r") + all_raw_text = fname.readlines() + input_count = len(all_raw_text) + if args.sample_output_file is None: + sample_output_file = args.sample_input_file + ".out" + print('`sample-output-file` not specified, setting ' + 'it to {}'.format(sample_output_file)) + else: + sample_output_file = args.sample_output_file + + fname_out = open(sample_output_file, "w") + + # only two prompt types (i.e., knowledge and response) are allowed + assert args.prompt_type in ["knowledge", "response"], \ + "Please input a correct prompt type!" + + # Read the prompt file + if args.prompt_type == "knowledge": + # read the prompts for the knowledge generation + prompt_examples_dict = {} + with open(args.prompt_file, "r") as f: + for i, line in enumerate(f): + line = line.strip() + line_dict = json.loads(line) + key = list(line_dict.keys())[0] + + # get the prompt examples based on the key + if key not in prompt_examples_dict: + prompt_examples = line_dict[key] + prompt = "" + for instance in prompt_examples: + instance = instance.strip() + prompt += instance + " \n" + prompt_examples_dict[key] = prompt + + else: + # read the prompts for the response generation + # prompts are fixed for all test samples + with open(args.prompt_file, "r") as f: + prompt_examples = f.readlines() + prompt_examples = prompt_examples[:args.num_prompt_examples] + + prompt = "" + for instance in prompt_examples: + instance = instance.strip() + prompt += instance + " \n" + + input_pos = 0 + model.eval() + # perform prompting + with torch.no_grad(): + while True: + raw_text_len = 0 + if mpu.is_pipeline_first_stage() \ + and mpu.get_tensor_model_parallel_rank() == 0: + input_str = all_raw_text[input_pos] + input_str = input_str.strip() + splits = input_str.split("\t") + topic = splits[0] + + if args.prompt_type == "knowledge": + # first add the prompt into the raw_text + turns = splits[1].split(" [SEP] ") + last_turn = turns[-1] + key = topic + " " + last_turn + raw_text = prompt_examples_dict[key] + + # construct inputs for knowledge generation + # then add the constructed inputs into the raw_text + raw_text += "( " + last_turn + " ) " + topic + " =>" + + else: + # first add the prompt into the raw_text + raw_text = prompt + + # construct inputs for response generation + # then add the constructed inputs into the raw_text + turns = splits[1].split(" [SEP] ") + knowledge = splits[2] + last_turn = turns[-1] + last_turn = " ".join(word_tokenize(last_turn)) + knowledge = " ".join(word_tokenize(knowledge)) + knowledge = knowledge.strip() + last_turn = last_turn.strip() + raw_text += "Topic: " + topic + ". " + raw_text += "User says: " + last_turn + " " + raw_text += "We know that: " + knowledge + " " + raw_text += "System replies:" + + input_pos += 1 + raw_text_len = len(raw_text) + + else: + raw_text = "EMPTY TEXT" + + if input_pos % 100 == 0: + print_rank_0("input_pos: %d" % input_pos) + + outputs = generate_and_post_process( + model=model, + prompts=[raw_text], + tokens_to_generate=args.out_seq_length, + top_k_sampling=1) + prompts_plus_generations = outputs[0] + prompts_plus_generations = prompts_plus_generations[0] + + # write the generated output to the output file + if mpu.get_tensor_model_parallel_rank() == 0: + if mpu.is_pipeline_first_stage(): + + generations = prompts_plus_generations[raw_text_len:] + generations = generations.split("\n")[0] + generations = generations.strip() + fname_out.write(generations) + fname_out.write("\n") + + raw_text = None + if input_pos == input_count: + return + + +def main(): + + args = get_args() + if args.api_prompt: + # obtain the generations by calling the api + generate_samples_by_calling_api() + return + + if args.num_layers_per_virtual_pipeline_stage is not None: + print("Interleaved pipeline schedule is not yet supported for text generation.") + exit() + + # Set up model and load checkpoint. + model = get_model(model_provider, wrap_with_ddp=False) + if args.load is not None: + _ = load_checkpoint(model, None, None) + + assert len(model) == 1, "Above condition should have caught this" + model = model[0] + + # perform the prompting + generate_samples_by_prompting_input_from_file(model) diff --git a/tasks/orqa/README.md b/tasks/orqa/README.md new file mode 100644 index 0000000..58aa455 --- /dev/null +++ b/tasks/orqa/README.md @@ -0,0 +1,36 @@ +## End-to-End Training of Neural Retrievers for Open-Domain Question Answering + +Below we present the steps to run unsupervised and supervised training and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408). + +## Retriever Training + +#### Unsupervised pretraining +1. Use `tools/preprocess_data.py` to preprocess the dataset for Inverse Cloze Task (ICT), which we call unsupervised pretraining. This script takes as input a corpus in loose JSON format and creates fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block and multiple blocks per document. Run [`tools/preprocess_data.py`](../../tools/preprocess_data.py) to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit. We construct two datasets, one with the title of every document and another with the body. + +
+python tools/preprocess_data.py \
+    --input /path/to/corpus.json \
+    --json-keys text title \
+    --split-sentences \
+    --tokenizer-type BertWordPieceLowerCase \
+    --vocab-file /path/to/vocab.txt \
+    --output-prefix corpus_indexed \
+    --workers 10
+
+ +2. The [`examples/pretrain_ict.sh`](../../examples/pretrain_ict.sh) script runs a single GPU 217M parameter biencoder model for ICT retriever training. Single GPU training is primarily intended for debugging purposes, as the code is developed for distributed training. The script uses a pretrained BERT model and we use a total of batch size of 4096 for the ICT training. + +3. Evaluate the pretrained ICT model using [`examples/evaluate_retriever_nq.sh`](../../examples/evaluate_retriever_nq.sh) for [Google's Natural Questions Open dataset](https://arxiv.org/pdf/1906.00300.pdf). + +#### Supervised finetuning + +1. Use the above pretrained ICT model to finetune using [Google's Natural Questions Open dataset](https://github.com/google-research/language/tree/master/language/orqa). The script [`examples/finetune_retriever_distributed.sh`](../../examples/finetune_retriever_distributed.sh) provides an example for how to perform the training. Our finetuning process includes retriever score scaling and longer training (80 epochs) on top [DPR training](https://arxiv.org/abs/2004.04906). + +2. Evaluate the finetuned model using the same evaluation script as mentioned above for the unsupervised model. + +More details on the retriever are available in [our paper](https://arxiv.org/abs/2101.00408). + +## Reader Training + +The reader component will be available soon. + diff --git a/tasks/orqa/evaluate_orqa.py b/tasks/orqa/evaluate_orqa.py new file mode 100644 index 0000000..f960425 --- /dev/null +++ b/tasks/orqa/evaluate_orqa.py @@ -0,0 +1,39 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Main tasks functionality.""" + +from megatron.training import get_args, print_rank_0 +from megatron.legacy.indexer import IndexBuilder +from tasks.orqa.evaluate_utils import ORQAEvaluator + +def main(): + """ + Main program + """ + + args = get_args() + + """ + Create a BlockData data structure by running an IndexBuilder over an + ICT Dataset and then evaluate on NQ task + """ + + print_rank_0("Starting index builder!") + + index_builder = IndexBuilder() + index_builder.build_and_save_index() + print_rank_0("Build and save indices: done!") + + + print_rank_0("Starting evaluations!") + + # Set up the model and evaluator + evaluator = ORQAEvaluator() + + # Run evaluation + if args.qa_data_dev is not None: + evaluator.evaluate(args.qa_data_dev, "DEV") + + if args.qa_data_test is not None: + evaluator.evaluate(args.qa_data_test, "TEST") + diff --git a/tasks/orqa/evaluate_utils.py b/tasks/orqa/evaluate_utils.py new file mode 100644 index 0000000..b7ce3fc --- /dev/null +++ b/tasks/orqa/evaluate_utils.py @@ -0,0 +1,175 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch + +from megatron.training import get_args, print_rank_0 +from megatron.training.checkpointing import load_biencoder_checkpoint +from megatron.legacy.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset +from megatron.legacy.data.realm_index import OpenRetreivalDataStore, FaissMIPSIndex +from megatron.legacy.model.biencoder_model import get_model_provider +from megatron.training import get_model +from tasks.orqa.unsupervised.nq import get_nq_dataset +from tasks.orqa.unsupervised.nq import get_one_epoch_nq_dataloader +from tasks.orqa.unsupervised.nq import process_nq_batch +from tasks.orqa.unsupervised.qa_utils import calculate_matches + + +class ORQAEvaluator(object): + def __init__(self): + args = get_args() + self.embedding_size = args.hidden_size + self.faiss_use_gpu = args.faiss_use_gpu + self.evidence_embedder_obj = None + self.evidence_dataset = None + self.mips_index = None + self.eval_dataset = None + + # Get Evidence (Wikipedia) dataset + self.get_evidence_dataset() + + # Load query encoder checkpoint + only_query_model = True + if args.biencoder_shared_query_context_model: + only_query_model = False + + model = get_model(get_model_provider(only_query_model=only_query_model, + biencoder_shared_query_context_model=args.biencoder_shared_query_context_model)) + + self.model = load_biencoder_checkpoint(model, + only_query_model=only_query_model) + + assert len(self.model) == 1 + self.model[0].eval() + + # Load faiss indexer + self.faiss_wrapper() + + def get_evidence_embedding(self): + # This will load the embedding from the embedding path + self.evidence_embedder_obj = OpenRetreivalDataStore(load_from_path=True) + + def get_evidence_dataset(self): + self.evidence_dataset = get_open_retrieval_wiki_dataset() + + def faiss_wrapper(self): + # Initialize FAISS wrapper on local rank = 0 as the evidence embeddings + # is distributed over all the GPUs in a node and FAISS is not + # thread-safe + args = get_args() + if args.local_rank == 0: + # Get evidence embeddings computed using context encoder + self.get_evidence_embedding() + + assert self.evidence_embedder_obj is not None + self.mips_index = FaissMIPSIndex(embed_size=self.embedding_size, + embed_data=self.evidence_embedder_obj, + use_gpu=self.faiss_use_gpu) + + # Wait for the FAISS index to be initialized in all the nodes + torch.distributed.barrier() + + def generate_query_vectors(self, qa_data, split): + + self.eval_dataset = get_nq_dataset(qa_data, split) + dataloader = get_one_epoch_nq_dataloader(self.eval_dataset) + + query_vectors = [] + reference_list = [] + + for batch in dataloader: + # batch also has query_tokens and query_pad_data + query_tokens, query_mask, query_types, \ + query_len, reference = process_nq_batch(batch) + + assert len(self.model) == 1 + unwrapped_model = self.model[0] + while not hasattr(unwrapped_model, 'embed_text'): + unwrapped_model = unwrapped_model.module + + with torch.no_grad(): + query_logits = unwrapped_model.embed_text( + unwrapped_model.query_model, query_tokens, + query_mask, query_types) + + reference_list.extend(reference) + query_vectors.extend(query_logits.split(1, dim=0)) + if len(query_vectors) % 100 == 0: + print_rank_0('Encoded queries {}'.format(len(query_vectors))) + + query_tensor = torch.cat(query_vectors, dim=0) + print_rank_0('Total encoded queries tensor {}'.format(query_tensor.size())) + + assert query_tensor.size(0) == len(self.eval_dataset) + return query_tensor, reference_list + + def evaluate(self, qa_data, split): + args = get_args() + query_tensor, reference_list = self.generate_query_vectors(qa_data, \ + split) + local_rank = args.local_rank + rank = torch.distributed.get_rank() + device_count = torch.cuda.device_count() + num_nodes = torch.distributed.get_world_size() // device_count + node_id = rank // device_count + + for node in range(num_nodes): + start_rank = node * device_count + end_rank = (node + 1) * device_count + ranks_list = list(range(start_rank, end_rank)) + node_group = torch.distributed.new_group(ranks=ranks_list) + + if node_id == node: + device_start_rank = start_rank + group = node_group + + input_ = torch.empty_like(query_tensor).copy_(query_tensor).detach_() + tensor_list = [torch.empty_like(input_) for _ in range(device_count)] + torch.distributed.all_gather(tensor_list, query_tensor, group=group) + + if local_rank == 0 and self.mips_index is not None: + all_query_tensor = torch.cat(tensor_list, dim=0).contiguous() + + distance, topkindex = self.mips_index.search_mips_index( + all_query_tensor, top_k=args.faiss_topk_retrievals, + reconstruct=False) + distance = torch.from_numpy(distance).cuda() + topkindex = torch.LongTensor(topkindex).cuda() + + if local_rank != 0: + distance = torch.empty(device_count * len(query_tensor), \ + args.faiss_topk_retrievals, dtype=torch.float32).cuda() + topkindex = torch.empty(device_count * len(query_tensor), \ + args.faiss_topk_retrievals, dtype=torch.int64).cuda() + + torch.distributed.broadcast(distance, src=device_start_rank, \ + group=group) + torch.distributed.broadcast(topkindex, src=device_start_rank, \ + group=group) + + distance = torch.split(distance, len(query_tensor), dim=0)\ + [local_rank] + topkindex = torch.split(topkindex, len(query_tensor), dim=0)\ + [local_rank] + + top_ids_and_scores = [] + for darray, topkarray in zip(distance, topkindex): + top_ids_and_scores.append((topkarray.tolist(), darray.tolist())) + + passages = self.evidence_dataset.id2text + match_stats = calculate_matches(passages, + reference_list, + top_ids_and_scores, + workers_num=args.num_workers, + match_type=args.faiss_match) + top_k_hits = match_stats.top_k_hits + + print_rank_0("{} SET RESULTS".format(split)) + print_rank_0("topk-{} documents hits {}".format( + args.faiss_topk_retrievals, top_k_hits)) + top_k_hits = [v / len(top_ids_and_scores) for v in top_k_hits] + print_rank_0("top-k documents hits accuracy {}".format(top_k_hits)) + + for i in args.retriever_report_topk_accuracies: + print_rank_0("top-{}: {:.2f}".format(i, top_k_hits[i-1] * 100)) + + return diff --git a/tasks/orqa/supervised/data.py b/tasks/orqa/supervised/data.py new file mode 100644 index 0000000..89ae60c --- /dev/null +++ b/tasks/orqa/supervised/data.py @@ -0,0 +1,287 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""ORQA dataset.""" + +import json +import random +from abc import ABC +from abc import abstractmethod + +import numpy as np +from torch.utils.data import Dataset + +from megatron.training import print_rank_0, get_args +from megatron.legacy.data.biencoder_dataset_utils import make_attention_mask + +def build_token_types_from_context_list(ctx_list, tokenizer, max_seq_length): + ctx_id_list, ctx_types_list = [], [] + for context in ctx_list: + title_ids = tokenizer.tokenize(context['title']) + ctx_ids = tokenizer.tokenize(context['text']) + ctx_ids = title_ids + [tokenizer.sep_id] + ctx_ids + + ctx_ids, ctx_types, _ = build_tokens_types_paddings_from_ids(ctx_ids, + max_seq_length, tokenizer.cls, + tokenizer.sep, tokenizer.pad) + ctx_id_list.append(ctx_ids) + ctx_types_list.append(ctx_types) + + return ctx_id_list, ctx_types_list + + +def build_tokens_types_paddings_from_text(query, context, + tokenizer, max_seq_length): + """Build token types and paddings, trim if needed, and pad if needed.""" + + query_ids = tokenizer.tokenize(query) + query_ids, query_types, query_pad_mask = \ + build_tokens_types_paddings_from_ids(query_ids, max_seq_length, \ + tokenizer.cls, tokenizer.sep, tokenizer.pad) + + # Appending the title of the context at front + extended_ctx_ids = None + if context is not None: + title_ids = tokenizer.tokenize(context['title']) + ctx_ids = tokenizer.tokenize(context['text']) + extended_ctx_ids = title_ids + [tokenizer.sep] + ctx_ids + + ctx_ids, ctx_types, ctx_pad_mask = \ + build_tokens_types_paddings_from_ids(extended_ctx_ids, + max_seq_length, tokenizer.cls, tokenizer.sep, tokenizer.pad) + + return query_ids, query_types, query_pad_mask, \ + ctx_ids, ctx_types, ctx_pad_mask + + +# Similar code tasks/data_utils with some changes +def build_tokens_types_paddings_from_ids(text_ids, max_seq_length, + cls_id, sep_id, pad_id): + """Build token types and paddings, trim if needed, and pad if needed.""" + enc_ids = [] + tokentypes_enc = [] + + # [CLS]. + enc_ids.append(cls_id) + tokentypes_enc.append(0) + + # A. + len_src = len(text_ids) + enc_ids.extend(text_ids) + tokentypes_enc.extend([0] * len_src) + + # Cap the size. + if len(enc_ids) > max_seq_length - 1: + enc_ids = enc_ids[0: max_seq_length - 1] + tokentypes_enc = tokentypes_enc[0: max_seq_length - 1] + + # [SEP]. + enc_ids.append(sep_id) + tokentypes_enc.append(0) + + num_tokens_enc = len(enc_ids) + # Padding. + padding_length = max_seq_length - len(enc_ids) + if padding_length > 0: + enc_ids.extend([pad_id] * padding_length) + tokentypes_enc.extend([pad_id] * padding_length) + + pad_mask = ([1] * num_tokens_enc) + ([0] * padding_length) + pad_mask = np.array(pad_mask, dtype=np.int64) + + return enc_ids, tokentypes_enc, pad_mask + + +def build_sample(query_ids, query_types, query_pad_mask, + ctx_ids, ctx_types, ctx_pad_mask, answers, + neg_ctx_id_list=None, neg_ctx_types_list=None, + include_neg=False): + """Convert to numpy and return a sample consumed by the batch producer.""" + + query_ids = np.array(query_ids, dtype=np.int64) + query_types = np.array(query_types, dtype=np.int64) + query_mask = make_attention_mask(query_ids, query_ids) + + ctx_ids = np.array(ctx_ids, dtype=np.int64) + ctx_types = np.array(ctx_types, dtype=np.int64) + ctx_mask = make_attention_mask(ctx_ids, ctx_ids) + + sample = ({ + 'query': query_ids, + 'query_mask': query_mask, + 'query_types': query_types, + 'query_pad_mask': query_pad_mask, + 'context': ctx_ids, + 'context_mask': ctx_mask, + 'context_types': ctx_types, + 'context_pad_mask': ctx_pad_mask, + 'reference': answers + }) + + if include_neg: + neg_ctx_ids = np.array(neg_ctx_id_list, dtype=np.int64) + neg_ctx_id_types = np.array(neg_ctx_types_list, dtype=np.int64) + neg_ctx_mask = np.array([make_attention_mask(ids, ids) \ + for ids in neg_ctx_ids], dtype=np.int64) + + sample['neg_context'] = neg_ctx_ids + sample['neg_context_types'] = neg_ctx_id_types + sample['neg_context_mask'] = neg_ctx_mask + + return sample + + +class OpenRetrievalAbstractDataset(ABC, Dataset): + """Open Retrieval base dataset class.""" + + def __init__(self, task_name, dataset_name, datapaths, tokenizer, \ + max_seq_length, evaluate=False): + # Store inputs. + args = get_args() + self.evaluate = evaluate + self.val_av_rank_hard_neg = args.val_av_rank_hard_neg + self.val_av_rank_other_neg = args.val_av_rank_other_neg + self.train_with_neg = args.train_with_neg + self.train_hard_neg = args.train_hard_neg + + self.task_name = task_name + self.dataset_name = dataset_name + self.tokenizer = tokenizer + self.max_seq_length = max_seq_length + print_rank_0(' > building {} dataset for {}:'.format(self.task_name, + self.dataset_name)) + # Process the files. + string = ' > paths:' + for path in datapaths: + string += ' ' + path + print_rank_0(string) + self.samples = [] + for datapath in datapaths: + self.samples.extend(self.process_samples_from_single_path(datapath)) + + args = get_args() + if args.sample_rate < 1: # subsample + k = int(len(self.samples) * args.sample_rate) + self.samples = random.sample(self.samples, k) + + print_rank_0(' >> total number of samples: {}'.format( + len(self.samples))) + + def __len__(self): + return len(self.samples) + + def __getitem__(self, idx): + raw_sample = self.samples[idx] + + query_ids, query_types, query_pad_mask, ctx_ids, ctx_types, \ + ctx_pad_mask = build_tokens_types_paddings_from_text( \ + raw_sample['question'], raw_sample['pos_context'], \ + self.tokenizer, self.max_seq_length) + + if self.evaluate: + neg_ctx_list = \ + raw_sample['negative_context'][:self.val_av_rank_other_neg] + \ + raw_sample['hard_negative_context'][:self.val_av_rank_hard_neg] + neg_ctx_id_list, neg_ctx_types_list = \ + build_token_types_from_context_list(neg_ctx_list, \ + self.tokenizer, self.max_seq_length) + + elif self.train_with_neg: + hard_negative_ctx = raw_sample['hard_negative_context'] + negative_ctx = raw_sample['negative_context'] + if True: # TODO: fix this or remove this condition + random.shuffle(hard_negative_ctx) + random.shuffle(negative_ctx) + + neg_ctx_list = hard_negative_ctx[:self.train_hard_neg] + # In the Google NQ dataset by DPR paper, there are around more than + # 50 missing hard negatives in training data. + # In those cases, substitute hard negatives by simple negatives. + if len(neg_ctx_list) < self.train_hard_neg: + neg_ctx_list += negative_ctx[:self.train_hard_neg - \ + len(neg_ctx_list)] + + neg_ctx_id_list, neg_ctx_types_list = \ + build_token_types_from_context_list(neg_ctx_list, + self.tokenizer, self.max_seq_length) + else: + neg_ctx_id_list = None + neg_ctx_types_list = None + + sample = build_sample(query_ids, query_types, query_pad_mask, + ctx_ids, ctx_types, ctx_pad_mask, + raw_sample['answers'], + neg_ctx_id_list, neg_ctx_types_list, + include_neg=self.evaluate or self.train_with_neg) + + return sample + + @staticmethod + @abstractmethod + def process_samples_from_single_path(filename): + """Abstract method that takes a filename and + returns a list of dataset samples, each sample being a dict of + {'text': string, 'text': string} + """ + pass + + + +def normalize_question(question): + if question[-1] == '?': + question = question[:-1] + return question + +# The following class reads the datasets for training retriever as +# prepared by the DPR codebase (https://github.com/facebookresearch/DPR) + +class NQSupervisedDataset(OpenRetrievalAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, \ + evaluate=False): + super().__init__('natural_questions_ret', + name, + datapaths, + tokenizer, + max_seq_length, + evaluate=evaluate) + + @staticmethod + def process_samples_from_single_path(filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + samples = [] + total = 0 + + with open(filename, 'r', encoding="utf-8") as f: + data = json.load(f) + for row in data: + question = normalize_question(row['question']) + pos_context = row['positive_ctxs'][0] + + # Hard Negative Contexts + if len(row['hard_negative_ctxs']) > 0: + hard_neg_context = row['hard_negative_ctxs'] + else: + hard_neg_context = [] + + # Negative Contexts + if len(row['negative_ctxs']) > 0: + neg_context = row['negative_ctxs'] + else: + neg_context = [] + + answers = row['answers'] + sample = {'question': question, + 'pos_context': pos_context, + 'hard_negative_context': hard_neg_context, + 'negative_context': neg_context, + 'answers': answers} + total += 1 + samples.append(sample) + + if total % 5000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples + diff --git a/tasks/orqa/supervised/eval_utils.py b/tasks/orqa/supervised/eval_utils.py new file mode 100644 index 0000000..27af475 --- /dev/null +++ b/tasks/orqa/supervised/eval_utils.py @@ -0,0 +1,193 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Evaluation utilities.""" +from collections import OrderedDict +import math +import numpy as np +import time +import torch +import torch.nn.functional as F +from torch.utils.data import DataLoader + +from megatron.training import get_args, print_rank_0 +from megatron.core import mpu +from megatron.training.utils import average_losses_across_data_parallel_group +from tasks.finetune_utils import build_data_loader + +def task_collate_fn(batch_data): + # generate batch + batch_size = len(batch_data) + tensorized = OrderedDict() + for d in batch_data: + for k, v in d.items(): + tensorized.setdefault(k, []).append(v) + + tensorized['query'] = torch.LongTensor(tensorized['query']) + tensorized['query_mask'] = torch.LongTensor(tensorized['query_mask']) + tensorized['query_types'] = torch.LongTensor(tensorized['query_types']) + tensorized['query_pad_mask'] = \ + torch.LongTensor(tensorized['query_pad_mask']) + + tensorized['context'] = torch.LongTensor(tensorized['context']) + tensorized['context_mask'] = \ + torch.LongTensor(tensorized['context_mask']) + tensorized['context_types'] = \ + torch.LongTensor(tensorized['context_types']) + tensorized['context_pad_mask'] = \ + torch.LongTensor(tensorized['context_pad_mask']) + + if 'neg_context' in tensorized: + tensorized['neg_context'] = \ + torch.LongTensor(np.concatenate(tensorized['neg_context'])) + tensorized['neg_context_mask'] = \ + torch.LongTensor(np.concatenate(tensorized['neg_context_mask'])) + tensorized['neg_context_types'] = \ + torch.LongTensor(np.concatenate(tensorized['neg_context_types'])) + + return tensorized + + + +def process_batch(batch): + """Process batch and produce inputs for the model.""" + query_tokens = batch['query'].long().cuda() + query_mask = (batch['query_mask'] < 0.5).cuda() + query_types = batch['query_types'].long().cuda() + query_pad_mask = batch['query_pad_mask'].long().cuda() + + context_tokens = batch['context'].long().cuda() + context_mask = (batch['context_mask'] < 0.5).cuda() + context_types = batch['context_types'].long().cuda() + context_pad_mask = batch['context_pad_mask'].long().cuda() + + if 'neg_context' in batch: + neg_context_tokens = batch['neg_context'].long().cuda() + neg_context_mask = (batch['neg_context_mask'] < 0.5).cuda() + neg_context_types = batch['neg_context_types'].long().cuda() + else: + neg_context_tokens = None + neg_context_mask = None + neg_context_types = None + + reference = batch['reference'] + + return query_tokens, query_mask, query_types, query_pad_mask, \ + context_tokens, context_mask, context_types, context_pad_mask, \ + neg_context_tokens, neg_context_mask, neg_context_types, reference + +def accuracy_func_provider(single_dataset_provider, rank0sampler=False): + """Provide function that calculates accuracies.""" + args = get_args() + + print_rank_0("accuracy_func_provider is CALLED") + + # Build dataloaders + datapath = args.valid_data + dataset = single_dataset_provider(datapath) + + drop_last = False + if mpu.get_data_parallel_world_size() > 1 and not rank0sampler: + drop_last = True + + print_rank_0(datapath) + print_rank_0(rank0sampler) + + dataloader = build_data_loader(dataset, + args.eval_micro_batch_size, + num_workers=args.num_workers, + drop_last=drop_last, + task_collate_fn=task_collate_fn) + dataloaders = (dataset.dataset_name, dataloader) + + def metrics_func(model, epoch, output_predictions=False): + print_rank_0('calculating metrics by accuracy func in ORQA...') + + if output_predictions: + assert rank0sampler + names = 'predictions' + name, dataloader = dataloaders + if args.task == "RET-FINETUNE-NQ": + start_time = time.time() + output = retrieval_loss(model, dataloader) + stats_dict, total = output + format_string = "" + for k, v in stats_dict.items(): + format_string += "|{} = {:.2f}".format(k, v / total) + print_rank_0("epoch:{}{}".format(epoch, format_string)) + print_rank_0("taken time to calcuate metrics {:.3f}".format(\ + time.time() - start_time)) + else: + raise AssertionError("{} Task not supported".format(args.task)) + + return metrics_func + + +def retrieval_loss(model, dataloader): + args = get_args() + total = 0 + topk_stats_dict = {'top{}_acc'.format(k): 0 for k in \ + args.retriever_report_topk_accuracies} + stats_dict = dict(rank=0, **topk_stats_dict) + + assert len(model) == 1 + unwrapped_model = model[0] + unwrapped_model.eval() + + with torch.no_grad(): + # For all the batches in the dataset. + for batch in dataloader: + # Run the model forward. + query_tokens, query_mask, query_types, _, \ + context_tokens, context_mask, context_types, _, \ + neg_context_tokens, neg_context_mask, neg_context_types, \ + reference = process_batch(batch) + + query_logits, context_logits = unwrapped_model(query_tokens, + query_mask, query_types, + torch.cat([context_tokens, neg_context_tokens]), + torch.cat([context_mask, neg_context_mask]), + torch.cat([context_types, neg_context_types])) + + retrieval_scores = torch.matmul(query_logits, + torch.transpose(context_logits, 0, 1)) + + if args.retriever_score_scaling: + retrieval_scores = retrieval_scores / \ + math.sqrt(args.hidden_size) + + local_batch_size = query_logits.shape[0] + labels = torch.arange(local_batch_size).long().cuda() + + softmax_scores = F.softmax(retrieval_scores, dim=1) + sorted_vals, sorted_indices = torch.topk(softmax_scores, + k=softmax_scores.shape[1], + sorted=True) + + def topk_accuracy(k): + return torch.cuda.FloatTensor( + [sum([int(labels[i] in sorted_indices[i, :k]) for i in \ + range(local_batch_size)])]) + + def get_rank(): + return torch.cuda.FloatTensor( + [sum([torch.nonzero(labels[i] == sorted_indices[i])[0][0] \ + for i in range(local_batch_size)])]) + + topk_accs = [topk_accuracy(k) for k in \ + args.retriever_report_topk_accuracies] + rank = get_rank() + losses = average_losses_across_data_parallel_group([rank, \ + *topk_accs]) + + # create stats_dict with retrieval loss and all specified + # top-k accuracies + topk_acc_dict = {'top{}_acc'.format(k): v * 100 for k, v in \ + zip(args.retriever_report_topk_accuracies, losses[1:])} + temp_stats_dict = dict(rank=losses[0], **topk_acc_dict) + for k in stats_dict.keys(): + stats_dict[k] += temp_stats_dict[k] + total += local_batch_size + + unwrapped_model.train() + + return stats_dict, total diff --git a/tasks/orqa/supervised/finetune.py b/tasks/orqa/supervised/finetune.py new file mode 100644 index 0000000..f09c403 --- /dev/null +++ b/tasks/orqa/supervised/finetune.py @@ -0,0 +1,238 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""ORQA finetuning/evaluation.""" + +from functools import partial +import sys + +import math +import torch +import torch.nn.functional as F + +from megatron.training import get_args, get_timers, get_tokenizer, print_rank_0 +from megatron.core import mpu +from megatron.legacy.indexer import IndexBuilder +from megatron.legacy.model.biencoder_model import biencoder_model_provider +from megatron.training.utils import average_losses_across_data_parallel_group +from pretrain_ict import get_group_world_size_rank +from tasks.finetune_utils import finetune +from tasks.orqa.supervised.eval_utils import accuracy_func_provider +from tasks.orqa.supervised.eval_utils import process_batch, task_collate_fn +from tasks.orqa.evaluate_utils import ORQAEvaluator + +# input_ is a 2D tensor +def check_and_append_tensor_for_gather(group, rank, world_size, input_): + + # gather the size of the first dimension of the tensor from all ranks + current_length = input_.size()[0] + first_dim = torch.tensor([[current_length]], + device=torch.cuda.current_device()) + input_list = [torch.empty_like(first_dim) for _ in range(world_size)] + input_list[rank].copy_(first_dim) + torch.distributed.all_gather(input_list, first_dim, group=group) + all_input_list = torch.cat(input_list, dim=0).contiguous() + max_length = torch.max(all_input_list) + + # if the size are different than the max, extend the tensor + # accordingly + if max_length > current_length: + padding=tuple([0] * (input_.dim() * 2 - 1)) + \ + tuple([max_length - current_length]) + input_ = F.pad(input=input_, pad=padding) + + return input_ + +def orqa(Dataset): + + def cross_entropy_forward_step(batch, model): + """Simple forward step with cross-entropy loss.""" + timers = get_timers() + tokenizer = get_tokenizer() + + # Get the batch. + timers('batch generator', log_level=2).start() + try: + batch_ = next(batch) + except BaseException: + batch_ = batch + + group, rank, world_size = get_group_world_size_rank() + + query_tokens, query_mask, query_types, query_pad_mask, \ + context_tokens, context_mask, context_types, context_pad_mask, \ + neg_context_tokens, neg_context_mask, neg_context_types, \ + reference = process_batch(batch_) + + timers('batch generator').stop() + local_batch_size = query_tokens.shape[0] + + # Text representation of query and context + query_list, context_list = [], [] + for i in range(local_batch_size): + query_list.append(tokenizer.decode(query_tokens[i].tolist())) + context_list.append(tokenizer.decode(context_tokens[i].tolist())) + + if neg_context_tokens is not None: + neg_context_tokens = check_and_append_tensor_for_gather(group, + rank, world_size, neg_context_tokens) + neg_context_mask = check_and_append_tensor_for_gather(group, + rank, world_size, neg_context_mask) + neg_context_types = check_and_append_tensor_for_gather(group, + rank, world_size, neg_context_types) + + if neg_context_tokens is not None: + context_tokens = torch.cat([context_tokens, neg_context_tokens]) + context_mask = torch.cat([context_mask, neg_context_mask]) + context_types = torch.cat([context_types, neg_context_types]) + + # Forward model. + output_tensor = model(query_tokens, query_mask, + query_types, context_tokens, + context_mask, context_types) + return output_tensor, partial(cross_entropy_loss_func, query_tokens, context_tokens) + + + def cross_entropy_loss_func(query_tokens, context_tokens, output_tensor): + args = get_args() + + local_batch_size = query_tokens.shape[0] + group, rank, world_size = get_group_world_size_rank() + # recall we assert that model_parallel_size == 1 + global_batch_size = world_size * local_batch_size + + query_logits, context_logits = output_tensor + + if world_size > 1: + input_ = torch.empty_like(context_logits).copy_(\ + context_logits).detach_() + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank].copy_(input_) + torch.distributed.all_gather(tensor_list, input_, group=group) + + # Check if all-gather happens in order + assert tensor_list[rank].sum().item() == \ + context_logits.sum().item() + + # Preserves the gradient + tensor_list[rank] = context_logits + all_context_logits = torch.cat(tensor_list, dim=0).contiguous() + + # Query tensors + input_ = torch.empty_like(query_logits).copy_(\ + query_logits).detach_() + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank].copy_(input_) + torch.distributed.all_gather(tensor_list, input_, group=group) + + # Check if all-gather happens in order + assert tensor_list[rank].sum().item() == query_logits.sum().item() + + # Preserves the gradient + tensor_list[rank] = query_logits + all_query_logits = torch.cat(tensor_list, dim=0).contiguous() + else: + all_query_logits = query_logits + all_context_logits = context_logits + + retrieval_scores = torch.matmul(all_query_logits, + torch.transpose(all_context_logits, 0, 1)) + # Scaling the retrieval scores + if args.retriever_score_scaling: + retrieval_scores = retrieval_scores / math.sqrt(args.hidden_size) + + if args.train_with_neg: + # if the world size is 3, local batch size is 4, and + # local context size is 8, what we want is + # labels = [0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19] + labels = [] + local_context_size = context_tokens.shape[0] + for i in range(world_size): + j = i * local_context_size + labels.extend(list(range(j, j + local_batch_size))) + labels = torch.LongTensor(labels).cuda() + assert len(labels) == global_batch_size + else: + labels = torch.arange(global_batch_size).long().cuda() + + # Cross-entropy loss. + softmax_scores = F.log_softmax(retrieval_scores, dim=1) + + loss = F.nll_loss(softmax_scores, labels, reduction='mean') + + max_score, max_idxs = torch.max(softmax_scores, 1) + correct_predictions_count = (max_idxs == labels).sum().float() + + # Reduce loss for logging. + reduced_loss = average_losses_across_data_parallel_group([loss, \ + correct_predictions_count]) + + # Loss scaling for correct losses in Supervised Retrieval + loss = loss * mpu.get_data_parallel_world_size() + + return loss, {'lm loss': reduced_loss[0], + 'correct_prediction_count': reduced_loss[1]} + + + def train_valid_datasets_provider(): + """Build train and validation dataset.""" + args = get_args() + tokenizer = get_tokenizer() + + train_dataset = Dataset('training', + args.train_data, + tokenizer, + args.retriever_seq_length, + evaluate=False) + valid_dataset = Dataset('validation', + args.valid_data, + tokenizer, + args.retriever_seq_length, + evaluate=True) + return train_dataset, valid_dataset + + def model_provider(pre_process=True, post_process=True): + """Build the model.""" + args = get_args() + print_rank_0('building retriever model for {} ...'.format(args.task)) + + model = biencoder_model_provider(only_context_model=False, + only_query_model=False, + biencoder_shared_query_context_model=\ + args.biencoder_shared_query_context_model, + pre_process=pre_process, post_process=post_process) + + return model + + def single_dataset_provider(datapath): + args = get_args() + tokenizer = get_tokenizer() + + name = datapath[0].split('/')[-1].split('.')[0] + return Dataset(name, + datapath, + tokenizer, + args.retriever_seq_length, + evaluate=True) + + def metrics_func_provider(): + """Provide metrics callback function.""" + return accuracy_func_provider(single_dataset_provider) + + """Finetune/evaluate.""" + finetune(train_valid_datasets_provider, + model_provider, + forward_step=cross_entropy_forward_step, + end_of_epoch_callback_provider=metrics_func_provider, + task_collate_fn=task_collate_fn) + +def main(): + args = get_args() + + if args.task == 'RET-FINETUNE-NQ': + from tasks.orqa.supervised.data import NQSupervisedDataset as Dataset + else: + raise NotImplementedError('ORQA task {} is not implemented.'.format( + args.task)) + + orqa(Dataset) + diff --git a/tasks/orqa/unsupervised/nq.py b/tasks/orqa/unsupervised/nq.py new file mode 100644 index 0000000..2d1bfca --- /dev/null +++ b/tasks/orqa/unsupervised/nq.py @@ -0,0 +1,215 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +""" + Data Loader for Google NQ dataset +""" + +from abc import ABC +import csv +from collections import OrderedDict +import numpy as np + +import torch +from torch.utils.data import DataLoader +from torch.utils.data import Dataset, BatchSampler + +from megatron.training import print_rank_0, get_args, get_tokenizer +from megatron.legacy.data.biencoder_dataset_utils import make_attention_mask + +def get_nq_dataset(qa_data, split): + args = get_args() + tokenizer = get_tokenizer() + + dataset = NQDataset('Google NQ {} Split'.format(split), + 'Google Natural Questions', + qa_data, + tokenizer, + args.retriever_seq_length) + return dataset + + +def process_nq_batch(batch): + query_tokens = batch['token_ids'].long().cuda() + query_mask = (batch['token_mask'] < 0.5).cuda() + query_types = batch['token_types'].long().cuda() + query_len = batch['seq_len'].long().cuda() + reference = batch['reference'] + + return query_tokens, query_mask, query_types, query_len, reference + + +class CustomDataLoader(DataLoader): + def __init__(self, dataset, eval=False, **kwargs): + if kwargs.get('collate_fn', None) is None: + kwargs['collate_fn'] = self._collate_fn + self.eval = eval + super().__init__(dataset, **kwargs) + + def _collate_fn(self, batch_data): + # generate batch + batch_size = len(batch_data) + tensorized = OrderedDict() + for d in batch_data: + for k, v in d.items(): + tensorized.setdefault(k, []).append(v) + assert len(tensorized) == 5 + + tensorized['token_ids'] = torch.LongTensor(tensorized['token_ids']) + tensorized['token_mask'] = torch.LongTensor(tensorized['token_mask']) + tensorized['token_types'] = torch.LongTensor(tensorized['token_types']) + tensorized['seq_len'] = torch.LongTensor(tensorized['seq_len']) + return tensorized + + +def get_one_epoch_nq_dataloader(dataset, micro_batch_size=None): + """Data loader. Note that batch-size is the local (per GPU) batch-size. + NOTE: This dataloader is not distributed !!! + """ + + args = get_args() + if micro_batch_size is None: + micro_batch_size = args.micro_batch_size + num_workers = args.num_workers + + sampler = torch.utils.data.SequentialSampler(dataset) + # importantly, drop_last must be False to get all the data. + batch_sampler = BatchSampler(sampler, + batch_size=micro_batch_size, + drop_last=False) + + # Data loader. Note that batch size is the per GPU batch size. + data_loader = CustomDataLoader(dataset, + batch_sampler=batch_sampler, + num_workers=num_workers, + pin_memory=True) + return data_loader + + +def build_tokens_types_paddings_from_text(src_text, tokenizer, max_seq_length): + """Build token types and paddings, trim if needed, and pad if needed.""" + + src_text_ids = tokenizer.tokenize(src_text) + + return build_tokens_types_paddings_from_ids(src_text_ids, + max_seq_length, + tokenizer.cls, + tokenizer.sep, + tokenizer.pad) + + +def build_tokens_types_paddings_from_ids(src_ids, max_seq_length, cls_id, \ + sep_id, pad_id): + """ + Build token types and paddings, trim if needed, and pad if needed. + + TODO: Design modular interface to reuse this function. This is getting + repeated multiple times in different tasks + """ + + enc_ids = [] + tokentypes_enc = [] + + # [CLS]. + enc_ids.append(cls_id) + tokentypes_enc.append(0) + + # A. + len_src = len(src_ids) + enc_ids.extend(src_ids) + tokentypes_enc.extend([0] * len_src) + + # Cap the size. + if len(enc_ids) > max_seq_length - 1: + enc_ids = enc_ids[0: max_seq_length - 1] + tokentypes_enc = tokentypes_enc[0: max_seq_length - 1] + + # [SEP]. + enc_ids.append(sep_id) + tokentypes_enc.append(0) + + num_tokens_enc = len(enc_ids) + # Padding. + padding_length = max_seq_length - len(enc_ids) + if padding_length > 0: + enc_ids.extend([pad_id] * padding_length) + tokentypes_enc.extend([pad_id] * padding_length) + + return enc_ids, tokentypes_enc, num_tokens_enc + + +def build_sample(token_ids, token_types, num_tokens, reference): + """ + Convert to numpy and return a sample consumed by the + batch producer. + """ + + token_ids = np.array(token_ids, dtype=np.int64) + token_types = np.array(token_types, dtype=np.int64) + token_mask = make_attention_mask(token_ids, token_ids) + + sample = ({ + 'token_ids': token_ids, + 'token_mask': token_mask, + 'token_types': token_types, + 'seq_len': num_tokens, + 'reference': reference + }) + return sample + + +class NQDataset(ABC, Dataset): + """ + Open Retrieval Question Answering evaluation using Google NQ dataset. + """ + + def __init__(self, task_name, dataset_name, datapath, + tokenizer, max_seq_length): + # Store inputs. + self.task_name = task_name + self.dataset_name = dataset_name + self.tokenizer = tokenizer + self.max_seq_length = max_seq_length + print_rank_0(' > building {} dataset for {}:'.format(self.task_name, + self.dataset_name)) + print_rank_0(datapath) + self.samples = self.process_samples_from_single_path(datapath) + print_rank_0(' >> total number of samples: {}'.format(\ + len(self.samples))) + + def __len__(self): + return len(self.samples) + + def __getitem__(self, idx): + raw_sample = self.samples[idx] + + ques_tokens, tokentypes_enc, num_tokens_ques = \ + build_tokens_types_paddings_from_text(raw_sample['question'], + self.tokenizer, self.max_seq_length) + + sample = build_sample(ques_tokens, + tokentypes_enc, + num_tokens_ques, + raw_sample['answers']) + return sample + + @staticmethod + def process_samples_from_single_path(filename): + print_rank_0(' > Processing {} ...'.format(filename)) + samples = [] + total = 0 + + with open(filename, 'r') as ifile: + reader = csv.reader(ifile, delimiter='\t') + for row in reader: + question = row[0] + answers = eval(row[1]) + + sample = {'question': question, 'answers': answers} + total += 1 + samples.append(sample) + + if total % 1000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/orqa/unsupervised/qa_utils.py b/tasks/orqa/unsupervised/qa_utils.py new file mode 100644 index 0000000..811a058 --- /dev/null +++ b/tasks/orqa/unsupervised/qa_utils.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# + +# The following code has been taken from +# https://github.com/facebookresearch/DPR, which is CC-BY-NC 4.0 +# licensed as of now. More details on the license can be found +# at https://github.com/facebookresearch/DPR/blob/master/LICENSE + +""" + Set of utilities for Q&A results validation tasks - Retriver passage + validation and Reader predicted answer validation +""" + +import collections +import logging +import string +import unicodedata +from functools import partial +from multiprocessing import Pool as ProcessPool +from typing import Tuple, List, Dict + +import regex as re +from tasks.orqa.unsupervised.tokenizers import SimpleTokenizer + +logger = logging.getLogger(__name__) + +QAMatchStats = collections.namedtuple('QAMatchStats', ['top_k_hits',\ + 'questions_doc_hits']) + +def calculate_matches(all_docs: Dict[object, Tuple[str, str]], + answers: List[List[str]], closest_docs: List[Tuple[List[object], + List[float]]], workers_num: int, match_type: str) -> QAMatchStats: + """ + Evaluates answers presence in the set of documents. This function is + supposed to be used with a large collection of documents and results. + It internally forks multiple sub-processes for evaluation and then + merges results + :param all_docs: dictionary of the entire documents database. + doc_id -> (doc_text, title) + :param answers: list of answers's list. One list per question + :param closest_docs: document ids of the top results along with their + scores + :param workers_num: amount of parallel threads to process data + :param match_type: type of answer matching. Refer to has_answer code for + available options + :return: matching information tuple. + top_k_hits - a list where the index is the amount of top documents retrieved + and the value is the total amount of valid matches across an entire + dataset. + questions_doc_hits - more detailed info with answer matches for every + question and every retrieved document + """ + global dpr_all_documents + dpr_all_documents = all_docs + + tok_opts = {} + tokenizer = SimpleTokenizer(**tok_opts) + + processes = ProcessPool( + processes=workers_num, + ) + + logger.info('Matching answers in top docs...') + + get_score_partial = partial(check_answer, match_type=match_type, + tokenizer=tokenizer) + + questions_answers_docs = zip(answers, closest_docs) + + scores = processes.map(get_score_partial, questions_answers_docs) + + logger.info('Per question validation results len=%d', len(scores)) + + n_docs = len(closest_docs[0][0]) + top_k_hits = [0] * n_docs + for question_hits in scores: + best_hit = next((i for i, x in enumerate(question_hits) if x), None) + if best_hit is not None: + top_k_hits[best_hit:] = [v + 1 for v in top_k_hits[best_hit:]] + + return QAMatchStats(top_k_hits, scores) + + +def check_answer(questions_answers_docs, tokenizer, match_type) -> List[bool]: + """ + Search through all the top docs to see if they have any of the answers. + """ + answers, (doc_ids, doc_scores) = questions_answers_docs + + global dpr_all_documents + hits = [] + + for i, doc_id in enumerate(doc_ids): + doc = dpr_all_documents[doc_id] + text = doc[0] + + answer_found = False + if text is None: # cannot find the document for some reason + logger.warning("no doc in db") + hits.append(False) + continue + + if has_answer(answers, text, tokenizer, match_type): + answer_found = True + hits.append(answer_found) + return hits + + +def has_answer(answers, text, tokenizer, match_type) -> bool: + """ + Check if a document contains an answer string. + If `match_type` is string, token matching is done between the text + and answer. + If `match_type` is regex, we search the whole text with the regex. + """ + text = _normalize(text) + + if match_type == 'string': + # Answer is a list of possible strings + text = tokenizer.tokenize(text).words(uncased=True) + + for single_answer in answers: + single_answer = _normalize(single_answer) + single_answer = tokenizer.tokenize(single_answer) + single_answer = single_answer.words(uncased=True) + + for i in range(0, len(text) - len(single_answer) + 1): + if single_answer == text[i: i + len(single_answer)]: + return True + + elif match_type == 'regex': + # Answer is a regex + for single_answer in answers: + single_answer = _normalize(single_answer) + if regex_match(text, single_answer): + return True + return False + + +def regex_match(text, pattern): + """Test if a regex pattern is contained within a text.""" + try: + pattern = re.compile( + pattern, + flags=re.IGNORECASE + re.UNICODE + re.MULTILINE, + ) + except BaseException: + return False + return pattern.search(text) is not None + + +# function for the reader model answer validation +def exact_match_score(prediction, ground_truth): + return _normalize_answer(prediction) == _normalize_answer(ground_truth) + + +def _normalize_answer(s): + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def _normalize(text): + return unicodedata.normalize('NFD', text) diff --git a/tasks/orqa/unsupervised/tokenizers.py b/tasks/orqa/unsupervised/tokenizers.py new file mode 100644 index 0000000..fb23887 --- /dev/null +++ b/tasks/orqa/unsupervised/tokenizers.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# + +# The following code has been taken from +# https://github.com/facebookresearch/DPR, which is CC-BY-NC 4.0 +# licensed as of now. More details on the license can be found +# at https://github.com/facebookresearch/DPR/blob/master/LICENSE + +""" +Most of the tokenizers code here is copied from DrQA codebase to avoid adding extra dependency +""" + +import copy +import logging + +import regex +import spacy + +logger = logging.getLogger(__name__) + + +class Tokens(object): + """A class to represent a list of tokenized text.""" + TEXT = 0 + TEXT_WS = 1 + SPAN = 2 + POS = 3 + LEMMA = 4 + NER = 5 + + def __init__(self, data, annotators, opts=None): + self.data = data + self.annotators = annotators + self.opts = opts or {} + + def __len__(self): + """The number of tokens.""" + return len(self.data) + + def slice(self, i=None, j=None): + """Return a view of the list of tokens from [i, j).""" + new_tokens = copy.copy(self) + new_tokens.data = self.data[i: j] + return new_tokens + + def untokenize(self): + """Returns the original text (with whitespace reinserted).""" + return ''.join([t[self.TEXT_WS] for t in self.data]).strip() + + def words(self, uncased=False): + """Returns a list of the text of each token + + Args: + uncased: lower cases text + """ + if uncased: + return [t[self.TEXT].lower() for t in self.data] + else: + return [t[self.TEXT] for t in self.data] + + def offsets(self): + """Returns a list of [start, end) character offsets of each token.""" + return [t[self.SPAN] for t in self.data] + + def pos(self): + """Returns a list of part-of-speech tags of each token. + Returns None if this annotation was not included. + """ + if 'pos' not in self.annotators: + return None + return [t[self.POS] for t in self.data] + + def lemmas(self): + """Returns a list of the lemmatized text of each token. + Returns None if this annotation was not included. + """ + if 'lemma' not in self.annotators: + return None + return [t[self.LEMMA] for t in self.data] + + def entities(self): + """Returns a list of named-entity-recognition tags of each token. + Returns None if this annotation was not included. + """ + if 'ner' not in self.annotators: + return None + return [t[self.NER] for t in self.data] + + def ngrams(self, n=1, uncased=False, filter_fn=None, as_strings=True): + """Returns a list of all ngrams from length 1 to n. + + Args: + n: upper limit of ngram length + uncased: lower cases text + filter_fn: user function that takes in an ngram list and returns + True or False to keep or not keep the ngram + as_string: return the ngram as a string vs list + """ + + def _skip(gram): + if not filter_fn: + return False + return filter_fn(gram) + + words = self.words(uncased) + ngrams = [(s, e + 1) + for s in range(len(words)) + for e in range(s, min(s + n, len(words))) + if not _skip(words[s:e + 1])] + + # Concatenate into strings + if as_strings: + ngrams = ['{}'.format(' '.join(words[s:e])) for (s, e) in ngrams] + + return ngrams + + def entity_groups(self): + """Group consecutive entity tokens with the same NER tag.""" + entities = self.entities() + if not entities: + return None + non_ent = self.opts.get('non_ent', 'O') + groups = [] + idx = 0 + while idx < len(entities): + ner_tag = entities[idx] + # Check for entity tag + if ner_tag != non_ent: + # Chomp the sequence + start = idx + while (idx < len(entities) and entities[idx] == ner_tag): + idx += 1 + groups.append((self.slice(start, idx).untokenize(), ner_tag)) + else: + idx += 1 + return groups + + +class Tokenizer(object): + """Base tokenizer class. + Tokenizers implement tokenize, which should return a Tokens class. + """ + + def tokenize(self, text): + raise NotImplementedError + + def shutdown(self): + pass + + def __del__(self): + self.shutdown() + + +class SimpleTokenizer(Tokenizer): + ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+' + NON_WS = r'[^\p{Z}\p{C}]' + + def __init__(self, **kwargs): + """ + Args: + annotators: None or empty set (only tokenizes). + """ + self._regexp = regex.compile( + '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS), + flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE + ) + if len(kwargs.get('annotators', {})) > 0: + logger.warning('%s only tokenizes! Skipping annotators: %s' % + (type(self).__name__, kwargs.get('annotators'))) + self.annotators = set() + + def tokenize(self, text): + data = [] + matches = [m for m in self._regexp.finditer(text)] + for i in range(len(matches)): + # Get text + token = matches[i].group() + + # Get whitespace + span = matches[i].span() + start_ws = span[0] + if i + 1 < len(matches): + end_ws = matches[i + 1].span()[0] + else: + end_ws = span[1] + + # Format data + data.append(( + token, + text[start_ws: end_ws], + span, + )) + return Tokens(data, self.annotators) + + +class SpacyTokenizer(Tokenizer): + + def __init__(self, **kwargs): + """ + Args: + annotators: set that can include pos, lemma, and ner. + model: spaCy model to use (either path, or keyword like 'en'). + """ + model = kwargs.get('model', 'en') + self.annotators = copy.deepcopy(kwargs.get('annotators', set())) + nlp_kwargs = {'parser': False} + if not any([p in self.annotators for p in ['lemma', 'pos', 'ner']]): + nlp_kwargs['tagger'] = False + if 'ner' not in self.annotators: + nlp_kwargs['entity'] = False + self.nlp = spacy.load(model, **nlp_kwargs) + + def tokenize(self, text): + # We don't treat new lines as tokens. + clean_text = text.replace('\n', ' ') + tokens = self.nlp.tokenizer(clean_text) + if any([p in self.annotators for p in ['lemma', 'pos', 'ner']]): + self.nlp.tagger(tokens) + if 'ner' in self.annotators: + self.nlp.entity(tokens) + + data = [] + for i in range(len(tokens)): + # Get whitespace + start_ws = tokens[i].idx + if i + 1 < len(tokens): + end_ws = tokens[i + 1].idx + else: + end_ws = tokens[i].idx + len(tokens[i].text) + + data.append(( + tokens[i].text, + text[start_ws: end_ws], + (tokens[i].idx, tokens[i].idx + len(tokens[i].text)), + tokens[i].tag_, + tokens[i].lemma_, + tokens[i].ent_type_, + )) + + # Set special option for non-entity tag: '' vs 'O' in spaCy + return Tokens(data, self.annotators, opts={'non_ent': ''}) diff --git a/tasks/race/data.py b/tasks/race/data.py new file mode 100644 index 0000000..0c22108 --- /dev/null +++ b/tasks/race/data.py @@ -0,0 +1,135 @@ + +import glob +import json +import os +import time + +from torch.utils.data import Dataset + +from megatron.training import print_rank_0 +from tasks.data_utils import build_sample +from tasks.data_utils import build_tokens_types_paddings_from_ids +from tasks.data_utils import clean_text + + +NUM_CHOICES = 4 +MAX_QA_LENGTH = 128 + + +class RaceDataset(Dataset): + + def __init__(self, dataset_name, datapaths, tokenizer, max_seq_length, + max_qa_length=MAX_QA_LENGTH): + + self.dataset_name = dataset_name + print_rank_0(' > building RACE dataset for {}:'.format( + self.dataset_name)) + + string = ' > paths:' + for path in datapaths: + string += ' ' + path + print_rank_0(string) + + self.samples = [] + for datapath in datapaths: + self.samples.extend(process_single_datapath(datapath, tokenizer, + max_qa_length, + max_seq_length)) + + print_rank_0(' >> total number of samples: {}'.format( + len(self.samples))) + + # This indicates that each "sample" has multiple samples that + # will collapse into batch dimension + self.sample_multiplier = NUM_CHOICES + + def __len__(self): + return len(self.samples) + + def __getitem__(self, idx): + return self.samples[idx] + + +def process_single_datapath(datapath, tokenizer, max_qa_length, max_seq_length): + """Read in RACE files, combine, clean-up, tokenize, and convert to + samples.""" + + print_rank_0(' > working on {}'.format(datapath)) + start_time = time.time() + + # Get list of files. + filenames = glob.glob(os.path.join(datapath, '*.txt')) + + samples = [] + num_docs = 0 + num_questions = 0 + num_samples = 0 + # Load all the files + for filename in filenames: + with open(filename, 'r') as f: + for line in f: + data = json.loads(line) + num_docs += 1 + + context = data["article"] + questions = data["questions"] + choices = data["options"] + answers = data["answers"] + # Check the length. + assert len(questions) == len(answers) + assert len(questions) == len(choices) + + # Context: clean up and convert to ids. + context = clean_text(context) + context_ids = tokenizer.tokenize(context) + + # Loop over questions. + for qi, question in enumerate(questions): + num_questions += 1 + # Label. + label = ord(answers[qi]) - ord("A") + assert label >= 0 + assert label < NUM_CHOICES + assert len(choices[qi]) == NUM_CHOICES + + # For each question, build num-choices samples. + ids_list = [] + types_list = [] + paddings_list = [] + for ci in range(NUM_CHOICES): + choice = choices[qi][ci] + # Merge with choice. + if "_" in question: + qa = question.replace("_", choice) + else: + qa = " ".join([question, choice]) + # Clean QA. + qa = clean_text(qa) + # Tokenize. + qa_ids = tokenizer.tokenize(qa) + # Trim if needed. + if len(qa_ids) > max_qa_length: + qa_ids = qa_ids[0:max_qa_length] + + # Build the sample. + ids, types, paddings \ + = build_tokens_types_paddings_from_ids( + qa_ids, context_ids, max_seq_length, + tokenizer.cls, tokenizer.sep, tokenizer.pad) + + ids_list.append(ids) + types_list.append(types) + paddings_list.append(paddings) + + # Convert to numpy and add to samples + samples.append(build_sample(ids_list, types_list, + paddings_list, label, + num_samples)) + num_samples += 1 + + elapsed_time = time.time() - start_time + print_rank_0(' > processed {} document, {} questions, and {} samples' + ' in {:.2f} seconds'.format(num_docs, num_questions, + num_samples, elapsed_time)) + + return samples diff --git a/tasks/race/finetune.py b/tasks/race/finetune.py new file mode 100644 index 0000000..09d9e73 --- /dev/null +++ b/tasks/race/finetune.py @@ -0,0 +1,55 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Race.""" + +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer +from megatron.legacy.model.multiple_choice import MultipleChoice +from tasks.eval_utils import accuracy_func_provider +from tasks.finetune_utils import finetune +from tasks.race.data import RaceDataset +from megatron.training.arguments import core_transformer_config_from_args + + +def train_valid_datasets_provider(): + """Provide train and validation datasets.""" + args = get_args() + tokenizer = get_tokenizer() + + train_dataset = RaceDataset('training', args.train_data, + tokenizer, args.seq_length) + valid_dataset = RaceDataset('validation', args.valid_data, + tokenizer, args.seq_length) + + return train_dataset, valid_dataset + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + config = core_transformer_config_from_args(get_args()) + print_rank_0('building multichoice model for RACE ...') + model = MultipleChoice(config=config, + num_tokentypes=2, + pre_process=pre_process, + post_process=post_process) + + return model + + +def metrics_func_provider(): + """Privde metrics callback function.""" + args = get_args() + tokenizer = get_tokenizer() + + def single_dataset_provider(datapath): + name = datapath.split('RACE')[-1].strip('/').replace('/', '-') + return RaceDataset(name, [datapath], tokenizer, args.seq_length) + + return accuracy_func_provider(single_dataset_provider) + + +def main(): + + finetune(train_valid_datasets_provider, model_provider, + end_of_epoch_callback_provider=metrics_func_provider) diff --git a/tasks/vision/classification/classification.py b/tasks/vision/classification/classification.py new file mode 100644 index 0000000..3398df8 --- /dev/null +++ b/tasks/vision/classification/classification.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Vision-classification finetuning/evaluation.""" + +import torch.nn.functional as F +from functools import partial +from megatron.training import get_args, get_timers +from megatron.training import print_rank_0 +from megatron.legacy.model.vision.classification import VitClassificationModel +from megatron.legacy.data.vit_dataset import build_train_valid_datasets +from tasks.vision.classification.eval_utils import accuracy_func_provider +from tasks.vision.finetune_utils import finetune +from megatron.training.utils import average_losses_across_data_parallel_group + + +def classification(): + def train_valid_datasets_provider(): + """Build train and validation dataset.""" + args = get_args() + + train_ds, valid_ds = build_train_valid_datasets( + data_path=args.data_path, + image_size=(args.img_h, args.img_w), + ) + return train_ds, valid_ds + + def model_provider(pre_process=True, post_process=True): + """Build the model.""" + args = get_args() + + print_rank_0("building classification model for ImageNet ...") + + return VitClassificationModel(num_classes=args.num_classes, finetune=True, + pre_process=pre_process, post_process=post_process) + + def process_batch(batch): + """Process batch and produce inputs for the model.""" + images = batch[0].cuda().contiguous() + labels = batch[1].cuda().contiguous() + return images, labels + + def cross_entropy_loss_func(labels, output_tensor): + logits = output_tensor + + # Cross-entropy loss. + loss = F.cross_entropy(logits.contiguous().float(), labels) + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + def _cross_entropy_forward_step(batch, model): + """Simple forward step with cross-entropy loss.""" + timers = get_timers() + + # Get the batch. + timers("batch generator", log_level=2).start() + try: + batch_ = next(batch) + except BaseException: + batch_ = batch + images, labels = process_batch(batch_) + timers("batch generator").stop() + + # Forward model. + output_tensor = model(images) + + return output_tensor, partial(cross_entropy_loss_func, labels) + + """Finetune/evaluate.""" + finetune( + train_valid_datasets_provider, + model_provider, + forward_step=_cross_entropy_forward_step, + end_of_epoch_callback_provider=accuracy_func_provider, + ) + +def main(): + classification() + diff --git a/tasks/vision/classification/eval_utils.py b/tasks/vision/classification/eval_utils.py new file mode 100644 index 0000000..45cc4ea --- /dev/null +++ b/tasks/vision/classification/eval_utils.py @@ -0,0 +1,116 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Evaluation utilities.""" + +import os +from functools import partial + +import torch + +from megatron.training import get_args +from megatron.training import print_rank_0, print_rank_last +from megatron.core import mpu +from megatron.schedules import get_forward_backward_func +from tasks.vision.finetune_utils import build_data_loader +from tasks.vision.finetune_utils import process_batch +from torchvision import datasets, transforms + + +def accuracy_func_provider(): + """Provide function that calculates accuracies.""" + args = get_args() + data_path = args.data_path + crop_size = (args.img_h, args.img_w) + + # Build dataloaders. + val_data_path = data_path[1] + normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + transform_val = transforms.Compose( + [ + transforms.Resize(crop_size), + transforms.CenterCrop(crop_size), + transforms.ToTensor(), + normalize, + ] + ) + dataset = datasets.ImageFolder(root=val_data_path, transform=transform_val) + + dataloader = build_data_loader( + dataset, + args.micro_batch_size, + num_workers=args.num_workers, + drop_last=(mpu.get_data_parallel_world_size() > 1), + shuffle=False + ) + + def metrics_func(model, epoch): + print_rank_0("calculating metrics ...") + correct, total = calculate_correct_answers(model, dataloader, epoch) + percent = float(correct) * 100.0 / float(total) + print_rank_last( + " >> |epoch: {}| overall: correct / total = {} / {} = " + "{:.4f} %".format(epoch, correct, total, percent) + ) + + return metrics_func + + +def calculate_correct_answers(model, dataloader, epoch): + """Calculate correct over total answers""" + + forward_backward_func = get_forward_backward_func() + for m in model: + m.eval() + + def loss_func(labels, output_tensor): + logits = output_tensor + + loss_dict = {} + # Compute the correct answers. + predicted = torch.argmax(logits, dim=-1) + corrects = (predicted == labels).float() + # Add to the counters. + loss_dict['total'] = labels.size(0) + loss_dict['correct'] = corrects.sum().item() + + return 0, loss_dict + + #defined inside to capture output_predictions + def correct_answers_forward_step(batch, model): + try: + batch_ = next(batch) + except BaseException: + batch_ = batch + images, labels = process_batch(batch_) + + # Forward model. + output_tensor = model(images) + + return output_tensor, partial(loss_func, labels) + + with torch.no_grad(): + # For all the batches in the dataset. + total = 0 + correct = 0 + for _, batch in enumerate(dataloader): + + loss_dicts = forward_backward_func(correct_answers_forward_step, batch, model, + optimizer=None, timers=None, forward_only=True) + + for loss_dict in loss_dicts: + total += loss_dict['total'] + correct += loss_dict['correct'] + + for m in model: + m.train() + + # Reduce. + if mpu.is_pipeline_last_stage(): + unreduced = torch.cuda.LongTensor([correct, total]) + torch.distributed.all_reduce(unreduced, + group=mpu.get_data_parallel_group()) + + # Print on screen. + correct_ans = unreduced[0].item() + total_count = unreduced[1].item() + return correct_ans, total_count diff --git a/tasks/vision/finetune_utils.py b/tasks/vision/finetune_utils.py new file mode 100644 index 0000000..ced2e67 --- /dev/null +++ b/tasks/vision/finetune_utils.py @@ -0,0 +1,297 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Finetune utilities.""" + +import torch +import torch.nn.functional as F +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.training import utils +from megatron.core import mpu +from megatron.training.checkpointing import load_checkpoint +from megatron.training.checkpointing import save_checkpoint +from megatron.training import evaluate_and_print_results +from megatron.training import setup_model_and_optimizer +from megatron.training import train_step +from megatron.training import training_log +from megatron.training.utils import check_adlr_autoresume_termination +from megatron.training.utils import average_losses_across_data_parallel_group, print_params_min_max_norm +from megatron.core.enums import ModelType + +def process_batch(batch): + """Process batch and produce inputs for the model.""" + images = batch[0].cuda().contiguous() + labels = batch[1].cuda().contiguous() + return images, labels + + +def build_data_loader(dataset, micro_batch_size, + num_workers, drop_last, shuffle): + """Data loader. Note that batch-size is the local (per GPU) batch-size.""" + + # Sampler. + world_size = mpu.get_data_parallel_world_size() + rank = mpu.get_data_parallel_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, + drop_last=drop_last, shuffle=shuffle + ) + + # Data loader. Note that batch size is the per GPU batch size. + data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=micro_batch_size, + sampler=sampler, + shuffle=False, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=True, + ) + + return data_loader + + +def _build_infinite_size_dataloader(dataloader): + """Build a looped dataloader with infinite size.""" + + iterator = dataloader.__iter__() + while True: + try: + yield iterator.__next__() + except StopIteration: + iterator = dataloader.__iter__() + + +def _build_train_valid_dataloaders(train_dataset, valid_dataset): + """Traing and validation dataloaders.""" + args = get_args() + + print_rank_0('building train and validation dataloaders ...') + # Training dataset. + train_dataloader = build_data_loader(train_dataset, args.micro_batch_size, + args.num_workers, False, True) + # Set the training iterations. + args.train_iters_per_epoch = len(train_dataloader) + args.train_iters = args.epochs * args.train_iters_per_epoch + # Validation dataset. For this dataset, we do not need to set up + # shuffling so we can just use a simple infinite loop. + valid_dataloader_ = build_data_loader(valid_dataset, args.micro_batch_size, + args.num_workers, True, False) + valid_dataloader = _build_infinite_size_dataloader(valid_dataloader_) + + # Now that we've built the data loaders, set batch_size arguments + # to the actual batch size the model will see for this dataset. + # This is necessary so pipeline transfers know what size they are + # and the LR schedule, which is based on samples seen, gets set + # correctly. + args.orig_micro_batch_size = args.micro_batch_size + args.orig_global_batch_size = args.global_batch_size + + return train_dataloader, valid_dataloader + + +def _train( + model, + optimizer, + opt_param_scheduler, + forward_step, + train_dataloader, + valid_dataloader, + end_of_epoch_callback, + process_non_loss_data_func=None +): + """Train the model.""" + args = get_args() + timers = get_timers() + + # Turn on training mode which enables dropout. + for m in model: + m.train() + + # Tracking loss. + losses_dict_sum = {} + + # Starting epoch and iteration + start_epoch = args.iteration // args.train_iters_per_epoch + start_iteration = args.iteration % args.train_iters_per_epoch + iteration = args.iteration + + # Memory reporting flag. + report_memory_flag = True + + # For each remaining epoch + timers("interval-time", log_level=0).start(barrier=True) + for epoch in range(start_epoch, args.epochs): + print_rank_0("working on epoch {} ...".format(epoch + 1)) + + # Set the data loader epoch to shuffle the index iterator. + train_dataloader.sampler.set_epoch(args.seed + epoch) + train_dataloader.dataset.set_epoch(epoch) + + # For all the batches in the dataset. + for iteration_, batch in enumerate(train_dataloader): + + # Ignore the iterations before starting value + if iteration_ < start_iteration: + continue + # Set to zero so the next epoch does not skip any batches. + start_iteration = 0 + + # Train for one step. + losses_dict, skipped_iter, grad_norm, num_zeros_in_grad = train_step( + forward_step, batch, model, optimizer, opt_param_scheduler + ) + iteration += 1 + + # Logging. + params_norm = None + + report_memory_flag = training_log( + losses_dict, + losses_dict_sum, + optimizer.param_groups[0]["lr"], + iteration, + optimizer.get_loss_scale().item(), + report_memory_flag, + skipped_iter, + grad_norm, + params_norm, + num_zeros_in_grad + ) + + # Autoresume + if args.adlr_autoresume and \ + iteration % args.adlr_autoresume_interval == 0: + check_adlr_autoresume_termination(iteration, model, optimizer, + opt_param_scheduler) + + # Checkpointing + if args.save and args.save_interval and \ + iteration % args.save_interval == 0: + save_checkpoint(iteration, model, optimizer, + opt_param_scheduler) + + # Evaluation + if args.eval_interval and iteration % args.eval_interval == 0: + prefix = "iteration {}".format(iteration) + evaluate_and_print_results( + prefix, + forward_step, + valid_dataloader, + model, + iteration, + process_non_loss_data_func, + False, + ) + + # Callback at the end of each epoch. + if end_of_epoch_callback is not None: + end_of_epoch_callback(model, epoch) + + +def finetune( + train_valid_datasets_provider, + model_provider, + forward_step, + model_type=ModelType.encoder_or_decoder, + process_non_loss_data_func=None, + end_of_epoch_callback_provider=None, +): + """Main finetune function used across all tasks.""" + args = get_args() + timers = get_timers() + + # Train and validation data loaders. + timers("train/valid/test dataset/dataloder", log_level=0).start() + if args.epochs > 0: + train_dataset, valid_dataset = train_valid_datasets_provider() + train_dataloader, valid_dataloader = _build_train_valid_dataloaders( + train_dataset, valid_dataset + ) + timers("train/valid/test dataset/dataloder").stop() + + # Build calback function. + timers("callback function", log_level=0).start() + end_of_epoch_callback = None + if end_of_epoch_callback_provider is not None: + end_of_epoch_callback = end_of_epoch_callback_provider() + timers("callback function").stop() + + # Build model, optimizer and learning rate scheduler. + timers("model and optimizer", log_level=0).start() + model, optimizer, opt_param_scheduler = \ + setup_model_and_optimizer( + model_provider, + model_type, + scale_lr_cond=lambda name, param: ".head." in name, + lr_mult=args.head_lr_mult) + timers("model and optimizer").stop() + + # If pretrained checkpoint is provided and we have not trained for + # any iteration (i.e., iteration is zero), then load the pretrained + # checkpoint. + timers("pretrained checkpoint", log_level=0).start(barrier=True) + if args.iteration == 0 and args.pretrained_checkpoint is not None: + if args.pretrained_checkpoint_type == 'default': + original_load = args.load + args.load = args.pretrained_checkpoint + _ = load_checkpoint(model, None, None, strict=False) + args.load = original_load + elif args.pretrained_checkpoint_type == 'external': + unwrap_model = utils.unwrap_model(model) + state_dict = torch.load(args.pretrained_checkpoint, + map_location="cpu") + unwrap_model[0].module.backbone.load_state_dict(state_dict, + strict=False) + elif args.pretrained_checkpoint_type == 'constrastive': + unwrap_model = utils.unwrap_model(model) + state_dict = torch.load(args.pretrained_checkpoint, + map_location="cpu") + state_dict = state_dict["model"] + state_dict = {k.replace("teacher.backbone.", ""): v + for k, v in state_dict.items() + if k.startswith("teacher.backbone.")} + unwrap_model[0].module.backbone.load_state_dict(state_dict, + strict=False) + else: + raise Exception("pretrained checkpoint type {} not supported".format(args.pretrained_checkpoint_type)) + + # This is critical when only model is loaded. We should make sure + # master parameters are also updated. + optimizer.reload_model_params() + + timers("pretrained checkpoint").stop() + + # Print setup timing. + print_rank_0("done with setups ...") + timers.log( + [ + "train/valid/test dataset/dataloder", + "callback function", + "model and optimizer", + "pretrained checkpoint", + ] + ) + print_rank_0("training ...") + + # Finetune the model. + if args.epochs > 0: + _train( + model, + optimizer, + opt_param_scheduler, + forward_step, + train_dataloader, + valid_dataloader, + end_of_epoch_callback, + process_non_loss_data_func, + ) + # Or just evaluate. + else: + if end_of_epoch_callback is not None: + print_rank_0("evaluation only mode, setting epoch to -1") + end_of_epoch_callback(model, epoch=-1) + + print_rank_0("done :-)") + diff --git a/tasks/vision/main.py b/tasks/vision/main.py new file mode 100644 index 0000000..7975f6e --- /dev/null +++ b/tasks/vision/main.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Main tasks functionality.""" + +import os +import sys + +sys.path.append( + os.path.abspath( + os.path.join( + os.path.join(os.path.dirname(__file__), os.path.pardir), + os.path.pardir, + ) + ) +) +from megatron.training import get_args +from megatron.training.initialize import initialize_megatron + +def get_tasks_args(parser): + """Provide extra arguments required for tasks.""" + group = parser.add_argument_group(title="tasks") + + group.add_argument('--task', type=str, default='segment', + choices=['classify', 'segment_setr', 'segment_segformer'], + help='task name.') + group.add_argument("--epochs", type=int, default=None, + help="Number of finetunning epochs. Zero results in " + "evaluation only.") + group.add_argument('--pretrained-checkpoint-type', type=str, default='default', + choices=['default', 'external', 'constrastive'], + help='Type of pretrained checkpoint') + group.add_argument("--pretrained-checkpoint", type=str, default=None, + help="Pretrained checkpoint used for finetunning.") + group.add_argument('--seg-stride', type=int, default=None, + help='sliding window stride during evaluation') + return parser + + +if __name__ == "__main__": + + initialize_megatron(extra_args_provider=get_tasks_args) + args = get_args() + + if args.task == 'classify': + from tasks.vision.classification.classification import main + main() + elif args.task == 'segment_setr': + from tasks.vision.segmentation.finetune_setr import main + main() + elif args.task == 'segment_segformer': + from tasks.vision.segmentation.finetune_segformer import main + main() + diff --git a/tasks/vision/segmentation/cityscapes.py b/tasks/vision/segmentation/cityscapes.py new file mode 100644 index 0000000..af63a6f --- /dev/null +++ b/tasks/vision/segmentation/cityscapes.py @@ -0,0 +1,207 @@ +# BSD 3-Clause License +# +# Copyright (c) Soumith Chintala 2016, +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# code taken from +# https://github.com/pytorch/vision/blob/main/torchvision/datasets/cityscapes.py +# modified it to change max label index from 255 to 19 (num_classes) + +import torch +import json +import os +from collections import namedtuple +from typing import Any, Callable, Dict, List, Optional, Union, Tuple +import numpy as np +from torchvision.datasets.utils import extract_archive, verify_str_arg, iterable_to_str +from torchvision.datasets import VisionDataset +from PIL import Image +from megatron.training import print_rank_0 + + +class Cityscapes(VisionDataset): + """`Cityscapes `_ Dataset. + Args: + root (string): Root directory of dataset where directory ``leftImg8bit`` + and ``gtFine`` or ``gtCoarse`` are located. + split (string, optional): The image split to use, ``train``, ``test`` or ``val`` if mode="fine" + otherwise ``train``, ``train_extra`` or ``val`` + mode (string, optional): The quality mode to use, ``fine`` or ``coarse`` + target_type (string or list, optional): Type of target to use, ``instance``, ``semantic``, ``polygon`` + or ``color``. Can also be a list to output a tuple with all specified target types. + transform (callable, optional): A function/transform that takes in a PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + transforms (callable, optional): A function/transform that takes input sample and its target as entry + and returns a transformed version. + Examples: + Get semantic segmentation target + .. code-block:: python + dataset = Cityscapes('./data/cityscapes', split='train', mode='fine', + target_type='semantic') + img, smnt = dataset[0] + Get multiple targets + .. code-block:: python + dataset = Cityscapes('./data/cityscapes', split='train', mode='fine', + target_type=['instance', 'color', 'polygon']) + img, (inst, col, poly) = dataset[0] + Validate on the "coarse" set + .. code-block:: python + dataset = Cityscapes('./data/cityscapes', split='val', mode='coarse', + target_type='semantic') + img, smnt = dataset[0] + """ + num_classes = 19 + ignore_index = 19 + color_table = torch.tensor( + [[128, 64, 128], + [244, 35, 232], + [70, 70, 70], + [102, 102, 156], + [190, 153, 153], + [153, 153, 153], + [250, 170, 30], + [220, 220, 0], + [107, 142, 35], + [152, 251, 152], + [70, 130, 180], + [220, 20, 60], + [255, 0, 0], + [0, 0, 142], + [0, 0, 70], + [0, 60, 100], + [0, 80, 100], + [0, 0, 230], + [119, 11, 32], + [0, 0, 0]], dtype=torch.float, device='cuda') + + + # Based on https://github.com/mcordts/cityscapesScripts + CityscapesClass = namedtuple('CityscapesClass', ['name', 'id', 'train_id', + 'category', 'category_id', 'has_instances', 'ignore_in_eval', 'color']) + + classes = [ + CityscapesClass('unlabeled', 0, 19, 'void', 0, False, True, (0, 0, 0)), + CityscapesClass('ego vehicle', 1, 19, 'void', 0, False, True, (0, 0, 0)), + CityscapesClass('rectification border', 2, 19, 'void', 0, False, True, (0, 0, 0)), + CityscapesClass('out of roi', 3, 19, 'void', 0, False, True, (0, 0, 0)), + CityscapesClass('static', 4, 19, 'void', 0, False, True, (0, 0, 0)), + CityscapesClass('dynamic', 5, 19, 'void', 0, False, True, (111, 74, 0)), + CityscapesClass('ground', 6, 19, 'void', 0, False, True, (81, 0, 81)), + CityscapesClass('road', 7, 0, 'flat', 1, False, False, (128, 64, 128)), + CityscapesClass('sidewalk', 8, 1, 'flat', 1, False, False, (244, 35, 232)), + CityscapesClass('parking', 9, 19, 'flat', 1, False, True, (250, 170, 160)), + CityscapesClass('rail track', 10, 19, 'flat', 1, False, True, (230, 150, 140)), + CityscapesClass('building', 11, 2, 'construction', 2, False, False, (70, 70, 70)), + CityscapesClass('wall', 12, 3, 'construction', 2, False, False, (102, 102, 156)), + CityscapesClass('fence', 13, 4, 'construction', 2, False, False, (190, 153, 153)), + CityscapesClass('guard rail', 14, 19, 'construction', 2, False, True, (180, 165, 180)), + CityscapesClass('bridge', 15, 19, 'construction', 2, False, True, (150, 100, 100)), + CityscapesClass('tunnel', 16, 19, 'construction', 2, False, True, (150, 120, 90)), + CityscapesClass('pole', 17, 5, 'object', 3, False, False, (153, 153, 153)), + CityscapesClass('polegroup', 18, 19, 'object', 3, False, True, (153, 153, 153)), + CityscapesClass('traffic light', 19, 6, 'object', 3, False, False, (250, 170, 30)), + CityscapesClass('traffic sign', 20, 7, 'object', 3, False, False, (220, 220, 0)), + CityscapesClass('vegetation', 21, 8, 'nature', 4, False, False, (107, 142, 35)), + CityscapesClass('terrain', 22, 9, 'nature', 4, False, False, (152, 251, 152)), + CityscapesClass('sky', 23, 10, 'sky', 5, False, False, (70, 130, 180)), + CityscapesClass('person', 24, 11, 'human', 6, True, False, (220, 20, 60)), + CityscapesClass('rider', 25, 12, 'human', 6, True, False, (255, 0, 0)), + CityscapesClass('car', 26, 13, 'vehicle', 7, True, False, (0, 0, 142)), + CityscapesClass('truck', 27, 14, 'vehicle', 7, True, False, (0, 0, 70)), + CityscapesClass('bus', 28, 15, 'vehicle', 7, True, False, (0, 60, 100)), + CityscapesClass('caravan', 29, 19, 'vehicle', 7, True, True, (0, 0, 90)), + CityscapesClass('trailer', 30, 19, 'vehicle', 7, True, True, (0, 0, 110)), + CityscapesClass('train', 31, 16, 'vehicle', 7, True, False, (0, 80, 100)), + CityscapesClass('motorcycle', 32, 17, 'vehicle', 7, True, False, (0, 0, 230)), + CityscapesClass('bicycle', 33, 18, 'vehicle', 7, True, False, (119, 11, 32)), + CityscapesClass('license plate', -1, -1, 'vehicle', 7, False, True, (0, 0, 142)), + ] + + # label2trainid + label2trainid = { label.id : label.train_id for label in classes} + + def __init__( + self, + root: str, + split: str = "train", + mode: str = "fine", + resolution: int = 1024, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + transforms: Optional[Callable] = None, + ) -> None: + super(Cityscapes, self).__init__(root, transforms, transform, target_transform) + self.mode = 'gtFine' if mode == 'fine' else 'gtCoarse' + self.images_dir = os.path.join(self.root, 'leftImg8bit_trainvaltest/leftImg8bit', split) + self.targets_dir = os.path.join(self.root, 'gtFine_trainvaltest/gtFine', split) + self.split = split + self.resolution = resolution + self.images = [] + self.targets = [] + + for city in sorted(os.listdir(self.images_dir)): + img_dir = os.path.join(self.images_dir, city) + target_dir = os.path.join(self.targets_dir, city) + for file_name in os.listdir(img_dir): + target_name = '{}_{}_labelIds.png'.format(file_name.split('_leftImg8bit')[0], self.mode) + self.images.append(os.path.join(img_dir, file_name)) + self.targets.append(os.path.join(target_dir, target_name)) + + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + """ + Args: + index (int): Index + Returns: + tuple: (image, target) where target is a tuple of all target types if target_type is a list with more + than one item. Otherwise target is a json object if target_type="polygon", else the image segmentation. + """ + image = Image.open(self.images[index]).convert('RGB') + + target = Image.open(self.targets[index]) + target = np.array(target) + + target_copy = target.copy() + for k, v in Cityscapes.label2trainid.items(): + binary_target = (target == k) + target_copy[binary_target] = v + target = target_copy + + target = Image.fromarray(target.astype(np.uint8)) + + if self.transforms is not None: + image, target = self.transforms(image, target) + + return image, target + + def __len__(self) -> int: + # len(self.images) + return len(self.images) + diff --git a/tasks/vision/segmentation/data.py b/tasks/vision/segmentation/data.py new file mode 100644 index 0000000..a0ea612 --- /dev/null +++ b/tasks/vision/segmentation/data.py @@ -0,0 +1,154 @@ +import random +import os +import math +import mmcv +import torch +import numpy as np +import torchvision.transforms as T +from torchvision import datasets +from torch.utils.data import Dataset +from megatron.legacy.data.autoaugment import ImageNetPolicy +from tasks.vision.segmentation.cityscapes import Cityscapes +import tasks.vision.segmentation.transforms as ET +from megatron.legacy.data.autoaugment import ImageNetPolicy +from megatron.training import get_args +from PIL import Image, ImageOps + + +class VitSegmentationJointTransform(): + def __init__(self, train=True, resolution=None): + self.train = train + if self.train: + self.transform0 = ET.RandomSizeAndCrop(resolution) + self.transform1 = ET.RandomHorizontallyFlip() + + def __call__(self, img, mask): + if self.train: + img, mask = self.transform0(img, mask) + img, mask = self.transform1(img, mask) + return img, mask + + +class VitSegmentationImageTransform(): + def __init__(self, train=True, resolution=None): + args = get_args() + self.train = train + assert args.fp16 or args.bf16 + self.data_type = torch.half if args.fp16 else torch.bfloat16 + self.mean_std = args.mean_std + if self.train: + assert resolution is not None + self.transform = T.Compose([ + ET.PhotoMetricDistortion(), + T.ToTensor(), + T.Normalize(*self.mean_std), + T.ConvertImageDtype(self.data_type) + ]) + else: + self.transform = T.Compose([ + T.ToTensor(), + T.Normalize(*self.mean_std), + T.ConvertImageDtype(self.data_type) + ]) + + def __call__(self, input): + output = self.transform(input) + return output + + +class VitSegmentationTargetTransform(): + def __init__(self, train=True, resolution=None): + self.train = train + + def __call__(self, input): + output = torch.from_numpy(np.array(input, dtype=np.int32)).long() + return output + + +class RandomSeedSegmentationDataset(Dataset): + def __init__(self, + dataset, + joint_transform, + image_transform, + target_transform): + + args = get_args() + self.base_seed = args.seed + self.curr_seed = self.base_seed + self.dataset = dataset + self.joint_transform = joint_transform + self.image_transform = image_transform + self.target_transform = target_transform + + def __len__(self): + return len(self.dataset) + + def set_epoch(self, epoch): + self.curr_seed = self.base_seed + 100 * epoch + + def __getitem__(self, idx): + seed = idx + self.curr_seed + img, mask = self.dataset[idx] + + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + img, mask = self.joint_transform(img, mask) + img = self.image_transform(img) + mask = self.target_transform(mask) + + return img, mask + + +def build_cityscapes_train_valid_datasets(data_path, image_size): + args = get_args() + args.num_classes = Cityscapes.num_classes + args.ignore_index = Cityscapes.ignore_index + args.color_table = Cityscapes.color_table + args.mean_std = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + + train_joint_transform = \ + VitSegmentationJointTransform(train=True, resolution=image_size) + val_joint_transform = \ + VitSegmentationJointTransform(train=False, resolution=image_size) + train_image_transform = \ + VitSegmentationImageTransform(train=True, resolution=image_size) + val_image_transform = \ + VitSegmentationImageTransform(train=False, resolution=image_size) + train_target_transform = \ + VitSegmentationTargetTransform(train=True, resolution=image_size) + val_target_transform = \ + VitSegmentationTargetTransform(train=False, resolution=image_size) + + # training dataset + train_data = Cityscapes( + root=data_path[0], + split='train', + mode='fine', + resolution=image_size + ) + train_data = RandomSeedSegmentationDataset( + train_data, + joint_transform=train_joint_transform, + image_transform=train_image_transform, + target_transform=train_target_transform) + + # validation dataset + val_data = Cityscapes( + root=data_path[0], + split='val', + mode='fine', + resolution=image_size + ) + + val_data = RandomSeedSegmentationDataset( + val_data, + joint_transform=val_joint_transform, + image_transform=val_image_transform, + target_transform=val_target_transform) + + return train_data, val_data + + +def build_train_valid_datasets(data_path, image_size): + return build_cityscapes_train_valid_datasets(data_path, image_size) diff --git a/tasks/vision/segmentation/finetune_segformer.py b/tasks/vision/segmentation/finetune_segformer.py new file mode 100644 index 0000000..300f107 --- /dev/null +++ b/tasks/vision/segmentation/finetune_segformer.py @@ -0,0 +1,239 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Vision-classification finetuning/evaluation.""" + +import numpy as np +import torch +import torch.nn.functional as F +from functools import partial +from megatron.training import get_args, get_timers +from megatron.training import print_rank_0, print_rank_last +from megatron.core import mpu +from tasks.vision.finetune_utils import finetune +from tasks.vision.finetune_utils import build_data_loader +from megatron.training.utils import average_losses_across_data_parallel_group +from megatron.schedules import get_forward_backward_func +from tasks.vision.segmentation.data import build_train_valid_datasets +from tasks.vision.segmentation.seg_models import SegformerSegmentationModel +from megatron.legacy.model.vision.utils import resize + + +def calculate_iou(hist_data): + acc = np.diag(hist_data).sum() / hist_data.sum() + acc_cls = np.diag(hist_data) / hist_data.sum(axis=1) + acc_cls = np.nanmean(acc_cls) + divisor = hist_data.sum(axis=1) + hist_data.sum(axis=0) - \ + np.diag(hist_data) + iu = np.diag(hist_data) / divisor + return iu, acc, acc_cls + + +def fast_hist(pred, gtruth, num_classes): + # mask indicates pixels we care about + mask = (gtruth >= 0) & (gtruth < num_classes) + + # stretch ground truth labels by num_classes + # class 0 -> 0 + # class 1 -> 19 + # class 18 -> 342 + # + # TP at 0 + 0, 1 + 1, 2 + 2 ... + # + # TP exist where value == num_classes*class_id + class_id + # FP = row[class].sum() - TP + # FN = col[class].sum() - TP + hist = np.bincount(num_classes * gtruth[mask].astype(int) + pred[mask], + minlength=num_classes ** 2) + hist = hist.reshape(num_classes, num_classes) + return hist + + +def segmentation(): + + def train_valid_datasets_provider(): + """Build train and validation dataset.""" + args = get_args() + + train_ds, valid_ds = build_train_valid_datasets( + data_path=args.data_path, + image_size=(args.img_h, args.img_w) + + ) + return train_ds, valid_ds + + def model_provider(pre_process=True, post_process=True): + """Build the model.""" + args = get_args() + + model = SegformerSegmentationModel(num_classes=args.num_classes, + pre_process=pre_process, + post_process=post_process) + print_rank_0("model = {}".format(model)) + return model + + def process_batch(batch): + """Process batch and produce inputs for the model.""" + images = batch[0].cuda().contiguous() + masks = batch[1].cuda().contiguous() + return images, masks + + def calculate_weight(masks, num_classes): + bins = torch.histc(masks, bins=num_classes, min=0.0, max=num_classes) + hist_norm = bins.float()/bins.sum() + hist = ((bins != 0).float() * (1. - hist_norm)) + 1.0 + return hist + + def cross_entropy_loss_func(images, masks, output_tensor, + non_loss_data=False): + args = get_args() + ignore_index = args.ignore_index + color_table = args.color_table + logits = output_tensor.contiguous().float() + logits = resize(logits, size=masks.shape[1:], + mode='bilinear', align_corners=False) + + # Cross-entropy loss. + # weight = calculate_weight(masks, num_classes) + loss = F.cross_entropy(logits, masks, ignore_index=ignore_index) + + if not non_loss_data: + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + return loss, {'lm loss': averaged_loss[0]} + else: + seg_mask = logits.argmax(dim=1) + output_mask = F.embedding(seg_mask, color_table).permute(0, 3, 1, 2) + gt_mask = F.embedding(masks, color_table).permute(0, 3, 1, 2) + return torch.cat((images, output_mask, gt_mask), dim=2), loss + + def _cross_entropy_forward_step(batch, model): + """Simple forward step with cross-entropy loss.""" + timers = get_timers() + + # Get the batch. + timers("batch generator", log_level=2).start() + import types + if isinstance(batch, types.GeneratorType): + batch_ = next(batch) + else: + batch_ = batch + images, masks = process_batch(batch_) + timers("batch generator").stop() + + # Forward model. + output_tensor = model(images) + + return output_tensor, partial(cross_entropy_loss_func, images, masks) + + def calculate_correct_answers(model, dataloader, epoch): + """Calculate correct over total answers""" + + forward_backward_func = get_forward_backward_func() + for m in model: + m.eval() + + def loss_func(labels, output_tensor): + args = get_args() + logits = output_tensor + logits = resize(logits, size=labels.shape[1:], + mode='bilinear', align_corners=False) + + loss_dict = {} + # Compute the correct answers. + probs = logits.contiguous().float().softmax(dim=1) + max_probs, preds = torch.max(probs, 1) + + preds = preds.cpu().numpy() + performs = fast_hist(preds.flatten(), + labels.cpu().numpy().flatten(), + args.ignore_index) + loss_dict['performs'] = performs + return 0, loss_dict + + # defined inside to capture output_predictions + def correct_answers_forward_step(batch, model): + try: + batch_ = next(batch) + except BaseException: + batch_ = batch + images, labels = process_batch(batch_) + + # Forward model. + output_tensor = model(images) + + return output_tensor, partial(loss_func, labels) + + with torch.no_grad(): + # For all the batches in the dataset. + performs = None + for _, batch in enumerate(dataloader): + loss_dicts = forward_backward_func(correct_answers_forward_step, + batch, model, + optimizer=None, + timers=None, + forward_only=True) + for loss_dict in loss_dicts: + if performs is None: + performs = loss_dict['performs'] + else: + performs += loss_dict['performs'] + + for m in model: + m.train() + # Reduce. + if mpu.is_pipeline_last_stage(): + performs_tensor = torch.cuda.FloatTensor(performs) + torch.distributed.all_reduce(performs_tensor, + group=mpu.get_data_parallel_group()) + hist = performs_tensor.cpu().numpy() + iu, acc, acc_cls = calculate_iou(hist) + miou = np.nanmean(iu) + + return iu, miou + + def accuracy_func_provider(): + """Provide function that calculates accuracies.""" + args = get_args() + + train_ds, valid_ds = build_train_valid_datasets( + data_path=args.data_path, + image_size=(args.img_h, args.img_w) + ) + dataloader = build_data_loader( + valid_ds, + args.micro_batch_size, + num_workers=args.num_workers, + drop_last=(mpu.get_data_parallel_world_size() > 1), + shuffle=False + ) + + def metrics_func(model, epoch): + print_rank_0("calculating metrics ...") + iou, miou = calculate_correct_answers(model, dataloader, epoch) + print_rank_last( + " >> |epoch: {}| overall: iou = {}," + "miou = {:.4f} %".format(epoch, iou, miou*100.0) + ) + return metrics_func + + def dump_output_data(data, iteration, writer): + for (output_tb, loss) in data: + # output_tb[output_tb < 0] = 0 + # output_tb[output_tb > 1] = 1 + writer.add_images("image-outputseg-realseg", output_tb, + global_step=None, walltime=None, + dataformats='NCHW') + + """Finetune/evaluate.""" + finetune( + train_valid_datasets_provider, + model_provider, + forward_step=_cross_entropy_forward_step, + process_non_loss_data_func=dump_output_data, + end_of_epoch_callback_provider=accuracy_func_provider, + ) + + +def main(): + segmentation() + diff --git a/tasks/vision/segmentation/finetune_setr.py b/tasks/vision/segmentation/finetune_setr.py new file mode 100644 index 0000000..10ff886 --- /dev/null +++ b/tasks/vision/segmentation/finetune_setr.py @@ -0,0 +1,213 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Vision-classification finetuning/evaluation.""" + +import torch +import torch.nn.functional as F +from functools import partial +from megatron.training import get_args, get_timers +from megatron.training import print_rank_0, print_rank_last +from megatron.core import mpu +from tasks.vision.finetune_utils import finetune +from tasks.vision.finetune_utils import build_data_loader +from megatron.training.utils import average_losses_across_data_parallel_group +from megatron.schedules import get_forward_backward_func +from tasks.vision.segmentation.metrics import CFMatrix +from tasks.vision.segmentation.data import build_train_valid_datasets +from tasks.vision.segmentation.seg_models import SetrSegmentationModel +from tasks.vision.segmentation.utils import slidingcrops, slidingjoins + +def segmentation(): + def train_valid_datasets_provider(): + """Build train and validation dataset.""" + args = get_args() + + train_ds, valid_ds = build_train_valid_datasets( + data_path=args.data_path, + image_size=(args.img_h, args.img_w) + + ) + return train_ds, valid_ds + + def model_provider(pre_process=True, post_process=True): + """Build the model.""" + args = get_args() + + return SetrSegmentationModel(num_classes=args.num_classes, + pre_process=pre_process, + post_process=post_process) + + def process_batch(batch): + """Process batch and produce inputs for the model.""" + images = batch[0].cuda().contiguous() + masks = batch[1].cuda().contiguous() + return images, masks + + def calculate_weight(masks, num_classes): + bins = torch.histc(masks, bins=num_classes, min=0.0, max=num_classes) + hist_norm = bins.float()/bins.sum() + hist = ((bins != 0).float() * (1. - hist_norm)) + 1.0 + return hist + + def cross_entropy_loss_func(images, masks, output_tensor, non_loss_data=False): + args = get_args() + ignore_index = args.ignore_index + color_table = args.color_table + weight = calculate_weight(masks, args.num_classes) + logits = output_tensor.contiguous().float() + loss = F.cross_entropy(logits, masks, weight=weight, ignore_index=ignore_index) + + if not non_loss_data: + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + else: + seg_mask = logits.argmax(dim=1) + output_mask = F.embedding(seg_mask, color_table).permute(0, 3, 1, 2) + gt_mask = F.embedding(masks, color_table).permute(0, 3, 1, 2) + return torch.cat((images, output_mask, gt_mask), dim=2), loss + + def _cross_entropy_forward_step(batch, model): + """Simple forward step with cross-entropy loss.""" + args = get_args() + timers = get_timers() + + # Get the batch. + timers("batch generator", log_level=2).start() + import types + if isinstance(batch, types.GeneratorType): + batch_ = next(batch) + else: + batch_ = batch + images, masks = process_batch(batch_) + timers("batch generator").stop() + + # Forward model. + if not model.training: + images, masks, _, _ = slidingcrops(images, masks) + #print_rank_0("images size = {}".format(images.size())) + + if not model.training: + output_tensor = torch.cat([model(image) for image in torch.split(images, args.micro_batch_size)]) + else: + output_tensor = model(images) + + return output_tensor, partial(cross_entropy_loss_func, images, masks) + + def calculate_correct_answers(model, dataloader, epoch): + """Calculate correct over total answers""" + + forward_backward_func = get_forward_backward_func() + for m in model: + m.eval() + + def loss_func(labels, slices_info, img_size, output_tensor): + args = get_args() + logits = output_tensor + + loss_dict = {} + # Compute the correct answers. + probs = logits.contiguous().float().softmax(dim=1) + max_probs, preds = torch.max(probs, 1) + preds = preds.int() + preds, labels = slidingjoins(preds, max_probs, labels, slices_info, img_size) + _, performs = CFMatrix()(preds, labels, args.ignore_index) + + loss_dict['performs'] = performs + return 0, loss_dict + + # defined inside to capture output_predictions + def correct_answers_forward_step(batch, model): + args = get_args() + try: + batch_ = next(batch) + except BaseException: + batch_ = batch + images, labels = process_batch(batch_) + + assert not model.training + images, labels, slices_info, img_size = slidingcrops(images, labels) + # Forward model. + output_tensor = torch.cat([model(image) for image in torch.split(images, args.micro_batch_size)]) + + return output_tensor, partial(loss_func, labels, slices_info, img_size) + + with torch.no_grad(): + # For all the batches in the dataset. + performs = None + for _, batch in enumerate(dataloader): + loss_dicts = forward_backward_func(correct_answers_forward_step, + batch, model, + optimizer=None, + timers=None, + forward_only=True) + for loss_dict in loss_dicts: + if performs is None: + performs = loss_dict['performs'] + else: + performs += loss_dict['performs'] + + for m in model: + m.train() + # Reduce. + if mpu.is_pipeline_last_stage(): + torch.distributed.all_reduce(performs, + group=mpu.get_data_parallel_group()) + # Print on screen. + # performs[int(ch), :] = [nb_tp, nb_fp, nb_tn, nb_fn] + true_positive = performs[:, 0] + false_positive = performs[:, 1] + false_negative = performs[:, 3] + + iou = true_positive / (true_positive + false_positive + false_negative) + miou = iou[~torch.isnan(iou)].mean() + + return iou.tolist(), miou.item() + + def accuracy_func_provider(): + """Provide function that calculates accuracies.""" + args = get_args() + + train_ds, valid_ds = build_train_valid_datasets( + data_path=args.data_path, + image_size=(args.img_h, args.img_w) + ) + dataloader = build_data_loader( + valid_ds, + args.micro_batch_size, + num_workers=args.num_workers, + drop_last=(mpu.get_data_parallel_world_size() > 1), + shuffle=False + ) + + def metrics_func(model, epoch): + print_rank_0("calculating metrics ...") + iou, miou = calculate_correct_answers(model, dataloader, epoch) + print_rank_last( + " >> |epoch: {}| overall: iou = {}," + "miou = {:.4f} %".format(epoch, iou, miou*100.0) + ) + return metrics_func + + def dump_output_data(data, iteration, writer): + for (output_tb, loss) in data: + # output_tb[output_tb < 0] = 0 + # output_tb[output_tb > 1] = 1 + writer.add_images("image-outputseg-realseg", output_tb, + global_step=None, walltime=None, + dataformats='NCHW') + + """Finetune/evaluate.""" + finetune( + train_valid_datasets_provider, + model_provider, + forward_step=_cross_entropy_forward_step, + process_non_loss_data_func=dump_output_data, + end_of_epoch_callback_provider=accuracy_func_provider, + ) + + +def main(): + segmentation() + diff --git a/tasks/vision/segmentation/metrics.py b/tasks/vision/segmentation/metrics.py new file mode 100644 index 0000000..750c10a --- /dev/null +++ b/tasks/vision/segmentation/metrics.py @@ -0,0 +1,594 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +#copyright (c) go-hiroaki & Chokurei +#email: guangmingwu2010@gmail.com +# guozhilingty@gmail.com +# +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +eps = 1e-6 + +def _binarize(y_data, threshold): + """ + args: + y_data : [float] 4-d tensor in [batch_size, channels, img_rows, img_cols] + threshold : [float] [0.0, 1.0] + return 4-d binarized y_data + """ + y_data[y_data < threshold] = 0.0 + y_data[y_data >= threshold] = 1.0 + return y_data + +def _argmax(y_data, dim): + """ + args: + y_data : 4-d tensor in [batch_size, chs, img_rows, img_cols] + dim : int + return 3-d [int] y_data + """ + return torch.argmax(y_data, dim).int() + + +def _get_tp(y_pred, y_true): + """ + args: + y_true : [int] 3-d in [batch_size, img_rows, img_cols] + y_pred : [int] 3-d in [batch_size, img_rows, img_cols] + return [float] true_positive + """ + return torch.sum(y_true * y_pred).float() + + +def _get_fp(y_pred, y_true): + """ + args: + y_true : 3-d ndarray in [batch_size, img_rows, img_cols] + y_pred : 3-d ndarray in [batch_size, img_rows, img_cols] + return [float] false_positive + """ + return torch.sum((1 - y_true) * y_pred).float() + + +def _get_tn(y_pred, y_true): + """ + args: + y_true : 3-d ndarray in [batch_size, img_rows, img_cols] + y_pred : 3-d ndarray in [batch_size, img_rows, img_cols] + return [float] true_negative + """ + return torch.sum((1 - y_true) * (1 - y_pred)).float() + + +def _get_fn(y_pred, y_true): + """ + args: + y_true : 3-d ndarray in [batch_size, img_rows, img_cols] + y_pred : 3-d ndarray in [batch_size, img_rows, img_cols] + return [float] false_negative + """ + return torch.sum(y_true * (1 - y_pred)).float() + + +def _get_weights(y_true, nb_ch): + """ + args: + y_true : 3-d ndarray in [batch_size, img_rows, img_cols] + nb_ch : int + return [float] weights + """ + batch_size, img_rows, img_cols = y_true.shape + pixels = batch_size * img_rows * img_cols + weights = [torch.sum(y_true==ch).item() / pixels for ch in range(nb_ch)] + return weights + + +class CFMatrix(object): + def __init__(self, des=None): + self.des = des + + def __repr__(self): + return "ConfusionMatrix" + + def __call__(self, y_pred, y_true, ignore_index, threshold=0.5): + + """ + args: + y_true : 3-d ndarray in [batch_size, img_rows, img_cols] + y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + threshold : [0.0, 1.0] + return confusion matrix + """ + batch_size, img_rows, img_cols = y_pred.shape + chs = ignore_index + device = y_true.device + if chs == 1: + y_pred = _binarize(y_pred, threshold) + y_true = _binarize(y_true, threshold) + nb_tp = _get_tp(y_pred, y_true) + nb_fp = _get_fp(y_pred, y_true) + nb_tn = _get_tn(y_pred, y_true) + nb_fn = _get_fn(y_pred, y_true) + mperforms = [nb_tp, nb_fp, nb_tn, nb_fn] + performs = None + else: + performs = torch.zeros(chs, 4).to(device) + weights = _get_weights(y_true, chs) + for ch in range(chs): + y_true_ch = torch.zeros(batch_size, img_rows, img_cols) + y_false_ch = torch.zeros(batch_size, img_rows, img_cols) + y_pred_ch = torch.zeros(batch_size, img_rows, img_cols) + y_true_ch[y_true == ch] = 1 + y_false_ch[torch.logical_and((y_true != ch), (y_true != ignore_index))] = 1 + y_pred_ch[y_pred == ch] = 1 + nb_tp = _get_tp(y_pred_ch, y_true_ch) + nb_fp = torch.sum(y_false_ch * y_pred_ch).float() + nb_tn = torch.sum(y_false_ch * (1 - y_pred_ch)).float() + nb_fn = _get_fn(y_pred_ch, y_true_ch) + performs[int(ch), :] = torch.FloatTensor([nb_tp, nb_fp, nb_tn, nb_fn]) + mperforms = sum([i*j for (i, j) in zip(performs, weights)]) + return mperforms, performs + + +class OAAcc(object): + def __init__(self, des="Overall Accuracy"): + self.des = des + + def __repr__(self): + return "OAcc" + + def __call__(self, y_pred, y_true, threshold=0.5): + """ + args: + y_true : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + threshold : [0.0, 1.0] + return (tp+tn)/total + """ + batch_size, chs, img_rows, img_cols = y_true.shape + device = y_true.device + if chs == 1: + y_pred = _binarize(y_pred, threshold) + y_true = _binarize(y_true, threshold) + else: + y_pred = _argmax(y_pred, 1) + y_true = _argmax(y_true, 1) + + nb_tp_tn = torch.sum(y_true == y_pred).float() + mperforms = nb_tp_tn / (batch_size * img_rows * img_cols) + performs = None + return mperforms, performs + + +class Precision(object): + def __init__(self, des="Precision"): + self.des = des + + def __repr__(self): + return "Prec" + + def __call__(self, y_pred, y_true, threshold=0.5): + """ + args: + y_true : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + threshold : [0.0, 1.0] + return tp/(tp+fp) + """ + batch_size, chs, img_rows, img_cols = y_true.shape + device = y_true.device + if chs == 1: + y_pred = _binarize(y_pred, threshold) + y_true = _binarize(y_true, threshold) + nb_tp = _get_tp(y_pred, y_true) + nb_fp = _get_fp(y_pred, y_true) + mperforms = nb_tp / (nb_tp + nb_fp + esp) + performs = None + else: + y_pred = _argmax(y_pred, 1) + y_true = _argmax(y_true, 1) + performs = torch.zeros(chs, 1).to(device) + weights = _get_weights(y_true, chs) + for ch in range(chs): + y_true_ch = torch.zeros(batch_size, img_rows, img_cols) + y_pred_ch = torch.zeros(batch_size, img_rows, img_cols) + y_true_ch[y_true == ch] = 1 + y_pred_ch[y_pred == ch] = 1 + nb_tp = _get_tp(y_pred_ch, y_true_ch) + nb_fp = _get_fp(y_pred_ch, y_true_ch) + performs[int(ch)] = nb_tp / (nb_tp + nb_fp + esp) + mperforms = sum([i*j for (i, j) in zip(performs, weights)]) + return mperforms, performs + + +class Recall(object): + def __init__(self, des="Recall"): + self.des = des + + def __repr__(self): + return "Reca" + + def __call__(self, y_pred, y_true, threshold=0.5): + """ + args: + y_true : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + threshold : [0.0, 1.0] + return tp/(tp+fn) + """ + batch_size, chs, img_rows, img_cols = y_true.shape + device = y_true.device + if chs == 1: + y_pred = _binarize(y_pred, threshold) + y_true = _binarize(y_true, threshold) + nb_tp = _get_tp(y_pred, y_true) + nb_fn = _get_fn(y_pred, y_true) + mperforms = nb_tp / (nb_tp + nb_fn + esp) + performs = None + else: + y_pred = _argmax(y_pred, 1) + y_true = _argmax(y_true, 1) + performs = torch.zeros(chs, 1).to(device) + weights = _get_weights(y_true, chs) + for ch in range(chs): + y_true_ch = torch.zeros(batch_size, img_rows, img_cols) + y_pred_ch = torch.zeros(batch_size, img_rows, img_cols) + y_true_ch[y_true == ch] = 1 + y_pred_ch[y_pred == ch] = 1 + nb_tp = _get_tp(y_pred_ch, y_true_ch) + nb_fn = _get_fn(y_pred_ch, y_true_ch) + performs[int(ch)] = nb_tp / (nb_tp + nb_fn + esp) + mperforms = sum([i*j for (i, j) in zip(performs, weights)]) + return mperforms, performs + + +class F1Score(object): + def __init__(self, des="F1Score"): + self.des = des + + def __repr__(self): + return "F1Sc" + + def __call__(self, y_pred, y_true, threshold=0.5): + + """ + args: + y_true : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + threshold : [0.0, 1.0] + return 2*precision*recall/(precision+recall) + """ + batch_size, chs, img_rows, img_cols = y_true.shape + device = y_true.device + if chs == 1: + y_pred = _binarize(y_pred, threshold) + y_true = _binarize(y_true, threshold) + nb_tp = _get_tp(y_pred, y_true) + nb_fp = _get_fp(y_pred, y_true) + nb_fn = _get_fn(y_pred, y_true) + _precision = nb_tp / (nb_tp + nb_fp + esp) + _recall = nb_tp / (nb_tp + nb_fn + esp) + mperforms = 2 * _precision * _recall / (_precision + _recall + esp) + performs = None + else: + y_pred = _argmax(y_pred, 1) + y_true = _argmax(y_true, 1) + performs = torch.zeros(chs, 1).to(device) + weights = _get_weights(y_true, chs) + for ch in range(chs): + y_true_ch = torch.zeros(batch_size, img_rows, img_cols) + y_pred_ch = torch.zeros(batch_size, img_rows, img_cols) + y_true_ch[y_true == ch] = 1 + y_pred_ch[y_pred == ch] = 1 + nb_tp = _get_tp(y_pred_ch, y_true_ch) + nb_fp = _get_fp(y_pred_ch, y_true_ch) + nb_fn = _get_fn(y_pred_ch, y_true_ch) + _precision = nb_tp / (nb_tp + nb_fp + esp) + _recall = nb_tp / (nb_tp + nb_fn + esp) + performs[int(ch)] = 2 * _precision * \ + _recall / (_precision + _recall + esp) + mperforms = sum([i*j for (i, j) in zip(performs, weights)]) + return mperforms, performs + + +class Kappa(object): + def __init__(self, des="Kappa"): + self.des = des + + def __repr__(self): + return "Kapp" + + def __call__(self, y_pred, y_true, threshold=0.5): + + """ + args: + y_true : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + threshold : [0.0, 1.0] + return (Po-Pe)/(1-Pe) + """ + batch_size, chs, img_rows, img_cols = y_true.shape + device = y_true.device + if chs == 1: + y_pred = _binarize(y_pred, threshold) + y_true = _binarize(y_true, threshold) + nb_tp = _get_tp(y_pred, y_true) + nb_fp = _get_fp(y_pred, y_true) + nb_tn = _get_tn(y_pred, y_true) + nb_fn = _get_fn(y_pred, y_true) + nb_total = nb_tp + nb_fp + nb_tn + nb_fn + Po = (nb_tp + nb_tn) / nb_total + Pe = ((nb_tp + nb_fp) * (nb_tp + nb_fn) + + (nb_fn + nb_tn) * (nb_fp + nb_tn)) / (nb_total**2) + mperforms = (Po - Pe) / (1 - Pe + esp) + performs = None + else: + y_pred = _argmax(y_pred, 1) + y_true = _argmax(y_true, 1) + performs = torch.zeros(chs, 1).to(device) + weights = _get_weights(y_true, chs) + for ch in range(chs): + y_true_ch = torch.zeros(batch_size, img_rows, img_cols) + y_pred_ch = torch.zeros(batch_size, img_rows, img_cols) + y_true_ch[y_true == ch] = 1 + y_pred_ch[y_pred == ch] = 1 + nb_tp = _get_tp(y_pred_ch, y_true_ch) + nb_fp = _get_fp(y_pred_ch, y_true_ch) + nb_tn = _get_tn(y_pred_ch, y_true_ch) + nb_fn = _get_fn(y_pred_ch, y_true_ch) + nb_total = nb_tp + nb_fp + nb_tn + nb_fn + Po = (nb_tp + nb_tn) / nb_total + Pe = ((nb_tp + nb_fp) * (nb_tp + nb_fn) + + (nb_fn + nb_tn) * (nb_fp + nb_tn)) / (nb_total**2) + performs[int(ch)] = (Po - Pe) / (1 - Pe + esp) + mperforms = sum([i*j for (i, j) in zip(performs, weights)]) + return mperforms, performs + + +class Jaccard(object): + def __init__(self, des="Jaccard"): + self.des = des + + def __repr__(self): + return "Jacc" + + def __call__(self, y_pred, y_true, threshold=0.5): + """ + args: + y_true : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols] + threshold : [0.0, 1.0] + return intersection / (sum-intersection) + """ + batch_size, chs, img_rows, img_cols = y_true.shape + device = y_true.device + if chs == 1: + y_pred = _binarize(y_pred, threshold) + y_true = _binarize(y_true, threshold) + _intersec = torch.sum(y_true * y_pred).float() + _sum = torch.sum(y_true + y_pred).float() + mperforms = _intersec / (_sum - _intersec + esp) + performs = None + else: + y_pred = _argmax(y_pred, 1) + y_true = _argmax(y_true, 1) + performs = torch.zeros(chs, 1).to(device) + weights = _get_weights(y_true, chs) + for ch in range(chs): + y_true_ch = torch.zeros(batch_size, img_rows, img_cols) + y_pred_ch = torch.zeros(batch_size, img_rows, img_cols) + y_true_ch[y_true == ch] = 1 + y_pred_ch[y_pred == ch] = 1 + _intersec = torch.sum(y_true_ch * y_pred_ch).float() + _sum = torch.sum(y_true_ch + y_pred_ch).float() + performs[int(ch)] = _intersec / (_sum - _intersec + esp) + mperforms = sum([i*j for (i, j) in zip(performs, weights)]) + return mperforms, performs + + +class MSE(object): + def __init__(self, des="Mean Square Error"): + self.des = des + + def __repr__(self): + return "MSE" + + def __call__(self, y_pred, y_true, dim=1, threshold=None): + """ + args: + y_true : 4-d ndarray in [batch_size, channels, img_rows, img_cols] + y_pred : 4-d ndarray in [batch_size, channels, img_rows, img_cols] + threshold : [0.0, 1.0] + return mean_squared_error, smaller the better + """ + if threshold: + y_pred = _binarize(y_pred, threshold) + return torch.mean((y_pred - y_true) ** 2) + + +class PSNR(object): + def __init__(self, des="Peak Signal to Noise Ratio"): + self.des = des + + def __repr__(self): + return "PSNR" + + def __call__(self, y_pred, y_true, dim=1, threshold=None): + """ + args: + y_true : 4-d ndarray in [batch_size, channels, img_rows, img_cols] + y_pred : 4-d ndarray in [batch_size, channels, img_rows, img_cols] + threshold : [0.0, 1.0] + return PSNR, larger the better + """ + if threshold: + y_pred = _binarize(y_pred, threshold) + mse = torch.mean((y_pred - y_true) ** 2) + return 10 * torch.log10(1 / mse) + + +class SSIM(object): + ''' + modified from https://github.com/jorge-pessoa/pytorch-msssim + ''' + def __init__(self, des="structural similarity index"): + self.des = des + + def __repr__(self): + return "SSIM" + + def gaussian(self, w_size, sigma): + gauss = torch.Tensor([math.exp(-(x - w_size//2)**2/float(2*sigma**2)) for x in range(w_size)]) + return gauss/gauss.sum() + + def create_window(self, w_size, channel=1): + _1D_window = self.gaussian(w_size, 1.5).unsqueeze(1) + _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) + window = _2D_window.expand(channel, 1, w_size, w_size).contiguous() + return window + + def __call__(self, y_pred, y_true, w_size=11, size_average=True, full=False): + """ + args: + y_true : 4-d ndarray in [batch_size, channels, img_rows, img_cols] + y_pred : 4-d ndarray in [batch_size, channels, img_rows, img_cols] + w_size : int, default 11 + size_average : boolean, default True + full : boolean, default False + return ssim, larger the better + """ + # Value range can be different from 255. Other common ranges are 1 (sigmoid) and 2 (tanh). + if torch.max(y_pred) > 128: + max_val = 255 + else: + max_val = 1 + + if torch.min(y_pred) < -0.5: + min_val = -1 + else: + min_val = 0 + L = max_val - min_val + + padd = 0 + (_, channel, height, width) = y_pred.size() + window = self.create_window(w_size, channel=channel).to(y_pred.device) + + mu1 = F.conv2d(y_pred, window, padding=padd, groups=channel) + mu2 = F.conv2d(y_true, window, padding=padd, groups=channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d(y_pred * y_pred, window, padding=padd, groups=channel) - mu1_sq + sigma2_sq = F.conv2d(y_true * y_true, window, padding=padd, groups=channel) - mu2_sq + sigma12 = F.conv2d(y_pred * y_true, window, padding=padd, groups=channel) - mu1_mu2 + + C1 = (0.01 * L) ** 2 + C2 = (0.03 * L) ** 2 + + v1 = 2.0 * sigma12 + C2 + v2 = sigma1_sq + sigma2_sq + C2 + cs = torch.mean(v1 / v2) # contrast sensitivity + + ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2) + + if size_average: + ret = ssim_map.mean() + else: + ret = ssim_map.mean(1).mean(1).mean(1) + + if full: + return ret, cs + return ret + + +class AE(object): + """ + Modified from matlab : colorangle.m, MATLAB V2019b + angle = acos(RGB1' * RGB2 / (norm(RGB1) * norm(RGB2))); + angle = 180 / pi * angle; + """ + def __init__(self, des='average Angular Error'): + self.des = des + + def __repr__(self): + return "AE" + + def __call__(self, y_pred, y_true): + """ + args: + y_true : 4-d ndarray in [batch_size, channels, img_rows, img_cols] + y_pred : 4-d ndarray in [batch_size, channels, img_rows, img_cols] + return average AE, smaller the better + """ + dotP = torch.sum(y_pred * y_true, dim=1) + Norm_pred = torch.sqrt(torch.sum(y_pred * y_pred, dim=1)) + Norm_true = torch.sqrt(torch.sum(y_true * y_true, dim=1)) + ae = 180 / math.pi * torch.acos(dotP / (Norm_pred * Norm_true + eps)) + return ae.mean(1).mean(1) + + +if __name__ == "__main__": + for ch in [3, 1]: + batch_size, img_row, img_col = 1, 224, 224 + y_true = torch.rand(batch_size, ch, img_row, img_col) + noise = torch.zeros(y_true.size()).data.normal_(0, std=0.1) + y_pred = y_true + noise + for cuda in [False, True]: + if cuda: + y_pred = y_pred.cuda() + y_true = y_true.cuda() + + print('#'*20, 'Cuda : {} ; size : {}'.format(cuda, y_true.size())) + ########### similarity metrics + metric = MSE() + acc = metric(y_pred, y_true).item() + print("{} ==> {}".format(repr(metric), acc)) + + metric = PSNR() + acc = metric(y_pred, y_true).item() + print("{} ==> {}".format(repr(metric), acc)) + + metric = SSIM() + acc = metric(y_pred, y_true).item() + print("{} ==> {}".format(repr(metric), acc)) + + metric = LPIPS(cuda) + acc = metric(y_pred, y_true).item() + print("{} ==> {}".format(repr(metric), acc)) + + metric = AE() + acc = metric(y_pred, y_true).item() + print("{} ==> {}".format(repr(metric), acc)) + + ########### accuracy metrics + metric = OAAcc() + maccu, accu = metric(y_pred, y_true) + print('mAccu:', maccu, 'Accu', accu) + + metric = Precision() + mprec, prec = metric(y_pred, y_true) + print('mPrec:', mprec, 'Prec', prec) + + metric = Recall() + mreca, reca = metric(y_pred, y_true) + print('mReca:', mreca, 'Reca', reca) + + metric = F1Score() + mf1sc, f1sc = metric(y_pred, y_true) + print('mF1sc:', mf1sc, 'F1sc', f1sc) + + metric = Kappa() + mkapp, kapp = metric(y_pred, y_true) + print('mKapp:', mkapp, 'Kapp', kapp) + + metric = Jaccard() + mjacc, jacc = metric(y_pred, y_true) + print('mJacc:', mjacc, 'Jacc', jacc) + diff --git a/tasks/vision/segmentation/seg_heads.py b/tasks/vision/segmentation/seg_heads.py new file mode 100644 index 0000000..6d06cbc --- /dev/null +++ b/tasks/vision/segmentation/seg_heads.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +import math +import einops +import torch +import apex +import torch.nn.functional as F +from megatron.training import get_args +from megatron.legacy.model import LayerNorm +from megatron.legacy.model.module import MegatronModule +from megatron.legacy.model.vision.utils import resize + + +class SetrSegmentationHead(MegatronModule): + def __init__(self, hidden_size, num_classes): + super(SetrSegmentationHead, self).__init__() + args = get_args() + self.hidden_size = hidden_size + self.num_classes = num_classes + self.img_h = args.img_h + self.img_w = args.img_w + self.patch_dim = args.patch_dim + + self.layernorm = LayerNorm(hidden_size, eps=args.layernorm_epsilon) + self.conv_0 = torch.nn.Conv2d(hidden_size, hidden_size, + 1, 1, bias=False) + self.norm_0 = apex.parallel.SyncBatchNorm(hidden_size) + self.conv_1 = torch.nn.Conv2d(hidden_size, num_classes, 1, 1) + + def to_2D(self, x): + n, hw, c = x.shape + h = self.img_h // self.patch_dim + w = self.img_w // self.patch_dim + assert(hw == h * w) + x = x.transpose(1, 2).reshape(n, c, h, w) + return x + + def forward(self, hidden_states): + # [b c h w] + hidden_states = self.layernorm(hidden_states) + hidden_states = self.to_2D(hidden_states) + + hidden_states = self.conv_0(hidden_states) + hidden_states = self.norm_0(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.conv_1(hidden_states) + + # [b c h w] + result = F.interpolate(hidden_states, + size=(self.img_h, self.img_w), + mode='bilinear') + + return result + + +class MLP(torch.nn.Module): + """ + Linear Embedding + """ + def __init__(self, input_dim=2048, embed_dim=768): + super().__init__() + self.proj = torch.nn.Linear(input_dim, embed_dim) + + def forward(self, x): + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + +class SegformerSegmentationHead(MegatronModule): + def __init__(self, feature_strides, in_channels, + embedding_dim, dropout_ratio): + super(SegformerSegmentationHead, self).__init__() + assert len(feature_strides) == len(in_channels) + assert min(feature_strides) == feature_strides[0] + args = get_args() + self.feature_strides = feature_strides + self.in_channels = in_channels + self.embedding_dim = embedding_dim + self.num_classes = args.num_classes + self.dropout_ratio = dropout_ratio + + c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = \ + self.in_channels + + self.linear_c4 = MLP(input_dim=c4_in_channels, + embed_dim=self.embedding_dim) + self.linear_c3 = MLP(input_dim=c3_in_channels, + embed_dim=self.embedding_dim) + self.linear_c2 = MLP(input_dim=c2_in_channels, + embed_dim=self.embedding_dim) + self.linear_c1 = MLP(input_dim=c1_in_channels, + embed_dim=self.embedding_dim) + + self.conv_fuse = torch.nn.Conv2d(self.embedding_dim*4, + self.embedding_dim, 1, 1) + self.norm = apex.parallel.SyncBatchNorm(self.embedding_dim) + + self.dropout = torch.nn.Dropout2d(self.dropout_ratio) + self.linear_pred = torch.nn.Conv2d(self.embedding_dim, + self.num_classes, + kernel_size=1) + + def forward(self, inputs): + c1, c2, c3, c4 = inputs + + ############## MLP decoder on C1-C4 ########### + n, _, h, w = c4.shape + + _c4 = self.linear_c4(c4).permute(0, 2, 1).reshape(n, -1, c4.shape[2], c4.shape[3]) + _c4 = resize(_c4, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c3 = self.linear_c3(c3).permute(0, 2, 1).reshape(n, -1, c3.shape[2], c3.shape[3]) + _c3 = resize(_c3, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c2 = self.linear_c2(c2).permute(0, 2, 1).reshape(n, -1, c2.shape[2], c2.shape[3]) + _c2 = resize(_c2, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c1 = self.linear_c1(c1).permute(0, 2, 1).reshape(n, -1, c1.shape[2], c1.shape[3]) + + _c = self.conv_fuse(torch.cat([_c4, _c3, _c2, _c1], dim=1)) + x = self.norm(_c) + x = F.relu(x, inplace=True) + x = self.dropout(x) + x = self.linear_pred(x) + + return x + diff --git a/tasks/vision/segmentation/seg_models.py b/tasks/vision/segmentation/seg_models.py new file mode 100644 index 0000000..9b152d0 --- /dev/null +++ b/tasks/vision/segmentation/seg_models.py @@ -0,0 +1,79 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +import math +import einops +import torch +import apex +import torch.nn.functional as F +from megatron.training import get_args +from megatron.legacy.model.module import MegatronModule +from megatron.legacy.model.vision.vit_backbone import VitBackbone, VitMlpHead +from megatron.legacy.model.vision.mit_backbone import mit_b3, mit_b5 +from tasks.vision.segmentation.seg_heads import SetrSegmentationHead, SegformerSegmentationHead + + +class SetrSegmentationModel(MegatronModule): + + def __init__(self, + num_classes, + pre_process=True, + post_process=True): + super(SetrSegmentationModel, self).__init__() + args = get_args() + assert post_process & pre_process + self.hidden_size = args.hidden_size + self.num_classes = num_classes + self.backbone = VitBackbone( + pre_process=pre_process, + post_process=post_process, + class_token=False, + post_layer_norm=False, + drop_path_rate=0.1 + ) + + self.head = SetrSegmentationHead( + self.hidden_size, + self.num_classes + ) + + def set_input_tensor(self, input_tensor): + """See megatron.legacy.model.transformer.set_input_tensor()""" + pass + + def forward(self, input): + # [b hw c] + hidden_states = self.backbone(input) + result_final = self.head(hidden_states) + return result_final + + +class SegformerSegmentationModel(MegatronModule): + + def __init__(self, + num_classes, + pre_process=True, + post_process=True): + super(SegformerSegmentationModel, self).__init__() + args = get_args() + self.hidden_size = args.hidden_size + self.num_classes = num_classes + self.pre_process = pre_process + self.post_process = post_process + + self.backbone = mit_b5() + self.head = SegformerSegmentationHead( + feature_strides=[4, 8, 16, 32], + in_channels=[64, 128, 320, 512], + embedding_dim=768, + dropout_ratio=0.1 + ) + + def set_input_tensor(self, input_tensor): + """See megatron.legacy.model.transformer.set_input_tensor()""" + pass + + def forward(self, input): + # [b hw c] + hidden_states = self.backbone(input) + hidden_states = self.head(hidden_states) + return hidden_states + diff --git a/tasks/vision/segmentation/transforms.py b/tasks/vision/segmentation/transforms.py new file mode 100644 index 0000000..51e11ab --- /dev/null +++ b/tasks/vision/segmentation/transforms.py @@ -0,0 +1,433 @@ +# Copyright (c) 2020 The MMSegmenation Authors. +# +# This source code is licensed under the Apache license found in the +# LICENSE file in the root directory of this source tree. + +import random +import os +import math +import mmcv +import torch +import numpy as np +import torchvision.transforms as T +from torchvision import datasets +from torch.utils.data import Dataset +from megatron.training import print_rank_0 +from megatron.training import get_args +from PIL import Image, ImageOps, ImageEnhance +import torchvision.transforms as torch_tr + +def _is_pil_image(img): + return isinstance(img, Image.Image) + +class PhotoMetricDistortion(object): + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + Args: + brightness_delta (int): delta of brightness. + contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. + hue_delta (int): delta of hue. + """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def convert(self, img, alpha=1, beta=0): + """Multiple with alpha and add beat with clip.""" + img = img.astype(np.float32) * alpha + beta + img = np.clip(img, 0, 255) + return img.astype(np.uint8) + + def brightness(self, img): + """Brightness distortion.""" + if random.randint(0, 1): + return self.convert( + img, + beta=random.uniform(-self.brightness_delta, + self.brightness_delta)) + return img + + def contrast(self, img): + """Contrast distortion.""" + if random.randint(0, 1): + return self.convert( + img, + alpha=random.uniform(self.contrast_lower, self.contrast_upper)) + return img + + def saturation(self, img): + """Saturation distortion.""" + if random.randint(0, 1): + img = mmcv.bgr2hsv(img) + img[:, :, 1] = self.convert( + img[:, :, 1], + alpha=random.uniform(self.saturation_lower, + self.saturation_upper)) + img = mmcv.hsv2bgr(img) + return img + + def hue(self, img): + """Hue distortion.""" + if random.randint(0, 1): + img = mmcv.bgr2hsv(img) + img[:, :, + 0] = (img[:, :, 0].astype(int) + + random.randint(-self.hue_delta, self.hue_delta)) % 180 + img = mmcv.hsv2bgr(img) + return img + + def __call__(self, img): + """Call function to perform photometric distortion on images. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Result dict with images distorted. + """ + img = np.array(img) + + # random brightness + img = self.brightness(img) + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(0, 1) + if mode == 1: + img = self.contrast(img) + + # random saturation + img = self.saturation(img) + + # random hue + img = self.hue(img) + + # random contrast + if mode == 0: + img = self.contrast(img) + + img = Image.fromarray(img.astype(np.uint8)).convert('RGB') + return img + + +class RandomCrop(object): + """ + Take a random crop from the image. + + First the image or crop size may need to be adjusted if the incoming image + is too small... + + If the image is smaller than the crop, then: + the image is padded up to the size of the crop + unless 'nopad', in which case the crop size is shrunk to fit the image + + A random crop is taken such that the crop fits within the image. + + + if cfg.DATASET.TRANSLATION_AUG_FIX is set, we insure that there's always + translation randomness of at least that value around the image. + + if image < crop_size: + # slide crop within image, random offset + else: + # slide image within crop + """ + def __init__(self, crop_size): + args = get_args() + self.size = crop_size + self.cat_max_ratio = 0.75 + self.ignore_index = args.ignore_index + self.pad_color = (0, 0, 0) + + def get_crop_bbox(self, img): + """Randomly get a crop bounding box.""" + img_w, img_h = img.size + target_h, target_w = self.size #[H W] + margin_h = max(img_h - target_h, 0) + margin_w = max(img_w - target_w, 0) + offset_h = random.randint(0, margin_h) + offset_w = random.randint(0, margin_w) + crop_y1, crop_y2 = offset_h, offset_h + target_h + crop_x1, crop_x2 = offset_w, offset_w + target_w + + return crop_y1, crop_y2, crop_x1, crop_x2 + + def crop(self, img, crop_bbox): + """Crop from ``img``""" + crop_y1, crop_y2, crop_x1, crop_x2 = crop_bbox + img = img.crop((crop_x1, crop_y1, crop_x2, crop_y2)) + return img + + @staticmethod + def crop_in_image(target_w, target_h, w, h, img, mask): + if w == target_w: + x1 = 0 + else: + x1 = random.randint(0, w - target_w) + if h == target_h: + y1 = 0 + else: + y1 = random.randint(0, h - target_h) + + return [img.crop((x1, y1, x1 + target_w, y1 + target_h)), + mask.crop((x1, y1, x1 + target_w, y1 + target_h))] + + + def __call__(self, img, mask): + w, h = img.size + target_h, target_w = self.size # ASSUME H, W + + if w == target_w and h == target_h: + return img, mask + + # Pad image if image < crop + if target_h > h: + pad_h = (target_h - h) // 2 + 1 + else: + pad_h = 0 + if target_w > w: + pad_w = (target_w - w) // 2 + 1 + else: + pad_w = 0 + border = (pad_w, pad_h, pad_w, pad_h) + if pad_h or pad_w: + img = ImageOps.expand(img, border=border, fill=(0, 0, 0)) + mask = ImageOps.expand(mask, border=border, fill=self.ignore_index) + w, h = img.size + + crop_bbox = self.get_crop_bbox(img) + if self.cat_max_ratio < 1.: + # Repeat 10 times + for _ in range(10): + seg_temp = self.crop(mask, crop_bbox) + labels, cnt = np.unique(seg_temp, return_counts=True) + cnt = cnt[labels != self.ignore_index] + if len(cnt) > 1 and np.max(cnt) / np.sum( + cnt) < self.cat_max_ratio: + break + crop_bbox = self.get_crop_bbox(img) + + # crop the image + img = self.crop(img, crop_bbox) + + # crop semantic seg + mask = self.crop(mask, crop_bbox) + assert(img.size[0] == self.size[1] and img.size[1] == self.size[0]) + + return img, mask + + +class RandomSizeAndCrop(object): + def __init__(self, + crop_size, + scale_min=0.5, + scale_max=2.0): + self.crop = RandomCrop(crop_size) + self.scale_min = scale_min + self.scale_max = scale_max + + def __call__(self, img, mask): + + scale_amt = random.uniform(self.scale_min, self.scale_max) + w, h = [int(i * scale_amt) for i in img.size] + + resized_img = img.resize((w, h), Image.BICUBIC) + resized_mask = mask.resize((w, h), Image.NEAREST) + img, mask = self.crop(resized_img, resized_mask) + return img, mask + +class RandomHorizontallyFlip(object): + def __call__(self, img, mask): + if random.random() < 0.5: + return img.transpose(Image.FLIP_LEFT_RIGHT), mask.transpose( + Image.FLIP_LEFT_RIGHT) + return img, mask + + +def adjust_brightness(img, brightness_factor): + """Adjust brightness of an Image. + + Args: + img (PIL Image): PIL Image to be adjusted. + brightness_factor (float): How much to adjust the brightness. Can be + any non negative number. 0 gives a black image, 1 gives the + original image while 2 increases the brightness by a factor of 2. + + Returns: + PIL Image: Brightness adjusted image. + """ + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + enhancer = ImageEnhance.Brightness(img) + img = enhancer.enhance(brightness_factor) + return img + + +def adjust_contrast(img, contrast_factor): + """Adjust contrast of an Image. + + Args: + img (PIL Image): PIL Image to be adjusted. + contrast_factor (float): How much to adjust the contrast. Can be any + non negative number. 0 gives a solid gray image, 1 gives the + original image while 2 increases the contrast by a factor of 2. + + Returns: + PIL Image: Contrast adjusted image. + """ + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + enhancer = ImageEnhance.Contrast(img) + img = enhancer.enhance(contrast_factor) + return img + + +def adjust_saturation(img, saturation_factor): + """Adjust color saturation of an image. + + Args: + img (PIL Image): PIL Image to be adjusted. + saturation_factor (float): How much to adjust the saturation. 0 will + give a black and white image, 1 will give the original image while + 2 will enhance the saturation by a factor of 2. + + Returns: + PIL Image: Saturation adjusted image. + """ + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + enhancer = ImageEnhance.Color(img) + img = enhancer.enhance(saturation_factor) + return img + + +def adjust_hue(img, hue_factor): + """Adjust hue of an image. + + The image hue is adjusted by converting the image to HSV and + cyclically shifting the intensities in the hue channel (H). + The image is then converted back to original image mode. + + `hue_factor` is the amount of shift in H channel and must be in the + interval `[-0.5, 0.5]`. + + See https://en.wikipedia.org/wiki/Hue for more details on Hue. + + Args: + img (PIL Image): PIL Image to be adjusted. + hue_factor (float): How much to shift the hue channel. Should be in + [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in + HSV space in positive and negative direction respectively. + 0 means no shift. Therefore, both -0.5 and 0.5 will give an image + with complementary colors while 0 gives the original image. + + Returns: + PIL Image: Hue adjusted image. + """ + if not(-0.5 <= hue_factor <= 0.5): + raise ValueError('hue_factor is not in [-0.5, 0.5].'.format(hue_factor)) + + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + input_mode = img.mode + if input_mode in {'L', '1', 'I', 'F'}: + return img + + h, s, v = img.convert('HSV').split() + + np_h = np.array(h, dtype=np.uint8) + # uint8 addition take cares of rotation across boundaries + with np.errstate(over='ignore'): + np_h += np.uint8(hue_factor * 255) + h = Image.fromarray(np_h, 'L') + + img = Image.merge('HSV', (h, s, v)).convert(input_mode) + return img + + +class ColorJitter(object): + """Randomly change the brightness, contrast and saturation of an image. + + Args: + brightness (float): How much to jitter brightness. brightness_factor + is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]. + contrast (float): How much to jitter contrast. contrast_factor + is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]. + saturation (float): How much to jitter saturation. saturation_factor + is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]. + hue(float): How much to jitter hue. hue_factor is chosen uniformly from + [-hue, hue]. Should be >=0 and <= 0.5. + """ + def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): + self.brightness = brightness + self.contrast = contrast + self.saturation = saturation + self.hue = hue + + @staticmethod + def get_params(brightness, contrast, saturation, hue): + """Get a randomized transform to be applied on image. + + Arguments are same as that of __init__. + + Returns: + Transform which randomly adjusts brightness, contrast and + saturation in a random order. + """ + transforms = [] + if brightness > 0: + brightness_factor = np.random.uniform(max(0, 1 - brightness), 1 + brightness) + transforms.append( + torch_tr.Lambda(lambda img: adjust_brightness(img, brightness_factor))) + + if contrast > 0: + contrast_factor = np.random.uniform(max(0, 1 - contrast), 1 + contrast) + transforms.append( + torch_tr.Lambda(lambda img: adjust_contrast(img, contrast_factor))) + + if saturation > 0: + saturation_factor = np.random.uniform(max(0, 1 - saturation), 1 + saturation) + transforms.append( + torch_tr.Lambda(lambda img: adjust_saturation(img, saturation_factor))) + + if hue > 0: + hue_factor = np.random.uniform(-hue, hue) + transforms.append( + torch_tr.Lambda(lambda img: adjust_hue(img, hue_factor))) + + np.random.shuffle(transforms) + transform = torch_tr.Compose(transforms) + + return transform + + def __call__(self, img): + """ + Args: + img (PIL Image): Input image. + + Returns: + PIL Image: Color jittered image. + """ + transform = self.get_params(self.brightness, self.contrast, + self.saturation, self.hue) + return transform(img) + diff --git a/tasks/vision/segmentation/utils.py b/tasks/vision/segmentation/utils.py new file mode 100644 index 0000000..f9cfb82 --- /dev/null +++ b/tasks/vision/segmentation/utils.py @@ -0,0 +1,85 @@ +import math +import torch +import numpy as np +from megatron.training import get_args + +def slidingcrops(img, mask): + # img: [b c h w] + # mask: [b h w] + args = get_args() + assert args.img_h == args.img_w + crop_size = args.img_h + stride = args.seg_stride + ignore_index = args.ignore_index + n, c, h, w = img.shape + assert h >= crop_size + assert w >= crop_size + long_size = max(h, w) + + img_slices, mask_slices, slices_info = [], [], [] + if long_size > crop_size: + assert stride <= crop_size + h_step_num = int(math.ceil((h - crop_size) / float(stride))) + 1 + w_step_num = int(math.ceil((w - crop_size) / float(stride))) + 1 + for yy in range(h_step_num): + for xx in range(w_step_num): + sy, sx = yy * stride, xx * stride + ey, ex = sy + crop_size, sx + crop_size + img_sub = img[:, :, sy: ey, sx: ex] + mask_sub = mask[:, sy: ey, sx: ex] + + # padding + sub_h, sub_w = img_sub.shape[2:] + pad_h = max(crop_size - sub_h, 0) + pad_w = max(crop_size - sub_w, 0) + img_sub = torch.nn.functional.pad(img_sub, pad=(0, pad_w, 0, pad_h), value=ignore_index) + mask_sub = torch.nn.functional.pad(mask_sub, pad=(0, pad_w, 0, pad_h)) + + img_slices.append(img_sub) + mask_slices.append(mask_sub) + slices_info.append([sy, ey, sx, ex, sub_h, sub_w]) + + return torch.cat(img_slices), torch.cat(mask_slices), slices_info, (h, w) + else: + return img, mask, [[0, h, 0, w, h, w]], (h, w) + + +def slidingjoins(preds, probs, labels, slices_info, img_size): + args = get_args() + num_slices = len(slices_info) + + if num_slices == 1: + return preds, labels + + h, w = img_size + split_size = args.micro_batch_size + + preds_split = torch.split(preds, split_size) + probs_split = torch.split(probs, split_size) + labels_split = torch.split(labels, split_size) + + assert(len(preds_split) == num_slices) + + total_max_probs = torch.zeros((split_size, h, w), dtype=torch.float, device='cuda') + total_preds = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda') + total_labels = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda') + + for i in range(num_slices): + sy, ey, sx, ex, sub_h, sub_w = slices_info[i] + assert sy + sub_h <= h + assert sx + sub_w <= w + curr_max_probs = total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w] + curr_preds = total_preds[:, sy:sy + sub_h, sx:sx + sub_w] + + local_max_probs = probs_split[i][:, :sub_h, : sub_w] + local_preds = preds_split[i][:, :sub_h, :sub_w] + + result_max_probs = torch.maximum(curr_max_probs, local_max_probs) + result_preds = torch.where(curr_max_probs >= local_max_probs, curr_preds, local_preds) + + total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w] = result_max_probs + total_preds[:, sy:sy + sub_h, sx:sx + sub_w] = result_preds + total_labels[:, sy:sy + sub_h, sx:sx + sub_w] = labels_split[i][0, :sub_h, :sub_w] + + return total_preds, total_labels + diff --git a/tasks/zeroshot_gpt/datasets.py b/tasks/zeroshot_gpt/datasets.py new file mode 100644 index 0000000..98618e8 --- /dev/null +++ b/tasks/zeroshot_gpt/datasets.py @@ -0,0 +1,148 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Zero-shot datasets.""" + +import json +import math + +import numpy as np +import torch + +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer +from .detokenizer import get_detokenizer + + +def build_dataset(task): + """Helper function to select and build dataset.""" + + if task == 'LAMBADA': + return _build_lambada_dataset() + if task == 'WIKITEXT103': + return _build_wikitext103_dataset() + + raise NotImplementedError('dataset for {} task is not ' + 'implemented.'.format(task)) + + +class _LMDataset(torch.utils.data.Dataset): + + def __init__(self, tokens, seq_len, pad_idx, num_original_tokens, + num_tokenized_tokens, overalapping_eval=None): + self.tokens = tokens + self.seq_len = seq_len + self.pad_idx = pad_idx + self.overalapping_eval = overalapping_eval + if self.overalapping_eval is None: + self.overalapping_eval = self.seq_len + self.overalapping_eval = max(1, self.overalapping_eval) + self.num_original_tokens = num_original_tokens + self.num_tokenized_tokens = num_tokenized_tokens + self.total_targets = len(self.tokens) - 1 + # remove first sequence tokens + targets = max(self.total_targets - self.overalapping_eval, 0) + self.total_sequences = max( + math.ceil(targets / self.overalapping_eval) + 1, 1) + + def __len__(self): + return self.total_sequences + + def __getitem__(self, idx): + start_idx = idx * self.overalapping_eval + end_idx = start_idx + self.seq_len + tokens = self.tokens[start_idx:end_idx + 1] + num_tokens = len(tokens) + pad_mask = [1] * num_tokens + if num_tokens < self.seq_len + 1: + num_pad = (self.seq_len + 1 - num_tokens) + pad_mask += [0] * (num_pad) + tokens += [self.pad_idx] * num_pad + pad_mask = np.array(pad_mask[1:]) + if self.overalapping_eval != self.seq_len and idx != 0: + pad_mask[:-self.overalapping_eval] *= 0 + + return {'text': np.array(tokens), 'pad_mask': pad_mask} + + +class _LambadaDataset(torch.utils.data.Dataset): + + def __init__(self, path, pad_idx, tokenizer, seq_len, strict=False): + print_rank_0('> building lambada dataset from {} ...'.format(path)) + self.seq_len = seq_len + self.pad_idx = pad_idx + self.tokenizer = tokenizer + self.strict = strict + + self.tokens = [] + self.labels = [] + with open(path, 'r') as f: + for line in f.readlines(): + text = json.loads(line)['text'] + tokens, labels = self.get_tokens(text) + self.tokens.append(tokens) + self.labels.append(labels) + + def get_tokens(self, text): + if not self.strict: + tokens = self.tokenizer.tokenize(text) + return tokens[:-1], [tokens[-1]] + last_token = text.split()[-1] + start_idx = text.rfind(last_token) + beginning_tokens = self.tokenizer.tokenize(text[:start_idx].strip()) + last_token = self.tokenizer.tokenize(' ' + last_token) + return beginning_tokens, last_token + + def __len__(self): + return len(self.tokens) + + def __getitem__(self, idx): + tokens = self.tokens[idx] + num_tokens = len(tokens) + pad_mask = [0] * num_tokens + labels = self.labels[idx] + pad_mask += [1] * len(labels) + tokens = tokens + labels + num_tokens = len(tokens) + if num_tokens < self.seq_len + 1: + num_pad = (self.seq_len + 1 - num_tokens) + pad_mask += [0] * (num_pad) + tokens += [self.pad_idx] * num_pad + pad_mask = np.array(pad_mask[1:]) + + return {'text': np.array(tokens), 'pad_mask': pad_mask} + + +def _build_lambada_dataset(): + """Build lambada dataset.""" + args = get_args() + tokenizer = get_tokenizer() + + assert len(args.valid_data) == 1 + val_dataset = _LambadaDataset(args.valid_data[0], tokenizer.eod, tokenizer, + args.seq_length, args.strict_lambada) + print_rank_0(' > found {} samples.'.format(len(val_dataset))) + + return val_dataset + + +def _build_wikitext103_dataset(): + """""" + args = get_args() + tokenizer = get_tokenizer() + + assert len(args.valid_data) == 1 + with open(args.valid_data[0], "rb") as reader: + entire_data = reader.read().decode('utf-8') + num_original_tokens = len(entire_data.strip().split(" ")) + entire_data = get_detokenizer(args.valid_data[0])(entire_data) + tokenized_data = tokenizer.tokenize(entire_data) + num_tokenized_tokens = len(tokenized_data) + + val_dataset = _LMDataset(tokenized_data, args.seq_length, tokenizer.eod, + num_original_tokens, num_tokenized_tokens, + args.overlapping_eval) + print_rank_0(' > number of original tokens: {}, number of detokenized ' + 'tokens: {}'.format(num_original_tokens, num_tokenized_tokens)) + + return val_dataset diff --git a/tasks/zeroshot_gpt/detokenizer.py b/tasks/zeroshot_gpt/detokenizer.py new file mode 100644 index 0000000..8cd7a5f --- /dev/null +++ b/tasks/zeroshot_gpt/detokenizer.py @@ -0,0 +1,67 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Detokenization.""" + +import re + + +def ptb_detokenizer(string): + string = string.replace(" '", "'") + string = string.replace(" \n", "\n") + string = string.replace("\n ", "\n") + string = string.replace(" n't", "n't") + string = string.replace(" N ", "1 ") + string = string.replace("$ 1", "$1") + string = string.replace("# 1", "#1") + return string + + +def wikitext_detokenizer(string): + # contractions + string = string.replace("s '", "s'") + string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) + # number separators + string = string.replace(" @-@ ", "-") + string = string.replace(" @,@ ", ",") + string = string.replace(" @.@ ", ".") + # punctuation + string = string.replace(" : ", ": ") + string = string.replace(" ; ", "; ") + string = string.replace(" . ", ". ") + string = string.replace(" ! ", "! ") + string = string.replace(" ? ", "? ") + string = string.replace(" , ", ", ") + # double brackets + string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) + string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) + string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) + string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) + string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) + # miscellaneous + string = string.replace("= = = =", "====") + string = string.replace("= = =", "===") + string = string.replace("= =", "==") + string = string.replace(" " + chr(176) + " ", chr(176)) + string = string.replace(" \n", "\n") + string = string.replace("\n ", "\n") + string = string.replace(" N ", " 1 ") + string = string.replace(" 's", "'s") + + return string + + +def lambada_detokenizer(string): + return string + + +_DETOKENIZERS = { + 'ptb': ptb_detokenizer, + 'wiki': wikitext_detokenizer, + 'lambada': lambada_detokenizer, +} + + +def get_detokenizer(path): + for key in _DETOKENIZERS.keys(): + if key in path: + return _DETOKENIZERS[key] diff --git a/tasks/zeroshot_gpt/evaluate.py b/tasks/zeroshot_gpt/evaluate.py new file mode 100644 index 0000000..e3c02b3 --- /dev/null +++ b/tasks/zeroshot_gpt/evaluate.py @@ -0,0 +1,210 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""GPT zero-shot evaluation.""" + +import math + +import torch + +from megatron.training import get_args +from megatron.training import print_rank_0, is_last_rank +from megatron.training import get_tokenizer +from megatron.core import parallel_state, tensor_parallel +from megatron.training.checkpointing import load_checkpoint +from megatron.legacy.model import GPTModel +from megatron.training import get_model +from megatron.training.utils import get_ltor_masks_and_position_ids, unwrap_model +from megatron.core.pipeline_parallel.p2p_communication import recv_forward, send_forward +from megatron.training.arguments import core_transformer_config_from_args +from tasks.finetune_utils import build_data_loader + +from .datasets import build_dataset + + +def get_model_provider(eval_metric): + """Based on evaluation metric set the parallel-output flag and + return the model provider.""" + + def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + config = core_transformer_config_from_args(get_args()) + + if eval_metric == 'loss': + parallel_output = True + elif eval_metric == 'accuracy': + parallel_output = False + else: + raise NotImplementedError('output type for {} evaluation metric ' + 'is not supported.'.format(eval_metric)) + + print_rank_0('building GPT model ...') + model = GPTModel(config, num_tokentypes=0, parallel_output=parallel_output, + pre_process=pre_process, post_process=post_process) + + return model + + return model_provider + + +def process_batch(batch): + """Process batch and produce inputs for the model.""" + args = get_args() + tokenizer = get_tokenizer() + + loss_mask = batch['pad_mask'].long().cuda().contiguous().byte() + tokens_ = batch['text'].long().cuda().contiguous() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. + attention_mask, _, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + + return tokens, labels, attention_mask, position_ids, loss_mask + + +def forward_step(batch, model, eval_metric, config): + """Forward step.""" + + # Get the batch. + tokens, labels, attention_mask, position_ids, loss_mask = process_batch( + batch) + + # Tell the model what our actual batch size will be + args = get_args() + args.micro_batch_size = len(labels) + + tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size) + input_tensor = recv_forward(tensor_shape, config) + + # Forward pass through the model. + unwrapped_model = unwrap_model(model) + unwrapped_model.set_input_tensor(input_tensor) + output = model(tokens, position_ids, attention_mask) + + send_forward(output, config) + + if parallel_state.is_pipeline_last_stage(): + # For loss, return the unreduced loss. + if eval_metric == 'loss': + losses = tensor_parallel.vocab_parallel_cross_entropy( + output.contiguous().float(), labels.contiguous()) + loss = torch.sum( + losses.view(-1) * loss_mask.contiguous().view(-1).float()) + return loss + + # For accuracy, return the number of correctly predicted samples. + if eval_metric == 'accuracy': + outputs = torch.argmax(output, -1) + correct = (outputs == labels).float() + correct[(1 - loss_mask).bool()] = 1 + correct = correct.prod(-1) + return correct.sum() + + raise NotImplementedError('forward method for evaluation metric {} ' + 'is not implemented.'.format(eval_metric)) + return None + + +def evaluate(data_loader, model, eval_metric): + """Evaluation.""" + args = get_args() + config = core_transformer_config_from_args(args) + + # Turn on evaluation mode which disables dropout. + model.eval() + + total_output = 0.0 + with torch.no_grad(): + # For all the batches in the dataset. + for iteration, batch in enumerate(data_loader): + if iteration % args.log_interval == 0: + print_rank_0('> working on iteration: {}'.format(iteration)) + # Forward evaluation. + output = forward_step(batch, model, eval_metric, config) + + # Reduce across processes. + if parallel_state.is_pipeline_last_stage(): + torch.distributed.all_reduce(output, + group=parallel_state.get_data_parallel_group()) + + total_output += output + + return total_output + + +def evaluate_and_print_results(task, data_loader, model, eval_metric): + """Evaluate and print results on screen.""" + + # Evaluate and get results. + output = evaluate(data_loader, model, eval_metric) + + string = ' validation results on {} | '.format(task) + if is_last_rank(): + if eval_metric == 'loss': + num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens + num_original_tokens = data_loader.dataset.num_original_tokens + val_loss = output / (num_tokenized_tokens - 1) + ppl = math.exp(min(20, val_loss)) + token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1) + adjusted_ppl = math.exp(min(20, val_loss * token_ratio)) + string += 'avg loss: {:.4E} | '.format(val_loss) + string += 'ppl: {:.4E} | '.format(ppl) + string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl) + string += 'token ratio: {} |'.format(token_ratio) + + elif eval_metric == 'accuracy': + num_examples = len(data_loader.dataset) + acc = output / num_examples + string += 'number correct: {:.4E} | '.format(output) + string += 'total examples: {:.4E} | '.format(num_examples) + string += 'avg accuracy: {:.4E}'.format(acc) + + else: + raise NotImplementedError('evaluation method for {} metric is not ' + 'implemented yet.'.format(eval_metric)) + + length = len(string) + 1 + print('-' * length) + print(string) + print('-' * length) + + +def main(): + """Main program.""" + args = get_args() + + if args.num_layers_per_virtual_pipeline_stage is not None: + print("Interleaved pipeline schedule is not yet supported for text generation.") + exit() + + if args.task == 'LAMBADA': + eval_metric = 'accuracy' + elif args.task == 'WIKITEXT103': + eval_metric = 'loss' + else: + raise NotImplementedError('{} task is not implemented.'.format( + args.task)) + + # Set up model and load checkpoint. + model = get_model(get_model_provider(eval_metric), wrap_with_ddp=False) + if args.load is not None: + _ = load_checkpoint(model, None, None) + + assert len(model) == 1, "Above condition should have caught this" + model = model[0] + + # Data stuff. + dataset = build_dataset(args.task) + dataloader = build_data_loader(dataset, args.micro_batch_size, + args.num_workers, drop_last=False) + + # Run evaluation. + evaluate_and_print_results(args.task, dataloader, model, eval_metric) + + print_rank_0('done :-)') diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/functional_tests/__init__.py b/tests/functional_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/functional_tests/jet_recipes/MR-bert.yaml b/tests/functional_tests/jet_recipes/MR-bert.yaml new file mode 100644 index 0000000..076160e --- /dev/null +++ b/tests/functional_tests/jet_recipes/MR-bert.yaml @@ -0,0 +1,58 @@ +type: basic +format_version: 1 +maintainers: [maanug] +loggers: [stdout] +spec: + name: "{model}_{scope}_\ + {'mcore_' if use_mcore else ''}{'te_' if use_te else ''}\ + tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\ + {'_resume_'+str(ckpt_format) if ckpt_resume else ''}\ + {'_'+args_meta if args_meta else ''}\ + _{platforms}_{nodes}N{gpus}G" + model: bert + variant: 345m + build: mcore-pyt + scope: mr + nodes: 1 + gpus: 8 + platforms: dgx_a100 + use_te: False + use_mcore: True + vp_size: null + extra_args: null + args_meta: null + micro_batch_size: 4 # MBS + batch_size: 128 # GBS, JET schema requires 'batch_size' + precision: bf16 + time_limit: 1200 + artifacts: {/workspace/data/bert_data: text/the_pile/bert_shard00} + ckpt_format: torch_dist + ckpt_resume: 0 + script: |- + ls + cd /workspace/megatron-lm + + ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh \ + DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence \ + CHECKPOINT_PATH=/workspace/checkpoints \ + TENSORBOARD_DIR={assets_dir} \ + DATA_CACHE=/workspace/data/index-cache \ + USE_TE={"1" if use_te else "0"} \ + TP_SIZE={tp_size} \ + PP_SIZE={pp_size} \ + NUM_NODES={nodes} \ + MAX_STEPS={100 if ckpt_resume else 50} \ + USE_CORE={"1" if use_mcore else "0"} \ + VP_SIZE={vp_size if vp_size is not None else '""'} \ + MBS={micro_batch_size} \ + GBS={batch_size} \ + CHECKPOINT_RESUME_TEST={ckpt_resume} \ + JOB_NAME={name} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} +products: + # MCore + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ['"--spec local"'], args_meta: ["local_spec"]} + # Non-MCore + - {use_mcore: [False], tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], ckpt_format: [torch], extra_args: ['"--transformer-impl local"']} + - {use_mcore: [False], tp_size: [1], pp_size: [4], vp_size: [2], ckpt_resume: [0, 1], ckpt_format: [torch], extra_args: ['"--transformer-impl local"']} diff --git a/tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml b/tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml new file mode 100644 index 0000000..ddf73dc --- /dev/null +++ b/tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml @@ -0,0 +1,46 @@ +type: basic +format_version: 1 +maintainers: [maanug] +loggers: [stdout] +launchers: + type:slurm: + ntasks_per_node: '{gpus}' + no_container_mount_home: 'true' +spec: + name: "{model}_{variant}_{scope}_\ + mbs{mbs}_gbs{gbs}_\ + {'mcore_' if use_mcore else ''}{'te_' if use_te else ''}\ + tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\ + {'_'+args_meta if args_meta else ''} + _{platforms}_{nodes}N{gpus}G" + model: gpt3-nemo + variant: 126m + build: mcore-nemo + scope: mr + nodes: 1 + gpus: 8 + platforms: dgx_a100 + steps: 50 + extra_args: null + args_meta: null + precision: bf16 + time_limit: 1200 + use_mcore: True + use_te: True + vp_size: null + script: |- + cd /opt/NeMo + + /opt/megatron-lm/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh \ + TP_SIZE={tp_size} \ + PP_SIZE={pp_size} \ + NUM_NODES={nodes} \ + MAX_STEPS={steps} \ + VP_SIZE={vp_size if vp_size is not None else '""'} \ + MBS={mbs} \ + GBS={gbs} \ + JOB_NAME={name} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} +products: + - {tp_size: [1], pp_size: [1], mbs: [4], gbs: [64], vp_size: [null]} + - {tp_size: [2], pp_size: [4], mbs: [1], gbs: [8], vp_size: [3], extra_args: ['"model.sequence_parallel=True model.overlap_p2p_comm=True model.batch_p2p_comm=False"'], args_meta: ["seq_par_overlap_p2p"]} diff --git a/tests/functional_tests/jet_recipes/MR-gpt.yaml b/tests/functional_tests/jet_recipes/MR-gpt.yaml new file mode 100644 index 0000000..97a44ed --- /dev/null +++ b/tests/functional_tests/jet_recipes/MR-gpt.yaml @@ -0,0 +1,119 @@ +type: basic +format_version: 1 +maintainers: [maanug] +loggers: [stdout] +spec: + name: "{model}_{scope}_\ + {'mcore_' if use_mcore else ''}{'te_' if use_te else ''}\ + tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\ + {'_resume_'+str(ckpt_format) if ckpt_resume else ''}\ + {'_reshard_'+str(reshard_tp_size)+'x'+str(reshard_pp_size)+'x'+str(reshard_ep_size) if reshard_tp_size or reshard_pp_size or reshard_ep_size else ''}\ + {'_'+args_meta if args_meta else ''}\ + {'_uninstall_te' if uninstall_te==1 else ''}\ + _{platforms}_{nodes}N{gpus}G" + model: gpt3 + variant: 345m + build: mcore-pyt + scope: mr + nodes: 1 + gpus: 8 + platforms: dgx_a100 + use_te: True + use_mcore: True + vp_size: null + ep_size: null + extra_args: null + args_meta: null + micro_batch_size: 4 # MBS + batch_size: 32 # GBS, JET schema requires 'batch_size' + moe_grouped_gemm: 0 + precision: bf16 + time_limit: 1500 + artifacts: {/workspace/data/gpt3_data: text/the_pile/shard00} + ckpt_format: torch_dist + ckpt_resume: 0 + allow_nondeterministic: 0 + uninstall_te: 0 + gradient_accumulation_fusion: False + reshard_tp_size: null + reshard_pp_size: null + reshard_ep_size: null + skip_pytest: null + script: |- + ls + cd /workspace/megatron-lm + + if [[ {uninstall_te} == 1 ]]; then + pip uninstall -y transformer_engine + pip uninstall -y Apex ## TODO: remove once Apex dependency has been removed completely + fi + + ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh \ + DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document \ + CHECKPOINT_PATH=/workspace/checkpoints \ + TENSORBOARD_DIR={assets_dir} \ + VOCAB_FILE=/workspace/data/gpt3_data/bpe/vocab.json \ + MERGE_FILE=/workspace/data/gpt3_data/bpe/merges.txt \ + DATA_CACHE=/workspace/data/index-cache \ + USE_TE={"1" if use_te else "0"} \ + USE_GA={"1" if gradient_accumulation_fusion else "0"} \ + TP_SIZE={tp_size} \ + PP_SIZE={pp_size} \ + NUM_NODES={nodes} \ + MAX_STEPS={100 if ckpt_resume else 50} \ + USE_CORE={"1" if use_mcore else "0"} \ + VP_SIZE={vp_size if vp_size is not None else '""'} \ + EP_SIZE={ep_size if ep_size is not None else '""'} \ + MBS={micro_batch_size} \ + GBS={batch_size} \ + MOE_GROUPED_GEMM={moe_grouped_gemm} \ + CKPT_FORMAT={ckpt_format} \ + CHECKPOINT_RESUME_TEST={ckpt_resume} \ + ALLOW_NONDETERMINISTIC={allow_nondeterministic} \ + JOB_NAME={name} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} \ + {'RESUME_OVERRIDE_TP_SIZE='+str(reshard_tp_size)+' RESUME_OVERRIDE_PP_SIZE='+str(reshard_pp_size) if reshard_tp_size or reshard_pp_size else ''} \ + {'RESUME_OVERRIDE_EP_SIZE='+str(reshard_ep_size) if reshard_ep_size else ''} \ + {'SKIP_PYTEST=1' if skip_pytest else ''} +products: + # MCore + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--no-create-attention-mask-in-dataloader"], args_meta: ["no_create_attention_mask_in_dataloader"]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--no-mmap-bin-files --no-ckpt-fully-parallel-save"], args_meta: ["no_mmap_bin_files"]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--ddp-average-in-collective"], args_meta: ["ddp_average_in_collective"]} + - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1]} + - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], extra_args: ["--calculate-per-token-loss"], args_meta: ["calculate_per_token_loss"]} + - {tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--qk-layernorm --test-mode"'], args_meta: ["qk_layernorm_test_mode"]} + - {tp_size: [1], pp_size: [2], ckpt_resume: [0, 1], extra_args: ['"--position-embedding-type rope --no-ckpt-fully-parallel-save"'], args_meta: ["rope_embeddings"]} + - {tp_size: [1], pp_size: [2], ckpt_resume: [0, 1], extra_args: ['"--position-embedding-type rope --rotary-interleaved --no-rope-fusion"'], args_meta: ["rope_embeddings_interleaved_no_fusion"]} + - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ['"--disable-bias-linear --async-save"'], args_meta: ["disable_bias_linear"]} + - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ['"--swiglu --ckpt-fully-parallel-load --async-save"'], args_meta: ["swiglu"]} + - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ["--untie-embeddings-and-output-weights"], args_meta: ["untie_embeddings_and_outputs"]} + - {tp_size: [1], pp_size: [4], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --ckpt-fully-parallel-load"'], args_meta: ["sequence_parallel"]} + - {tp_size: [1], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--recompute-granularity full --recompute-method uniform --recompute-num-layers 1"'], args_meta: ["uniform_full_recompute"]} + - {tp_size: [2], pp_size: [1,2], ckpt_resume: [0, 1], allow_nondeterministic: [1], extra_args: ['"--context-parallel-size 2 --sequence-parallel --hidden-dropout 0.0 --attention-dropout 0.0"'], args_meta: ["cp2_nondeterministic"]} + - {tp_size: [2], pp_size: [1], ep_size: [2], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --num-experts 8 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1 --ckpt-fully-parallel-load"'], args_meta: ["te_8experts2parallel"]} + - {tp_size: [2], pp_size: [1], ep_size: [2], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --num-experts 8 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1 --ckpt-fully-parallel-load"'], args_meta: ["te_8experts2parallel_dist_optimizer"]} + ## TODO: MoE GroupedMLP dist-ckpt not supported, so must use 'torch' ckpt format + - {tp_size: [2], pp_size: [1], ep_size: [2], ckpt_resume: [0, 1], extra_args: ['"--no-ckpt-fully-parallel-save --moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1 --overlap-grad-reduce --overlap-param-gather"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM"]} + - {tp_size: [2], pp_size: [1], ep_size: [2], ckpt_resume: [0, 1], extra_args: ['"--no-ckpt-fully-parallel-save --moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_groupedGEMM"]} + - {tp_size: [2], pp_size: [1], ep_size: [2], ckpt_resume: [0, 1], extra_args: ['"--disable-bias-linear --sequence-parallel --num-experts 8 --moe-router-load-balancing-type aux_loss --moe-router-topk 2 --moe-aux-loss-coeff 1e-2"'], moe_grouped_gemm: [1], args_meta: ["te_8experts2parallel_top2router"]} + - {tp_size: [1], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --no-ckpt-fully-parallel-save --async-save"'], args_meta: ["dist_optimizer"]} + - {tp_size: [1], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --no-mmap-bin-files"'], args_meta: ["dist_optimizer_no_mmap_bin_files"]} + - {tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce"'], args_meta: ["dist_optimizer_overlap_grad_reduce"]} + - {tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"'], args_meta: ["dist_optimizer_overlap_grad_reduce_param_gather"]} + - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch], extra_args: ['"--decoupled-lr 0.0002"'], args_meta: ["decoupled_lr"]} + - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce"'], args_meta: ["dist_optimizer_overlap_grad_reduce"]} + - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce --untie-embeddings-and-output-weights"'], args_meta: ["dist_optimizer_overlap_grad_reduce_untied"]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], gradient_accumulation_fusion: [True], extra_args: ['"--defer-embedding-wgrad-compute --wgrad-deferral-limit 2"'], args_meta: ["defer_embedding_wgrad_compute"]} + - {tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather --check-weight-hash-across-dp-replicas-interval 10 --ckpt-fully-parallel-load"'], args_meta: ["dist_optimizer_overlap_grad_reduce_param_gather"]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--cross-entropy-loss-fusion"], args_meta: ["cross_entropy_loss_fusion"]} + # Mcore, no TE + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], uninstall_te: [1], use_te: [False], extra_args: ['"--no-persist-layer-norm --no-masked-softmax-fusion"'], skip_pytest: [1]} ## TODO(ashors): add baseline + # Non-MCore, only legacy checkpoints supported + - {use_mcore: [False], use_te: [False, True], tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], ckpt_format: [torch]} + - {use_mcore: [False], use_te: [False], tp_size: [1], pp_size: [4], vp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch]} + # TPxPP resharding tests (TP changing results in non-deterministic losses) + - {tp_size: [2], pp_size: [2], ckpt_resume: [1], allow_nondeterministic: [1], reshard_tp_size: [1], reshard_pp_size: [4]} + - {tp_size: [4], pp_size: [2], ckpt_resume: [1], allow_nondeterministic: [1], reshard_tp_size: [8], reshard_pp_size: [1], extra_args: ['"--use-distributed-optimizer --async-save --ckpt-fully-parallel-save"']} + - {tp_size: [1], pp_size: [2], ep_size: [2], ckpt_resume: [1], allow_nondeterministic: [1], reshard_tp_size: [2], reshard_pp_size: [1], reshard_ep_size: [4], extra_args: ['"--sequence-parallel --num-experts 8 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1 --ckpt-fully-parallel-save --ckpt-fully-parallel-load"'], args_meta: ["te_8experts2parallel_dist_optimizer"]} diff --git a/tests/functional_tests/jet_recipes/MR-multimodal.yaml b/tests/functional_tests/jet_recipes/MR-multimodal.yaml new file mode 100644 index 0000000..d28e62b --- /dev/null +++ b/tests/functional_tests/jet_recipes/MR-multimodal.yaml @@ -0,0 +1,55 @@ +type: basic +format_version: 1 +maintainers: [trintamaki] +loggers: [stdout] +spec: + name: "{model}_{variant}_{scope}_\ + {'mcore_' if use_mcore else ''}{'te_' if use_te else ''}\ + tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\ + {'_resume_'+str(ckpt_format) if ckpt_resume else ''}\ + {'_'+args_meta if args_meta else ''}\ + _{platforms}_{nodes}N{gpus}G" + model: multimodal + variant: llava + build: mcore-pyt + scope: mr + nodes: 1 + gpus: 8 + platforms: dgx_a100 + use_te: True + use_mcore: True + vp_size: null + extra_args: null + args_meta: null + micro_batch_size: 4 # MBS + batch_size: 32 # GBS, JET schema requires 'batch_size' + moe_grouped_gemm: 0 + precision: bf16 + time_limit: 1200 + ckpt_format: torch + ckpt_resume: 0 + allow_nondeterministic: 0 + script: |- + ls + cd /workspace/megatron-lm + + ./tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh \ + CHECKPOINT_PATH=/workspace/checkpoints \ + TENSORBOARD_DIR={assets_dir} \ + USE_TE={"1" if use_te else "0"} \ + TP_SIZE={tp_size} \ + PP_SIZE={pp_size} \ + NUM_NODES={nodes} \ + MAX_STEPS={100 if ckpt_resume else 50} \ + USE_CORE={"1" if use_mcore else "0"} \ + VP_SIZE={vp_size if vp_size is not None else '""'} \ + MBS={micro_batch_size} \ + GBS={batch_size} \ + MOE_GROUPED_GEMM={moe_grouped_gemm} \ + CKPT_FORMAT={ckpt_format} \ + CHECKPOINT_RESUME_TEST={ckpt_resume} \ + ALLOW_NONDETERMINISTIC={allow_nondeterministic} \ + JOB_NAME={name} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} +products: + - {use_te: [True], tp_size: [1], pp_size: [1], ckpt_resume: [0, 1]} diff --git a/tests/functional_tests/jet_recipes/MR-t5.yaml b/tests/functional_tests/jet_recipes/MR-t5.yaml new file mode 100644 index 0000000..d8831fe --- /dev/null +++ b/tests/functional_tests/jet_recipes/MR-t5.yaml @@ -0,0 +1,52 @@ +type: basic +format_version: 1 +maintainers: [maanug] +loggers: [stdout] +spec: + name: "{model}_{variant}_{scope}_\ + {'mcore_' if use_mcore else ''}{'te_' if use_te else ''}\ + tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\ + {'_resume_'+str(ckpt_format) if ckpt_resume else ''}\ + {'_'+args_meta if args_meta else ''}\ + _{platforms}_{nodes}N{gpus}G" + model: t5 + variant: 220m + build: mcore-pyt + scope: mr + nodes: 1 + gpus: 8 + platforms: dgx_a100 + use_te: False + use_mcore: True + vp_size: null + extra_args: null + args_meta: null + micro_batch_size: 4 # MBS + batch_size: 32 # GBS, JET schema requires 'batch_size' + precision: bf16 + time_limit: 1800 + ckpt_format: torch + ckpt_resume: 0 + artifacts: {/workspace/data/t5_data: text/the_pile/t5_shard00} + script: |- + ls + cd /workspace/megatron-lm + + ./tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh \ + DATA_PATH="/workspace/data/t5_data/my-t5_00_text_document" \ + CHECKPOINT_PATH=/workspace/checkpoints \ + TENSORBOARD_DIR={assets_dir} \ + DATA_CACHE=/workspace/data/index-cache \ + USE_TE={"1" if use_te else "0"} \ + TP_SIZE={tp_size} \ + PP_SIZE={pp_size} \ + NUM_NODES={nodes} \ + MAX_STEPS=100 \ + USE_CORE={"1" if use_mcore else "0"} \ + VP_SIZE={vp_size if vp_size is not None else '""'} \ + MBS={micro_batch_size} \ + GBS={batch_size} \ + JOB_NAME={name} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} +products: + - {use_te: [True], tp_size: [1], pp_size: [1], vp_size: [1], extra_args: ["--calculate-per-token-loss"], args_meta: ["calculate_per_token_loss"]} diff --git a/tests/functional_tests/jet_recipes/build-pyt.yaml b/tests/functional_tests/jet_recipes/build-pyt.yaml new file mode 100644 index 0000000..d24836e --- /dev/null +++ b/tests/functional_tests/jet_recipes/build-pyt.yaml @@ -0,0 +1,23 @@ +type: build +format_version: 1 +maintainers: [maanug] +spec: + name: mcore-pyt + platforms: [linux/amd64] + source: + # The image tag will be added via `jet-tests.yaml` + # Tags are one of {buildcache, $CI_PIPELINE_ID} + image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci + + +--- +type: build +format_version: 1 +maintainers: [maanug] +spec: + name: mcore-nemo + platforms: [linux/amd64] + source: + # The image tag will be added via `jet-tests.yaml` + # Tags are one of {buildcache, $CI_PIPELINE_ID} + image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/local-generator.py b/tests/functional_tests/jet_recipes/local-generator.py new file mode 100644 index 0000000..513c6ab --- /dev/null +++ b/tests/functional_tests/jet_recipes/local-generator.py @@ -0,0 +1,84 @@ +import argparse +import itertools +import os +import re +import yaml + +SBATCH_TEMPLATE = ''' +srun --container-image nvcr.io/nvidia/pytorch:24.01-py3 \\ + --container-mounts "{}:{},{}:/workspace/megatron-lm" \\ + bash -c \" + \n{} +\" +''' + + +def eval_name(**globals): + name_template = globals['name'] + + to_eval = re.findall("{.*?}", name_template) + to_eval = [x.strip('{}') for x in to_eval] + str_to_format = re.sub("{.*?}", '{}', name_template) + format_contents = [eval(x, globals) for x in to_eval] + + return str_to_format.format(*format_contents) + + +def save_script(save_dir, format, sbatch_dataset_path, sbatch_mlm_path, **globals): + script = globals['script'] + + globals['name'] = eval_name(**globals) + globals['key'] = "basic/" + globals['name'].lower().replace('_', '-') + globals['assets_dir'] = f"/assets/{globals['key']}" + if format == 'sbatch' and globals['extra_args'] is not None: + globals['extra_args'] = globals['extra_args'].replace('"', "'") + + # gather and evaluate all substitutions marked by braces in script in order of ocurrence + to_eval = re.findall("{.*}", script) + to_eval = [x.strip('{}') for x in to_eval] + str_to_format = re.sub("{.*}", '{}', script) + format_contents = [eval(x, globals) for x in to_eval] + + file_content = str_to_format.format(*format_contents) + if not os.path.exists(save_dir): + os.mkdir(save_dir) + with open(os.path.join(save_dir, globals['name']+".sh"), 'w') as f: + f.write("#!/bin/bash\n") + + if format == 'sbatch': + dataset_mount = list(globals['artifacts'].keys())[0] if 'artifacts' in globals else "/path/to/mount/dataset" + sbatch_content = SBATCH_TEMPLATE.format(sbatch_dataset_path, dataset_mount, sbatch_mlm_path, file_content) + f.write(sbatch_content) + else: + f.write(file_content) + + +def main(src_yaml, save_dir, format, sbatch_dataset_path, sbatch_mlm_path): + # load yaml + with open(src_yaml, 'r') as f: + raw_content = yaml.safe_load(f) + + spec_template = raw_content['spec'] + for prod in raw_content['products']: + config = spec_template.copy() + # expand cartesian products into list of all config overrides + for replace in itertools.product(*prod.values()): + # update config dict with overrides from products + config.update({k: v for k, v in zip(prod.keys(), replace)}) + save_script(save_dir, format, sbatch_dataset_path, sbatch_mlm_path, **config) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog='Functional tests script generator', + description="""Generates bash or sbatch scripts + from yamls in this directory to run functional tests locally""") + parser.add_argument('src_yaml', help="Yaml file in this directory from which to generate test scripts") + parser.add_argument('--save_dir', required=False, default='./scripts', + help='Directory where scripts will be saved to. Defaults to ./scripts') + parser.add_argument('--format', required=False, default='bash', choices=['bash', 'sbatch'], help="Script format") + parser.add_argument('--sbatch-dataset-path', required=False, default='/path/to/dataset') + parser.add_argument('--sbatch-megatronlm-path', required=False, default='/path/to/megatron-lm') + args = parser.parse_args() + + main(args.src_yaml, args.save_dir, args.format, args.sbatch_dataset_path, args.sbatch_megatronlm_path) diff --git a/tests/functional_tests/jet_recipes/monthly-t5.yaml b/tests/functional_tests/jet_recipes/monthly-t5.yaml new file mode 100644 index 0000000..3dd6d6f --- /dev/null +++ b/tests/functional_tests/jet_recipes/monthly-t5.yaml @@ -0,0 +1,56 @@ +type: basic +format_version: 1 +maintainers: [maanug] +loggers: [stdout] +spec: + name: "{model}_{variant}_{scope}_{platforms}_{nodes}N{gpus}G_\ + {'mcore_' if use_mcore else ''}{'te_' if use_te else ''}\ + tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\ + {'_resume_'+str(ckpt_format) if ckpt_resume else ''}\ + {'_'+args_meta if args_meta else ''}" + model: t5 + variant: 220m + build: mcore-pyt + scope: monthly + nodes: 1 + gpus: 8 + platforms: dgx_a100 + use_te: False + use_mcore: True + vp_size: 1 + extra_args: null + args_meta: null + micro_batch_size: 4 # MBS + batch_size: 32 # GBS, JET schema requires 'batch_size' + precision: bf16 + time_limit: 1800 + artifacts: {/workspace/data/t5_data: text/the_pile/t5_shard00} + ckpt_format: torch + ckpt_resume: 0 + script: |- + ls + cd /workspace/megatron-lm + + ./tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh \ + DATA_PATH="/workspace/data/t5_data/my-t5_00_text_document" \ + CHECKPOINT_PATH=/workspace/checkpoints \ + TENSORBOARD_DIR={assets_dir} \ + DATA_CACHE=/workspace/data/index-cache \ + USE_TE={"1" if use_te else "0"} \ + TP_SIZE={tp_size} \ + PP_SIZE={pp_size} \ + NUM_NODES={nodes} \ + MAX_STEPS=100 \ + USE_CORE={"1" if use_mcore else "0"} \ + VP_SIZE={vp_size if vp_size is not None else '""'} \ + MBS={micro_batch_size} \ + GBS={batch_size} \ + CHECKPOINT_RESUME_TEST={ckpt_resume} \ + JOB_NAME={name} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} +products: + - {tp_size: [1,2], pp_size: [1], vp_size: [1] } + - {use_te: [True], tp_size: [2], pp_size: [1], vp_size: [1]} + - {use_te: [True], tp_size: [2], pp_size: [1], vp_size: [1], extra_args: ["--sequence-parallel"], args_meta: ["sequence_parallel"]} + # Checkpoint resume + - {ckpt_resume: [1], use_te: [False, True], tp_size: [1], pp_size: [1], vp_size: [1]} diff --git a/tests/functional_tests/jet_recipes/nightly-bert.yaml b/tests/functional_tests/jet_recipes/nightly-bert.yaml new file mode 100644 index 0000000..29d2857 --- /dev/null +++ b/tests/functional_tests/jet_recipes/nightly-bert.yaml @@ -0,0 +1,52 @@ +type: basic +format_version: 1 +maintainers: [maanug] +loggers: [stdout] +spec: + name: "{model}_{variant}_{scope}_{platforms}_{nodes}N{gpus}G_\ + {'mcore_' if use_mcore else ''}{'te_' if use_te else ''}\ + tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\ + {'_resume_'+str(ckpt_format) if ckpt_resume else ''}\ + {'_'+args_meta if args_meta else ''}" + model: bert + variant: 345m + build: mcore-pyt + scope: nightly + nodes: 1 + gpus: 8 + platforms: dgx_a100 + use_te: False + use_mcore: True + vp_size: null + extra_args: null + args_meta: null + micro_batch_size: 4 # MBS + batch_size: 128 # GBS, JET schema requires 'batch_size' + time_limit: 1200 + ckpt_format: torch + ckpt_resume: 0 + artifacts: {/workspace/data/bert_data: text/the_pile/bert_shard00} + script: |- + ls + cd /workspace/megatron-lm + + ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh \ + DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence \ + CHECKPOINT_PATH=/workspace/checkpoints \ + TENSORBOARD_DIR={assets_dir} \ + DATA_CACHE=/workspace/data/index-cache \ + USE_TE={"1" if use_te else "0"} \ + TP_SIZE={tp_size} \ + PP_SIZE={pp_size} \ + NUM_NODES={nodes} \ + MAX_STEPS={100 if ckpt_resume else 50} \ + USE_CORE={"1" if use_mcore else "0"} \ + VP_SIZE={vp_size if vp_size is not None else '""'} \ + MBS={micro_batch_size} \ + GBS={batch_size} \ + JOB_NAME={name} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} +products: + - {tp_size: [1], pp_size: [4], vp_size: [2]} + - {use_mcore: [True, False], tp_size: [4], pp_size: [1]} + - {use_mcore: [True, False], tp_size: [1], pp_size: [2]} diff --git a/tests/functional_tests/jet_recipes/nightly-gpt.yaml b/tests/functional_tests/jet_recipes/nightly-gpt.yaml new file mode 100644 index 0000000..5b072ea --- /dev/null +++ b/tests/functional_tests/jet_recipes/nightly-gpt.yaml @@ -0,0 +1,69 @@ +type: basic +format_version: 1 +maintainers: [maanug] +loggers: [stdout] +spec: + name: "{model}_{variant}_{scope}_{platforms}_{nodes}N{gpus}G_\ + {'mcore_' if use_mcore else ''}{'te_' if use_te else ''}\ + tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\ + {'_resume_'+str(ckpt_format) if ckpt_resume else ''}\ + {'_'+args_meta if args_meta else ''}" + model: gpt3 + variant: 345m + build: mcore-pyt + scope: nightly + nodes: 1 + gpus: 8 + platforms: dgx_a100 + use_te: False + use_mcore: True + vp_size: null + extra_args: null + args_meta: null + micro_batch_size: 4 # MBS + batch_size: 32 # GBS, JET schema requires 'batch_size' + moe_grouped_gemm: 0 + time_limit: 1200 + artifacts: {/workspace/data/gpt3_data: text/the_pile/shard00} + ckpt_format: torch + ckpt_resume: 0 + script: |- + ls + cd /workspace/megatron-lm + + ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh \ + DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document \ + CHECKPOINT_PATH=/workspace/checkpoints \ + TENSORBOARD_DIR={assets_dir} \ + VOCAB_FILE=/workspace/data/gpt3_data/bpe/vocab.json \ + MERGE_FILE=/workspace/data/gpt3_data/bpe/merges.txt \ + DATA_CACHE=/workspace/data/index-cache \ + USE_TE={"1" if use_te else "0"} \ + TP_SIZE={tp_size} \ + PP_SIZE={pp_size} \ + NUM_NODES={nodes} \ + MAX_STEPS={100 if ckpt_resume else 50} \ + USE_CORE={"1" if use_mcore else "0"} \ + VP_SIZE={vp_size if vp_size is not None else '""'} \ + MBS={micro_batch_size} \ + GBS={batch_size} \ + MOE_GROUPED_GEMM={moe_grouped_gemm} \ + CKPT_FORMAT={ckpt_format} \ + CHECKPOINT_RESUME_TEST={ckpt_resume} \ + JOB_NAME={name} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} +products: + - {use_mcore: [True], tp_size: [4], pp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch_dist]} + - {use_mcore: [False], tp_size: [4], pp_size: [1], ckpt_resume: [0, 1]} + - {use_mcore: [True], tp_size: [4], pp_size: [1], ckpt_resume: [1]} + - {use_mcore: [True], tp_size: [1], pp_size: [2,4], ckpt_resume: [0, 1], ckpt_format: [torch_dist]} + - {use_mcore: [False], tp_size: [1], pp_size: [2,4], ckpt_resume: [0, 1]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], ckpt_format: [torch_dist], extra_args: ['"--num-experts 2 --sequence-parallel --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["te_2experts"]} + - {tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], ckpt_format: [torch_dist], extra_args: ['"--sequence-parallel --num-experts 4 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["te_4experts2parallel"]} + - {tp_size: [1], pp_size: [1], ckpt_resume: [0, 1], ckpt_format: [torch_dist], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"'], args_meta: ["dist_optimizer_overlap_grad_reduce_param_gather"]} +# Non-MCore + - {use_mcore: [False], tp_size: [1,4], pp_size: [1], extra_args: ["--overlap-grad-reduce"], args_meta: ["overlap_grad_reduce"]} + - {use_mcore: [False], tp_size: [1], pp_size: [1], extra_args: ['"--use-distributed-optimizer --overlap-grad-reduce"'], args_meta: ["dist_optimizer_overlap_grad_reduce"]} + - {use_mcore: [False], tp_size: [1], pp_size: [4], vp_size: [null, 1], extra_args: ["--overlap-grad-reduce"], args_meta: ["overlap_grad_reduce"]} + - {use_mcore: [False], tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ["--overlap-grad-reduce"], args_meta: ["overlap_grad_reduce"]} + - {use_mcore: [False], tp_size: [2], pp_size: [2], ckpt_resume: [0, 1], extra_args: ['"--sequence-parallel --num-experts 4 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'], args_meta: ["4experts"]} diff --git a/tests/functional_tests/jet_recipes/weekly-gpt.yaml b/tests/functional_tests/jet_recipes/weekly-gpt.yaml new file mode 100644 index 0000000..a0e3cf5 --- /dev/null +++ b/tests/functional_tests/jet_recipes/weekly-gpt.yaml @@ -0,0 +1,60 @@ +type: basic +format_version: 1 +maintainers: [shreyasm] +loggers: [stdout] +spec: + name: "{model}_{variant}_{scope}_{platforms}_{nodes}N{gpus}G_\ + {'mcore_' if use_mcore else ''}{'nondet_' if allow_nondeterministic else ''}\ + tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\ + {'_resume_'+str(ckpt_format) if ckpt_resume else ''}\ + {'_'+args_meta if args_meta else ''}" + model: gpt3 + variant: 345m + build: mcore-pyt + scope: weekly + nodes: 1 + gpus: 8 + platforms: dgx_h100 + use_mcore: True + vp_size: null + extra_args: null + args_meta: null + micro_batch_size: 2 # MBS + batch_size: 128 # GBS, JET schema requires 'batch_size' + moe_grouped_gemm: 0 + allow_nondeterministic: False + precision: bf16 + time_limit: 10000 # 2.5 hours + ckpt_format: torch + ckpt_resume: 0 + artifacts: {/workspace/data/gpt3_data: text/the_pile/shard00} + script: |- + ls + cd /workspace/megatron-lm + + ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh \ + DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document \ + VOCAB_FILE=/workspace/data/gpt3_data/bpe/vocab.json \ + MERGE_FILE=/workspace/data/gpt3_data/bpe/merges.txt \ + CHECKPOINT_PATH=/workspace/checkpoints \ + TENSORBOARD_DIR={assets_dir} \ + DATA_CACHE=/workspace/data/index-cache \ + TP_SIZE={tp_size} \ + PP_SIZE={pp_size} \ + NUM_NODES={nodes} \ + MAX_STEPS=2000 \ + USE_CORE={"1" if use_mcore else "0"} \ + USE_FP8={"1" if precision == "fp8" else "0"} \ + VP_SIZE={vp_size if vp_size is not None else '""'} \ + MBS={micro_batch_size} \ + GBS={batch_size} \ + MOE_GROUPED_GEMM={moe_grouped_gemm} \ + ALLOW_NONDETERMINISTIC={"1" if allow_nondeterministic else "0"} \ + JOB_NAME={name} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} +products: + - {use_mcore: [True], precision: [bf16], tp_size: [1], pp_size: [1], allow_nondeterministic: [False], args_meta: ["bf16_baseline"]} + - {use_mcore: [True], precision: [fp8], tp_size: [1], pp_size: [1], allow_nondeterministic: [False, True], args_meta: ["fp8_no_model_parallel"]} + - {use_mcore: [True], precision: [fp8], tp_size: [1], pp_size: [2], allow_nondeterministic: [False], args_meta: ["fp8_pp"]} + - {use_mcore: [True], precision: [fp8], tp_size: [2, 4], pp_size: [2], allow_nondeterministic: [False], args_meta: ["fp8_tp_pp"]} + - {use_mcore: [True], precision: [fp8], tp_size: [2], pp_size: [2], allow_nondeterministic: [False], extra_args: [" --sequence-parallel"], args_meta: ["fp8_tp_pp_sp"]} diff --git a/tests/functional_tests/python_test_utils/__init__.py b/tests/functional_tests/python_test_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/functional_tests/python_test_utils/common.py b/tests/functional_tests/python_test_utils/common.py new file mode 100644 index 0000000..8f93db6 --- /dev/null +++ b/tests/functional_tests/python_test_utils/common.py @@ -0,0 +1,81 @@ +import enum +import glob +import json +import os + +from tensorboard.backend.event_processing import event_accumulator + +# By default TB tries to be smart about what to load in memory to avoid OOM +# Since we expect every step to be there when we do our comparisons, we explicitly +# set the size guidance to 0 so that we load everything. It's okay given our tests +# are small/short. +SIZE_GUIDANCE = { + event_accumulator.TENSORS: 0, + event_accumulator.SCALARS: 0, +} + + +class TypeOfTest(enum.Enum): + APPROX = 1 + DETERMINISTIC = 2 + + +TYPE_OF_TEST_TO_METRIC = { + TypeOfTest.DETERMINISTIC: ["lm loss", "num-zeros"], + TypeOfTest.APPROX: ["lm loss", "iteration-time", "mem-allocated-bytes"], +} + +METRIC_TO_THRESHOLD = { + "iteration-time": 0.3, + "mem-allocated-bytes": 3 * 1000 * 1000, # 3MB + "lm loss": 0.05 +} + +ALLOW_NONDETERMINISTIC = bool(int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO"))) +LOGS_DIR = os.getenv("LOGS_DIR") + +def read_tb_logs_as_list(path, index=0): + """Reads a TensorBoard Events file from the input path, and returns the + summary specified as input as a list. + + Args: + path: str, path to the dir where the events file is located. + summary_name: str, name of the summary to read from the TB logs. + + Returns: + summary_list: list, the values in the read summary list, formatted as a list. + """ + files = glob.glob(f"{path}/events*tfevents*") + files += glob.glob(f"{path}/results/events*tfevents*") + + if not files: + raise FileNotFoundError( + f"File not found matching: {path}/events* || {path}/results/events*" + ) + + files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x))) + + event_file = files[index] + ea = event_accumulator.EventAccumulator(event_file, size_guidance=SIZE_GUIDANCE) + ea.Reload() + + summaries = {} + for scalar_name in ea.Tags()["scalars"]: + summaries[scalar_name] = [round(x.value, 5) for x in ea.Scalars(scalar_name)] + + print( + f"\nObtained the following list for {summaries[scalar_name]} ------------------" + ) + print(summaries) + return summaries + + +def load_expected_data(): + expected_metrics_file = os.getenv("EXPECTED_METRICS_FILE") + + with open(expected_metrics_file) as f: + if os.path.exists(expected_metrics_file): + with open(expected_metrics_file) as f: + return json.load(f) + else: + print(f"File {expected_metrics_file} not found!") \ No newline at end of file diff --git a/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py b/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py new file mode 100644 index 0000000..9b2d08b --- /dev/null +++ b/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py @@ -0,0 +1,32 @@ +import os + +os.environ["OPENBLAS_NUM_THREADS"] = "1" +import json +import sys + +from tests.functional_tests.python_test_utils.common import read_tb_logs_as_list + + +def collect_train_test_metrics(logs_dir, run_name): + summaries = read_tb_logs_as_list(logs_dir) + + train_metrics = { + metric_name: { + "start_step": 0, + "end_step": len(metric_values), + "step_interval": 5, + "values": metric_values[0 : len(metric_values) : 5], + } + for metric_name, metric_values in summaries.items() + } + print( + f"\n ----------- Store the following metrics in tests/functional_tests/test_results/jet/{run_name}.json ----------" + ) + print(f"\n {json.dumps(train_metrics)}", flush=True) + + +if __name__ == "__main__": + args = sys.argv[1:] + logs_dir = args[0] # eg /lustre/fsw/joc/shanmugamr/megatron/logs/ + run_name = args[1] + collect_train_test_metrics(logs_dir, run_name) diff --git a/tests/functional_tests/python_test_utils/jet_test_pipeline.py b/tests/functional_tests/python_test_utils/jet_test_pipeline.py new file mode 100644 index 0000000..e84edde --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet_test_pipeline.py @@ -0,0 +1,142 @@ +import argparse +import os +import sys + +from jet.logs.queries import Field, JETLogsQuery +from jet.utils.instance import JETInstance + + +def select_asset(result_obj, prefix): + if result_obj['obj_ci']['s_job_status'] != "skipped": + assets = result_obj.get('nested_assets', None) + if assets is not None: + for asset in assets: + if asset['s_name'].startswith(prefix): + return asset['s_url'] + return 'not found' + + +def query_results(triggering_pipeline_id): + service = JETInstance().log_service() + query = ( + JETLogsQuery() + .filter(Field('obj_ci.obj_upstream.l_pipeline_id') == triggering_pipeline_id) + .filter(Field('obj_workload.s_type') == 'basic') + .select( + 'l_exit_code', + 'nested_assets', + 'obj_workload.s_key', + 'obj_workload.obj_spec', + 'obj_ci', + 'ts_created', + 'obj_status.s_message', + 'obj_ci.l_job_id', + ) + .orderby('ts_created') # increasing (least recent in case of timestamp) + ) + return service.query(query, flatten=False) + + +def dedupe_results(results): + deduped = {} + for result in results: + key = result['obj_workload']['s_key'] + if key not in deduped: + deduped[key] = result + else: + if result['ts_created'] > deduped[key]['ts_created']: + deduped[key] = result + + return deduped.values() + + +def pretty_print_results(results, summary_jobid): + from prettytable import PrettyTable + + exit_codes = [] + log_urls = [] + names = [] + metrics_file_urls = [] + result_message = [] + jet_log_urls = [] + for result in results: + exit_codes.append(result.get('l_exit_code', -1)) + log_urls.append(select_asset(result, 'output_script-0.log')) + names.append(result['obj_workload']['obj_spec']['s_name']) + result_message.append(result['obj_status']['s_message']) + metrics_file_urls.append(select_asset(result, 'results.json')) + jet_log_urls.append( + f"https://gitlab-master.nvidia.com/dl/jet/ci/-/jobs/{result['obj_ci']['l_job_id']}" + ) + + # Results metrics table + metrics_table = PrettyTable() + metrics_table.add_column("Job Key", names, align="l") + metrics_table.add_column("Test Result", result_message) + metrics_table.add_column("JET Log URL", jet_log_urls) + metrics_table.add_column("SLURM Log URL", log_urls) + metrics_table.add_column("Results Data", metrics_file_urls, align="l") + + exit_codes_good = [ec == 0 for ec in exit_codes] + if not (len(exit_codes_good)): + raise Exception("Can't find any jobs, something went wrong.\n" + metrics_table.get_string()) + if not all(exit_codes_good): + raise Exception("Some jobs failed to complete successfully\n" + metrics_table.get_string()) + print(metrics_table) + print("All jobs completed successfully!") + + +def save_scripts(results, save_dir): + if not os.path.exists(save_dir): + os.mkdir(save_dir) + + for result in results: + script = result['obj_workload']['obj_spec']['s_script'] + target_path = result['obj_workload']['obj_spec']['s_name'] + '.sh' + target_path = os.path.join(save_dir, target_path) + + from textwrap import dedent + + if result['obj_workload']['obj_spec']['flat_artifacts']: + dataset_mount = list(result['obj_workload']['obj_spec']['flat_artifacts'].keys())[0] + content = f''' + srun --container-image nvcr.io/nvidia/pytorch:24.01-py3 \\ + --container-mounts "/path/to/data:{dataset_mount},/path/to/megatron-lm:/workspace/megatron-lm" \\ + bash -c''' + content = dedent(content) + content += f' \'\n{script}\n\'' + else: + content = ''' + srun --container-image nvcr.io/nvidia/pytorch:24.01-py3 \\ + --container-mounts "/path/to/megatron-lm:/workspace/megatron-lm" \\ + bash -c''' + content = dedent(content) + content += f' \'\n{script}\n\'' + + with open(target_path, 'w') as script_file: + script_file.write('#!/bin/bash') + script_file.write(content) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + 'pipeline_id', help="Pipeline ID for pipeline in MLM repo that triggers the JET CI" + ) + parser.add_argument( + '--download_scripts_dir', required=False, help="Directory in which to save the job script." + ) + parser.add_argument( + '--artifact_links', + required=False, + help="Enables job script artifact link table. Provide results summary job's ID.", + ) + args = parser.parse_args() + + results = query_results(args.pipeline_id) + results = dedupe_results(results) + + if args.download_scripts_dir: + save_scripts(results, args.download_scripts_dir) + + pretty_print_results(results, args.artifact_links) diff --git a/tests/functional_tests/python_test_utils/multitest_ci_pipeline.py b/tests/functional_tests/python_test_utils/multitest_ci_pipeline.py new file mode 100644 index 0000000..734bf2b --- /dev/null +++ b/tests/functional_tests/python_test_utils/multitest_ci_pipeline.py @@ -0,0 +1,47 @@ +import os +import json +import pytest +import sys +import glob +from .common import read_tb_logs_as_list, TypeOfTest +from .test_ci_pipeline import TestCIPipeline + +LOGS_DIR = os.getenv('LOGS_DIR') +EXPECTED_METRICS_DIR = os.getenv('EXPECTED_METRICS_DIR') + + +class TestBulkCIPipeline(TestCIPipeline): + + margin_loss, margin_time = 0.05, 0.1 + + def _setup(self, config_name): + self.config_name = config_name + baseline_filename = config_name + '.json' + + filepath = os.path.join(EXPECTED_METRICS_DIR, baseline_filename) + if os.path.exists(filepath): + with open(filepath) as f: + self.expected = json.load(f) + else: + raise FileNotFoundError(f"{baseline_filename} does not exist") + + def _get_actual(self, loss_type): + return read_tb_logs_as_list(LOGS_DIR+'/'+self.config_name, loss_type) + + @pytest.mark.parametrize("config_name", os.listdir(LOGS_DIR)) + def test_lm_loss_deterministic(self, config_name): + # Expected training loss curve at different global steps. + self._setup(config_name) + self._test_helper("lm loss", TypeOfTest.DETERMINISTIC) + + @pytest.mark.parametrize("config_name", os.listdir(LOGS_DIR)) + def test_lm_loss_approx(self, config_name): + # Expected training loss curve at different global steps. + self._setup(config_name) + self._test_helper("lm loss", TypeOfTest.APPROX) + + @pytest.mark.parametrize("config_name", os.listdir(LOGS_DIR)) + def test_num_zeros_deterministic(self, config_name): + # Expected validation loss curve at different global steps. + self._setup(config_name) + self._test_helper("num-zeros", TypeOfTest.DETERMINISTIC) diff --git a/tests/functional_tests/python_test_utils/test_ci_pipeline.py b/tests/functional_tests/python_test_utils/test_ci_pipeline.py new file mode 100644 index 0000000..8a1b754 --- /dev/null +++ b/tests/functional_tests/python_test_utils/test_ci_pipeline.py @@ -0,0 +1,97 @@ +import json +import os +from typing import List, Union + +import numpy as np +import pytest + +from .common import ( + ALLOW_NONDETERMINISTIC, + LOGS_DIR, + METRIC_TO_THRESHOLD, + TYPE_OF_TEST_TO_METRIC, + TypeOfTest, + load_expected_data, + read_tb_logs_as_list, +) + + +@pytest.fixture(params=load_expected_data().items()) +def expected_data(request): + return request.param + + +# If we require a variation of tests for any of the other pipelines we can just inherit this class. +class TestCIPipeline: + allow_nondeterministic = ALLOW_NONDETERMINISTIC + + # Replace symbol in namespace to fix function call result for lifetime of + # this class. + + def _test_helper(self, metric_type: str, metric_dict: List[Union[int, float]], test_type): + expected_list = metric_dict['values'] + print(f"The list of expected values: {expected_list} for metric {metric_type}") + + try: + actual_list = read_tb_logs_as_list(LOGS_DIR)[metric_type] + except KeyError as e: + raise KeyError( + f"Required metric {metric_type} not found in TB logs. Please make sure your model exports this metric as its required by the test case/golden values file" + ) from e + + if actual_list is None: + raise ValueError(f"No values of {metric_type} found in TB logs.") + + + actual_list_sliced = actual_list[ + metric_dict["start_step"] : metric_dict["end_step"] : metric_dict["step_interval"] + ] + print(f"The list of actual values: {actual_list_sliced}") + + if metric_type == "iteration-time": + actual_list_sliced = actual_list_sliced[3:] + expected_list = expected_list[3:] + print(f"Removing first items of values for metric_type iteration-time") + + if test_type == TypeOfTest.DETERMINISTIC: + assert np.allclose( + actual_list_sliced, expected_list, rtol=0, atol=0 + ), f"Actual is not equal to Expected for {metric_type}" + elif test_type == TypeOfTest.APPROX: + assert np.allclose( + actual_list_sliced, expected_list, rtol=1e-5, atol=METRIC_TO_THRESHOLD[metric_type] + ), f"Actual is not equal to Expected for {metric_type}" + else: + raise ValueError(f"Unexpected test_type {test_type} provided") + + def test_approx(self, expected_data): + expected_metric, expected_values = expected_data + + if expected_metric in TYPE_OF_TEST_TO_METRIC[TypeOfTest.APPROX]: + self._test_helper(expected_metric, expected_values, TypeOfTest.APPROX) + else: + print(f"Skipping metric {expected_metric} for approximate as it is deterministic only.") + + @pytest.mark.skipif(allow_nondeterministic, reason="Cannot expect exact results") + def test_deterministic(self, expected_data): + expected_metric, expected_values = expected_data + + if expected_metric in TYPE_OF_TEST_TO_METRIC[TypeOfTest.DETERMINISTIC]: + self._test_helper(expected_metric, expected_values, TypeOfTest.DETERMINISTIC) + else: + print(f"Skipping metric {expected_metric} for deterministic as it is approximate only.") + + # # @TODO: This is inactive, do we want to activate it? + # def iteration_timing_node(self): + # expected_iteration_timing_avg = self.expected["train_step_timing_avg"] + # iteration_time = read_tb_logs_as_list(LOGS_DIR)["iteration-time"] + # idx = len(iteration_time) // 3 + # iteration_time_avg = sum(iteration_time[idx:]) / len(iteration_time[idx:]) + # assert ( + # expected_iteration_timing_avg + # == pytest.approx(expected=iteration_time_avg, rel=self.margin_time) + # ), f"The time per global step must be approximately {expected_iteration_timing_avg} but it is {iteration_time_avg}." + +# if deterministic, then also approx +# if not determinstic, then also aprox + diff --git a/tests/functional_tests/python_test_utils/test_fp8_ci_pipeline.py b/tests/functional_tests/python_test_utils/test_fp8_ci_pipeline.py new file mode 100644 index 0000000..46b312e --- /dev/null +++ b/tests/functional_tests/python_test_utils/test_fp8_ci_pipeline.py @@ -0,0 +1,124 @@ +import json +import os + +import numpy as np +import pytest +import scipy.stats as ss +from scipy.integrate import trapezoid + +from .common import TypeOfTest, read_tb_logs_as_list + +LOGS_DIR = os.getenv("LOGS_DIR") +EXPECTED_METRICS_FILE = os.getenv("EXPECTED_METRICS_FILE") + + +# If we require a variation of tests for any of the other pipelines we can just inherit this class. +class TestFP8CIPipeline: + margin_loss, margin_time = 0.2, 0.1 + auc_threshold, correlation_threshold = 0.01, 0.999 + expected = None + + def _setup(self): + if os.path.exists(EXPECTED_METRICS_FILE): + with open(EXPECTED_METRICS_FILE) as f: + self.expected = json.load(f) + if self.expected is None: + raise FileNotFoundError("Expected data is none") + + def _get_actual(self, loss_type): + actual_list = read_tb_logs_as_list(LOGS_DIR)[loss_type] + assert ( + actual_list is not None + ), f"No TensorBoard events file was found in the logs for {loss_type}." + return actual_list + + def _margin_test_helper(self, loss_type): + expected = self.expected[loss_type] + expected_list = np.array(expected["values"]) + actual_list = self._get_actual(loss_type) + actual_list_sliced = np.array( + actual_list[ + expected["start_step"] : expected["end_step"] : expected[ + "step_interval" + ] + ] + ) + + max_diff_index = np.argmax(np.abs(actual_list_sliced - expected_list)) + max_diff = np.abs( + actual_list_sliced[max_diff_index] - expected_list[max_diff_index] + ) + + print( + f"[INFO - margin]: maximum absolute difference for {loss_type} is {max_diff} at index {max_diff_index}, " + f"Actual: {actual_list_sliced[max_diff_index]}, Expected: {expected_list[max_diff_index]}" + ) + assert np.allclose( + actual_list_sliced, expected_list, rtol=1e-5, atol=self.margin_loss + ), f"Actual is not equal to Expected for {loss_type}" + + def _auc_test_helper(self, loss_type): + expected = self.expected[loss_type] + expected_list = np.array(expected["values"]) + actual_list = self._get_actual(loss_type) + actual_list_sliced = np.array( + actual_list[ + expected["start_step"] : expected["end_step"] : expected[ + "step_interval" + ] + ] + ) + + def compute_auc(y_values): + x_values = np.arange(0, len(y_values), 1) + area = trapezoid(y_values, x_values) + return round(area, 5) + + baseline_area = compute_auc(expected_list) + current_area = compute_auc(actual_list_sliced) + diff = abs(baseline_area - current_area) + + print( + f"[INFO - AUC]: AUC diff: {diff * 100 / baseline_area} %, current: {current_area}, baseline: {baseline_area}" + ) + assert (baseline_area <= 0) or (diff <= self.auc_threshold * baseline_area) + + def _correlation_test_helper(self, loss_type): + expected = self.expected[loss_type] + expected_list = np.array(expected["values"]) + actual_list = self._get_actual(loss_type) + actual_list_sliced = np.array( + actual_list[ + expected["start_step"] : expected["end_step"] : expected[ + "step_interval" + ] + ] + ) + corr = ss.pearsonr(actual_list_sliced, expected_list).statistic + + print(f"[INFO - Corr]: Corr: {corr}") + assert corr > self.correlation_threshold + + @pytest.mark.xfail + def test_lm_loss_margin(self): + self._setup() + self._margin_test_helper("lm loss") + + def test_lm_loss_auc(self): + self._setup() + self._auc_test_helper("lm loss") + + @pytest.mark.xfail + def test_lm_loss_correlation(self): + self._setup() + self._correlation_test_helper("lm loss") + + def iteration_timing_node(self): + expected_iteration_timing_avg = self.expected["train_step_timing_avg"] + iteration_time = read_tb_logs_as_list(LOGS_DIR)["iteration-time"] + idx = len(iteration_time) // 3 + iteration_time_avg = sum(iteration_time[idx:]) / len(iteration_time[idx:]) + assert ( + expected_iteration_timing_avg + == pytest.approx(expected=iteration_time_avg, rel=self.margin_time) + ), f"The time per global step must be approximately {expected_iteration_timing_avg} but it is {iteration_time_avg}." diff --git a/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py b/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py new file mode 100644 index 0000000..08caa8a --- /dev/null +++ b/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py @@ -0,0 +1,70 @@ +import os + +os.environ["OPENBLAS_NUM_THREADS"] = "1" +import pytest + +from tests.functional_tests.python_test_utils.common import ( + TypeOfTest, + read_tb_logs_as_list, +) + +LOGS_DIR = os.getenv("LOGS_DIR") +ALLOW_NONDETERMINISTIC = os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO") +STEP_INTERVAL = 5 + + +def collect_train_test_metrics(logs_dir, index): + train_loss_list = read_tb_logs_as_list(logs_dir, index)["lm loss"] + train_loss_list = [round(elem, 3) for elem in train_loss_list] + train_metrics = { + "lm loss": train_loss_list[0 : len(train_loss_list) : STEP_INTERVAL], + } + str_train_metrics = str(train_metrics).replace("'", '"') + print(f"\n ----------- The following are the metrics for ----------") + print(f"\n {str_train_metrics}", flush=True) + return train_metrics + + +class TestCIPipeline: + margin_loss = 0.005 + allow_nondeterministic = bool(int(ALLOW_NONDETERMINISTIC)) + train_metrics_100 = collect_train_test_metrics(LOGS_DIR, 0) + train_metrics_50_to_100 = collect_train_test_metrics(LOGS_DIR, 1) + + def _test_helper(self, loss_type, test_type): + expected = self.train_metrics_100[loss_type] + assert ( + len(expected) == 100 // STEP_INTERVAL + ), f"Train metrics from first run (before checkpoint load) should have {100 // STEP_INTERVAL} elements" + print("expected : " + str(expected)) + actual = self.train_metrics_50_to_100[loss_type] + assert ( + len(actual) == 50 // STEP_INTERVAL + ), f"Train metrics from second run (after checkpoint load) should have {50 // STEP_INTERVAL} elements" + print("actual : " + str(actual)) + start_idx_expected = len(expected) - len(actual) + print("start_idx_expected:", start_idx_expected) + # Here we will just be comparing values of actual and second half (50-100) of expected + for i, (expected_val, actual_val) in enumerate( + zip(expected[start_idx_expected:], actual) + ): + step = start_idx_expected + i * STEP_INTERVAL + if test_type == TypeOfTest.APPROX: + assert ( + actual_val + == pytest.approx(expected=expected_val, rel=self.margin_loss) + ), f"The loss at step {step} should be approximately {expected_val} but it is {actual_val}." + else: + assert ( + actual_val == expected_val + ), f"The value at step {step} should be {expected_val} but it is {actual_val}." + + @pytest.mark.skipif(allow_nondeterministic, reason="Nondeterministic is allowed.") + def test_lm_loss_deterministic(self): + self._test_helper("lm loss", TypeOfTest.DETERMINISTIC) + + @pytest.mark.skipif( + not allow_nondeterministic, reason="Nondeterministic is not allowed." + ) + def test_lm_loss_nondeterministic(self): + self._test_helper("lm loss", TypeOfTest.APPROX) diff --git a/tests/functional_tests/shell_test_utils/_run_local_training.sh b/tests/functional_tests/shell_test_utils/_run_local_training.sh new file mode 100644 index 0000000..d7d5d40 --- /dev/null +++ b/tests/functional_tests/shell_test_utils/_run_local_training.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# This script can be used for model onboarding and testing. + +# For onboarding, it extract scalars from Tensorboard logs only. +# For testing, it compares extracted Tensorboard scalars against +# a set of `GOLDEN_VALUES`. + +set -euxo pipefail + +echo "------ARGUMENTS LIST --------" +for ARGUMENT in "$@"; do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" +done +echo "---------------------------------" + +# Check that mandatory vars are set +MANDATORY_VARS=( + "TRAINING_SCRIPT_PATH" + "TRAINING_PARAMS_PATH" + "OUTPUT_PATH" + "DATA_PATH" +) +for mandatory_var in "${MANDATORY_VARS[@]}"; do + if [[ -z "${!mandatory_var}" ]]; then + echo 'Providing $'$mandatory_var' is mandatory.' + exit 1 + fi +done + +# Envsubst model_params +cat $TRAINING_PARAMS_PATH | envsubst >$TRAINING_PARAMS_PATH.tmp +mv $TRAINING_PARAMS_PATH.tmp $TRAINING_PARAMS_PATH + +# Copy test_config into baseline +mkdir -p ${OUTPUT_PATH} +cp $TRAINING_PARAMS_PATH ${OUTPUT_PATH}/model_config.yaml || true + +# Exit earlier to leave time for properly saving checkpoint +PARAMS="--exit-duration-in-mins $((($SLURM_JOB_END_TIME - $SLURM_JOB_START_TIME) / 60 - 15))" + +# Extract training params +TRAINING_PARAMS_FROM_CONFIG=$(yq '... comments="" | to_entries | .[] | select(.key != "ENV_VARS") | with(select(.value == "true"); .value = "") | [.key + " " + .value] | join("")' $TRAINING_PARAMS_PATH | tr '\n' ' ') +PARAMS="$PARAMS $TRAINING_PARAMS_FROM_CONFIG" + +# Pull env vars to export +ENV_VARS=$(yq '... comments="" | .ENV_VARS | to_entries | .[] | [.key + "=" + .value] | join(" ")' $TRAINING_PARAMS_PATH) +for ARGUMENT in $ENV_VARS; do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" +done + +# Set PYTHONPATH +export PYTHONPATH="$(pwd):${PYTHONPATH:-}" +export WAND_API_KEY="${WAND_API_KEY:-}" + +######## Distributed training settings. ######## +echo "------ARGUMENTS for SLURM ---" +MASTER_ADDR=${MASTER_ADDR:-localhost} +MASTER_PORT=${MASTER_PORT:-6000} +NUM_NODES=${NUM_NODES:-${SLURM_NNODES}} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +NODE_RANK=${SLURM_NODEID:-${SLURM_NODEID}} +DISTRIBUTED_ARGS=( + --nproc_per_node $GPUS_PER_NODE + --nnodes $NUM_NODES + --master_addr $MASTER_ADDR + --master_port $MASTER_PORT + --node_rank $SLURM_NODEID +) + +# Start training +torchrun ${DISTRIBUTED_ARGS[@]} $TRAINING_SCRIPT_PATH $PARAMS + diff --git a/tests/functional_tests/shell_test_utils/restart_jet_log_jobs.sh b/tests/functional_tests/shell_test_utils/restart_jet_log_jobs.sh new file mode 100644 index 0000000..54c7c21 --- /dev/null +++ b/tests/functional_tests/shell_test_utils/restart_jet_log_jobs.sh @@ -0,0 +1,123 @@ +#!/bin/bash + +set -exou pipefail + +collect_jet_jobs () { + PAGE=1 + PER_PAGE=100 + RESULTS="[]" + + while true; do + # Fetch the paginated results + RESPONSE=$(curl \ + -s \ + --globoff \ + --header "PRIVATE-TOKEN: $RW_API_TOKEN" \ + "${ENDPOINT}/pipelines/${JET_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" + ) + # Combine the results + RESULTS=$(jq -s '.[0] + .[1]' <<< "$RESULTS $RESPONSE") + + # Check if there are more pages + if [[ $(jq 'length' <<< "$RESPONSE") -lt $PER_PAGE ]]; then + break + fi + + # Increment the page number + PAGE=$((PAGE + 1)) + done + + echo "$RESULTS" +} + +if [[ $# -ne 1 ]]; then + echo "Usage: $0 " + exit 1 +elif [[ -z "${RW_API_TOKEN}" ]]; then + echo "RW_API_TOKEN empty, get one at https://gitlab-master.nvidia.com/-/user_settings/personal_access_tokens" + exit 1 +fi + +CI_PIPELINE_ID=$1 +CI_PROJECT_ID=${CI_PROJECT_ID:-19378} + +# Fetch Elastic logs +set +x +PIPELINE_JSON=$(curl \ + --fail \ + --silent \ + --header "PRIVATE-TOKEN: ${RW_API_TOKEN}" \ + "https://gitlab-master.nvidia.com/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/bridges?per_page=100" + ) || ret_code=$? +set -x +if [[ ${ret_code:-0} -ne 0 ]]; then + echo CI_PIPELINE_ID=$CI_PIPELINE_ID does not exist + exit 1 +fi + +# Fetch GitLab logs of JET downstream pipeline +DOWNSTREAM_PIPELINE_ID=$(jq '.[0].downstream_pipeline.id' <<< "$PIPELINE_JSON") +set +x +JET_PIPELINE_JSON=$(curl \ + --fail \ + --silent \ + --header "PRIVATE-TOKEN: ${RW_API_TOKEN}" \ + "${ENDPOINT}/pipelines/${DOWNSTREAM_PIPELINE_ID}/bridges?per_page=100" + ) +set -x +JET_PIPELINE_ID=$(jq '.[0].downstream_pipeline.id' <<< "$JET_PIPELINE_JSON") + +set +x +JET_LOGS=$(collect_jet_jobs) +set -x + +LAST_STAGE_TEST_JOBS=$(jq \ + --arg ENDPOINT ${ENDPOINT} '[ + .[] + | select(.name | contains("3 logs_after")) + | select(.name | startswith("build/") | not) + | { + name, + retry_url: ($ENDPOINT + "/jobs/" + (.id | tostring) + "/retry") + } + ] | unique_by(.name)' <<< "$JET_LOGS" +) + +NUM_LAST_STAGE_TEST_JOBS=$(jq length <<< $LAST_STAGE_TEST_JOBS) + +set +x +i=1 +for retry_url in $(jq -r '.[].retry_url' <<< "$LAST_STAGE_TEST_JOBS"); do + RES=$(curl \ + --silent \ + --request POST \ + --header "PRIVATE-TOKEN: $RW_API_TOKEN" \ + "$retry_url" + ) || ret_code=$? + if [[ ${ret_code:-0} -ne 0 ]]; then + echo "Failed to retry $retry_url" + exit 1 + fi + echo "($i / $NUM_LAST_STAGE_TEST_JOBS) Retried $retry_url successfully" + i=$(($i + 1)) +done +set -x + +# Wait until all jobs completed +count_active_jobs () { + JET_LOGS=$(collect_jet_jobs) + + echo $(jq '[.[] | select((.status == "running") or (.status == "pending"))] | length' <<< "$JET_LOGS") +} + +set +x +while true; do + active_jobs=$(count_active_jobs) + echo "Active jobs $active_jobs" + + if [[ "$active_jobs" -eq 0 ]]; then + break + fi + sleep 15 +done +set -x \ No newline at end of file diff --git a/tests/functional_tests/shell_test_utils/run_release_record.sh b/tests/functional_tests/shell_test_utils/run_release_record.sh new file mode 100644 index 0000000..e55bd78 --- /dev/null +++ b/tests/functional_tests/shell_test_utils/run_release_record.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +set -ux + +####################################################################################### +# +# Script for capturing a reference model. +# +# It will train a model until a target iteration was hit. +# +# +######################################################################################## + +######################################################################################## +# Please adjust to your needs: +######################################################################################## + +OVERRIDE_GOLDEN_VALUES=true +MODEL="" +MCORE_RELEASE_NUM="" +DATA_PATH="" +TRAINING_SCRIPT_PATH=".py" +TRAINING_PARAMS_PATH="./tests/functional_tests/model_configs/$MODEL/.yaml" +TEST_PARAMS_PATH="./tests/functional_tests/test_configs/$MODEL/" +OUTPUT_PATH="/mcore-v$MCORE_RELEASE_NUM/$MODEL" +IMAGE_TAG="<...>" +NODES="<...>" +PPP="<...>" +PARTITION="<...>" +ITERATIONS="<...>" +GITLAB_TOKEN="my-super-duper-token" # Do not track in VCS +WAND_API_KEY="my-super-duper-key" # Do not track in VCS + +######################################################################################## +# Dont change below +######################################################################################## + +# Container settings +IMAGE="gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:$IMAGE_TAG" +MOUNTS="${DATA_PATH}:${DATA_PATH},${OUTPUT_PATH}:${OUTPUT_PATH}" +ARGUMENTS=( + "TRAINING_SCRIPT_PATH=${TRAINING_SCRIPT_PATH}" + "TRAINING_PARAMS_PATH=${TRAINING_PARAMS_PATH}" + "DATA_PATH=${DATA_PATH}" + "OUTPUT_PATH=${OUTPUT_PATH}" + "WAND_API_KEY=${WAND_API_KEY}" +) +SLURM_LOGS=$OUTPUT_PATH/slurm_logs/ +mkdir -p $SLURM_LOGS + +while : +do +ACTUAL_ITERATIONS=$(cat "$OUTPUT_PATH/checkpoints/latest_checkpointed_iteration.txt" || 0) +if [[ $ACTUAL_ITERATIONS -gt $ITERATIONS ]]; then + break +fi + +# Fire of sbatch +sbatch -W < "$SLURM_LOGS/\${SLURM_JOB_ID}.log" + +srun \ + --ntasks-per-node=1 \ + --container-image=${IMAGE} \ + --container-mounts=${MOUNTS} \ + --container-workdir=/workspace/megatron-lm \ + bash ./tests/functional_tests/shell_test_utils/_run_local_training.sh ${ARGUMENTS[@]} >>"$SLURM_LOGS/\${SLURM_JOB_ID}.log" 2>&1 +EOF + +done + +# Generate golden values +# This code will be added later +# export PYTHONPATH=$(pwd) +# export NVTE_ALLOW_NONDETERMINISTIC_ALGO=1 +# LOG_INTERVAL=$(cat $TRAINING_PARAMS_PATH | yq '."--log-interval" // 1') +# GOLDEN_VALUES=$(python ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py \ +# --logs-dir $OUTPUT_PATH/tensorboard \ +# --run-name "$MODEL") +# echo "$GOLDEN_VALUES" > "$OUTPUT/$MODEL.json" + +# # Write golden values into repo if this run should become a reference +# if [[ $OVERRIDE_GOLDEN_VALUES == true ]]; then +# echo "$GOLDEN_VALUES" > tests/functional_tests/test_results/release-$MCORE_RELEASE_NUM-$$MODEL.json +# fi + +# Finally upload everything to JET +jet artifacts registry add \ + --token $GITLAB_TOKEN \ + --source-path $OUTPUT_PATH \ + "unverified/model/mcore-$MCORE_RELEASE_NUM/$MODEL" diff --git a/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json new file mode 100644 index 0000000..25faec6 --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49405, 10.48276, 10.49249, 10.47813, 10.46623, 10.35183, 10.17697, 10.07728, 9.8875, 9.68029]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2018.0, 2636.0, 2067.0, 2225.0, 2555.0, 2554.0, 2969.0, 2935.0, 2967.0, 2287.0]}, "iteration_timing_avg": 0.5847132352941178} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json new file mode 100644 index 0000000..65fbb4d --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4681, 10.45734, 10.4491, 10.44102, 10.41779, 10.34626, 10.11378, 10.04382, 9.86692, 9.67893]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2373.0, 2593.0, 2187.0, 2403.0, 2412.0, 2617.0, 3083.0, 3341.0, 3558.0, 3213.0]}, "iteration_timing_avg": 0.8346488235294117} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json new file mode 100644 index 0000000..423d346 --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.42107, 10.42897, 10.43577, 10.40787, 10.38455, 10.32433, 10.13158, 10.04316, 9.86274, 9.65777]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2229.0, 3600.0, 3300.0, 3311.0, 3522.0, 3498.0, 4076.0, 4135.0, 4709.0, 4350.0]}, "iteration_timing_avg": 1.8964105882352944} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2.json b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2.json new file mode 100644 index 0000000..05d590e --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50096, 10.48594, 10.4936, 10.48501, 10.50417, 10.4773, 10.42153, 10.29719, 10.15831, 9.9675]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [18201.0, 19789.0, 21743.0, 18735.0, 21941.0, 19700.0, 21781.0]}, "iteration_timing_avg": 0.4730702941176471} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1.json b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1.json new file mode 100644 index 0000000..8b1d0bc --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.48681, 10.49275, 10.48836, 10.51349, 10.49399, 10.47549, 10.41922, 10.28044, 10.14255, 9.94736]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [26212.0, 19433.0, 24101.0, 23509.0, 21539.0, 17889.0, 19123.0]}, "iteration_timing_avg": 1.6886158823529411} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G.json new file mode 100644 index 0000000..474cdd8 --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49574, 10.48174, 10.4804, 10.45344, 10.44389, 10.35605, 10.13777, 10.04004, 9.86833, 9.67303]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2182.0, 2462.0, 2158.0, 2112.0, 2398.0, 2539.0, 2945.0, 3162.0, 3457.0, 3125.0]}, "iteration_timing_avg": 0.8110379411764704} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G.json new file mode 100644 index 0000000..7e68039 --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G.json @@ -0,0 +1,70 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.49566, + 10.48166, + 10.48045, + 10.45348, + 10.44393, + 10.35605, + 10.13787, + 10.04034, + 9.86836, + 9.6732 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 2183.0, + 2469.0, + 2115.0, + 2126.0, + 2322.0, + 2411.0, + 2892.0, + 3234.0, + 3637.0, + 2992.0 + ] + }, + "mem-allocated-bytes": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1718216192.0, + 1718216192.0, + 1718216192.0, + 1718216192.0, + 1718216192.0, + 1718216192.0, + 1718216192.0, + 1718216192.0, + 1718216192.0, + 1718216192.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 13.22827, + 0.88854, + 0.92588, + 0.89793, + 0.95437, + 0.88007, + 0.88504, + 0.88703, + 0.89866, + 0.88756 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert_mr_resume_tp1_pp2dgx_a100_1N8G_.json b/tests/functional_tests/test_results/jet/bert_mr_resume_tp1_pp2dgx_a100_1N8G_.json new file mode 100644 index 0000000..ce251b0 --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_mr_resume_tp1_pp2dgx_a100_1N8G_.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.50685, 10.49816, 10.47982, 10.48566, 10.49533, 10.46662, 10.42395, 10.30693, 10.15979, 9.96957, 9.87618, 9.75265, 9.63628, 9.54659, 9.49973, 9.35968, 9.33181, 9.2626, 9.26439, 9.21492]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [18772.0, 19035.0, 22350.0, 18671.0, 20738.0, 23121.0, 22655.0, 27141.0, 24304.0, 25619.0, 17322.0, 32489.0, 28409.0, 21067.0, 37615.0, 30599.0, 26145.0]}, "iteration_timing_avg": 0.3927519402985073} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G.json new file mode 100644 index 0000000..85940e2 --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54308, 10.53881, 10.55633, 10.53805, 10.52589, 10.49569, 10.4596, 10.32846, 10.17265, 9.96951]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22584.0, 20590.0, 27627.0, 22759.0, 22567.0, 20671.0, 23229.0]}, "iteration_timing_avg": 0.7692817647058824} diff --git a/tests/functional_tests/test_results/jet/bert_mr_tp2_pp2_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/bert_mr_tp2_pp2_dgx_a100_1N8G.json new file mode 100644 index 0000000..5e5b762 --- /dev/null +++ b/tests/functional_tests/test_results/jet/bert_mr_tp2_pp2_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.43755, 10.43587, 10.44704, 10.44395, 10.44965, 10.44295, 10.32757, 10.23341, 10.09049, 9.93294]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27979.0, 20991.0, 29735.0, 24779.0, 26808.0, 33075.0, 24387.0]}, "iteration_timing_avg": 0.7523635294117648} diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json new file mode 100644 index 0000000..3bbdd74 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.0958791176470588} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather.json new file mode 100644 index 0000000..153f5b0 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312, 9.8347, 9.61264, 9.67965, 9.68133, 9.60021, 9.06887, 9.46573, 9.06116, 9.32103, 9.51104]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0, 2686.0, 2671.0, 3014.0, 3152.0, 2960.0, 3015.0, 3735.0, 2675.0, 2947.0, 3414.0]}, "iteration_timing_avg": 0.08244119402985074} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json new file mode 100644 index 0000000..8ade75c --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1566.0, 1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0]}, "iteration_timing_avg": 0.11905411764705882} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist.json new file mode 100644 index 0000000..fa1ca53 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584, 9.83013, 9.60653, 9.67621, 9.68788, 9.59862, 9.07653, 9.47156, 9.06787, 9.32985, 9.51568]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1566.0, 1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0, 2472.0, 2970.0, 3076.0, 3074.0, 3018.0, 2972.0, 3783.0, 2794.0, 2743.0, 3289.0]}, "iteration_timing_avg": 0.12010238805970147} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json new file mode 100644 index 0000000..43fa279 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0]}, "iteration_timing_avg": 0.1541691176470588} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist.json new file mode 100644 index 0000000..2d211e0 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153, 9.83685, 9.60745, 9.68285, 9.6869, 9.60677, 9.07989, 9.47324, 9.07018, 9.33019, 9.51809]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0, 2540.0, 2588.0, 3110.0, 3059.0, 2924.0, 2894.0, 3694.0, 2720.0, 2635.0, 3456.0]}, "iteration_timing_avg": 0.150555671641791} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts.json new file mode 100644 index 0000000..7878654 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.79589, 10.84021, 10.81376, 10.76508, 10.65703, 10.56193, 10.08837, 10.21303, 10.11641, 9.83404, 9.85697, 9.65534, 9.71837, 9.74563, 9.63824, 9.13952, 9.51114, 9.10678, 9.3932, 9.56085]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3018.0, 3528.0, 3496.0, 3388.0, 3149.0, 3337.0, 2811.0, 3403.0, 3728.0, 3648.0, 4218.0, 4359.0, 4468.0, 5080.0, 4575.0, 4964.0, 5755.0, 4852.0, 4092.0, 5592.0]}, "iteration_timing_avg": 0.33336671641791044} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_4experts2parallel.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_4experts2parallel.json new file mode 100644 index 0000000..b07f042 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_4experts2parallel.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81916, 10.86702, 10.85724, 10.80665, 10.71115, 10.63679, 10.16197, 10.277, 10.18384, 9.88281, 9.89125, 9.67734, 9.74917, 9.75758, 9.65591, 9.15592, 9.52069, 9.11526, 9.4051, 9.56814]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [7138.0, 8525.0, 8821.0, 8718.0, 7682.0, 8227.0, 7158.0, 8514.0, 9143.0, 9624.0, 9298.0, 10386.0, 10352.0, 12164.0, 10941.0, 12318.0, 13902.0, 11709.0, 10898.0, 12956.0]}, "iteration_timing_avg": 0.33394373134328353} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts.json new file mode 100644 index 0000000..1c130d9 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79589, 10.84021, 10.81376, 10.76508, 10.65703, 10.56193, 10.08837, 10.21303, 10.11641, 9.83404]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3018.0, 3528.0, 3496.0, 3388.0, 3149.0, 3337.0, 2811.0, 3403.0, 3728.0, 3648.0]}, "iteration_timing_avg": 0.33478764705882363} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_4experts2parallel.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_4experts2parallel.json new file mode 100644 index 0000000..ecb096e --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_4experts2parallel.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81916, 10.86661, 10.85683, 10.80678, 10.7112, 10.63712, 10.16253, 10.27882, 10.18795, 9.88907]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [12923.0, 15794.0, 16416.0, 15771.0, 14114.0, 15096.0, 12918.0, 15842.0, 16657.0, 17467.0]}, "iteration_timing_avg": 0.340485} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json new file mode 100644 index 0000000..d939d54 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87769, 10.83107, 10.71638, 10.60596, 10.13128, 10.22754, 10.15911, 9.83462]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2155.0, 2021.0, 1955.0, 1968.0, 1742.0, 2198.0, 2491.0, 2606.0]}, "iteration_timing_avg": 0.27329441176470587} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json new file mode 100644 index 0000000..2f9d91c --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86126, 10.88643, 10.87769, 10.83107, 10.71638, 10.60596, 10.13128, 10.22754, 10.15911, 9.83462, 9.85168, 9.62946, 9.69489, 9.71388, 9.61776, 9.09854, 9.48539, 9.07183, 9.3531, 9.52651]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1752.0, 2067.0, 2155.0, 2021.0, 1955.0, 1968.0, 1742.0, 2198.0, 2491.0, 2606.0, 2850.0, 2977.0, 3220.0, 3391.0, 3297.0, 3203.0, 4083.0, 3048.0, 2939.0, 3838.0]}, "iteration_timing_avg": 0.27828194029850745} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist.json new file mode 100644 index 0000000..46cdac4 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86126, 10.88643, 10.87769, 10.83107, 10.71638, 10.60596, 10.13128, 10.22754, 10.15911, 9.83462, 9.85168, 9.62946, 9.69489, 9.71388, 9.61776, 9.09854, 9.48539, 9.07183, 9.3531, 9.52651]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1752.0, 2067.0, 2155.0, 2021.0, 1955.0, 1968.0, 1742.0, 2198.0, 2491.0, 2606.0, 2850.0, 2977.0, 3220.0, 3391.0, 3297.0, 3203.0, 4083.0, 3048.0, 2939.0, 3838.0]}, "iteration_timing_avg": 0.2851294029850746} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce.json new file mode 100644 index 0000000..69ca350 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.8893, 10.84864, 10.6962, 10.63918, 10.5393, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1298.0, 1352.0, 1590.0, 1403.0, 1435.0, 1266.0, 1195.0]}, "iteration_timing_avg": 0.07655911764705883} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce.json new file mode 100644 index 0000000..96b8036 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.88931, 10.84864, 10.6962, 10.63918, 10.53931, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1131.0, 1173.0, 1218.0, 1783.0, 1278.0, 1244.0, 1555.0]}, "iteration_timing_avg": 0.07975499999999999} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json new file mode 100644 index 0000000..6c6d8e7 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0]}, "iteration_timing_avg": 0.10581941176470588} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch.json new file mode 100644 index 0000000..d4a5cfb --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831, 10.20828, 9.96658, 9.97022, 9.92437, 9.79137, 9.26612, 9.61914, 9.19057, 9.46177, 9.62185]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0, 2732.0, 2678.0, 2452.0, 2879.0, 2572.0, 3456.0, 3237.0, 2990.0, 3067.0, 3173.0]}, "iteration_timing_avg": 0.10533134328358208} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json new file mode 100644 index 0000000..0f5ad40 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.1367805882352941} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce.json new file mode 100644 index 0000000..b9816fb --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.13371323529411766} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch.json new file mode 100644 index 0000000..4cf16ef --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087, 10.19557, 9.94382, 9.95175, 9.90538, 9.79357, 9.25904, 9.61568, 9.19187, 9.46047, 9.6229]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0, 3566.0, 3139.0, 3236.0, 3208.0, 3413.0, 3913.0, 3194.0, 3581.0, 3625.0, 4695.0]}, "iteration_timing_avg": 0.1320626865671642} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce.json new file mode 100644 index 0000000..302a152 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1333435294117647} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts.json new file mode 100644 index 0000000..114dfb1 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0]}, "iteration_timing_avg": 0.3671870588235294} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce.json new file mode 100644 index 0000000..b807a2e --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.1660379411764706} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts.json new file mode 100644 index 0000000..546ccfc --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186, 10.24338, 10.02058, 10.03017, 9.99471, 9.84885, 9.34867, 9.67263, 9.2457, 9.53365, 9.67548]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0, 7960.0, 7684.0, 9743.0, 8727.0, 9382.0, 10992.0, 11177.0, 11270.0, 13404.0, 11533.0]}, "iteration_timing_avg": 0.3735462686567164} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce.json new file mode 100644 index 0000000..c0a53bd --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708, 10.19741, 9.9562, 9.96369, 9.91398, 9.79604, 9.2686, 9.61975, 9.19501, 9.47332, 9.62216]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0, 3656.0, 3275.0, 3203.0, 3297.0, 3364.0, 3789.0, 3277.0, 3660.0, 3733.0, 4815.0]}, "iteration_timing_avg": 0.1628459701492537} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json new file mode 100644 index 0000000..18457f2 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23144205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce.json new file mode 100644 index 0000000..7b39f86 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23131970588235293} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch.json b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch.json new file mode 100644 index 0000000..47198f9 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525, 10.21403, 9.9801, 9.96977, 9.93973, 9.81158, 9.28667, 9.63194, 9.19732, 9.48341, 9.62985]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0, 3451.0, 3205.0, 2940.0, 3143.0, 3310.0, 3884.0, 3232.0, 3491.0, 3751.0, 5022.0]}, "iteration_timing_avg": 0.22914074626865674} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_dgx_a100_1N8G.json new file mode 100644 index 0000000..87e9341 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.06904588235294119} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G.json new file mode 100644 index 0000000..87e9341 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.06904588235294119} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G.json new file mode 100644 index 0000000..94554bb --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85023, 10.79645, 10.68149, 10.60617, 10.1277, 10.22183, 10.13794, 9.8231]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1923.0, 1922.0, 2020.0, 1815.0, 1713.0, 1963.0, 2266.0, 2324.0]}, "iteration_timing_avg": 0.09164500000000002} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G.json new file mode 100644 index 0000000..2778958 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8468, 10.87772, 10.90302, 10.82024, 10.67979, 10.60157, 10.06448, 10.19311, 10.1141, 9.76008]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 2086.0, 2030.0, 2000.0, 1910.0, 1894.0, 1744.0, 2071.0, 2344.0, 2377.0]}, "iteration_timing_avg": 0.11051617647058823} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G.json new file mode 100644 index 0000000..33a65cc --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84474, 10.87687, 10.90254, 10.81872, 10.67848, 10.60075, 10.06363, 10.19268, 10.11342, 9.75986]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1776.0, 2161.0, 2052.0, 1892.0, 1971.0, 1946.0, 1701.0, 1985.0, 2295.0, 2293.0]}, "iteration_timing_avg": 0.11052176470588236} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G.json new file mode 100644 index 0000000..cdabc8e --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79205, 10.86789, 10.89149, 10.78328, 10.66126, 10.58275, 10.08467, 10.19448, 10.13785, 9.81454]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1580.0, 1778.0, 1849.0, 1841.0, 1884.0, 1679.0, 1544.0, 1953.0, 2449.0, 2335.0]}, "iteration_timing_avg": 0.12243558823529416} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G.json new file mode 100644 index 0000000..6123f3c --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0]}, "iteration_timing_avg": 0.12348235294117646} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G.json new file mode 100644 index 0000000..0252095 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.74049, 10.81937, 10.84178, 10.75551, 10.69818, 10.63091, 10.20265, 10.36288, 10.25632, 9.94256]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2527.0, 2937.0, 2975.0, 2749.0, 2580.0, 2593.0, 2320.0, 2616.0, 2541.0, 2393.0]}, "iteration_timing_avg": 0.12725500000000006} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G.json new file mode 100644 index 0000000..2039e2f --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.90105, 10.91104, 10.91635, 10.84822, 10.70727, 10.63018, 10.15241, 10.26052, 10.15994, 9.83162]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727086.0, 23021732.0, 22500940.0, 22830674.0, 22739332.0, 22547236.0, 22955516.0, 22590012.0, 22659588.0, 22884630.0]}, "iteration_timing_avg": 0.1246464705882353} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G.json new file mode 100644 index 0000000..939863d --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G.json new file mode 100644 index 0000000..460f463 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87448, 10.87794, 10.79507, 10.68154, 10.59412, 10.09987, 10.20952, 10.13639, 9.80012]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1734.0, 1884.0, 1684.0, 1815.0, 1766.0, 1601.0, 1904.0, 2361.0, 2347.0]}, "iteration_timing_avg": 0.12273676470588235} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G.json new file mode 100644 index 0000000..939863d --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json new file mode 100644 index 0000000..2d807f5 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12168999999999999} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json new file mode 100644 index 0000000..f23c85a --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12873676470588236} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G.json new file mode 100644 index 0000000..64f030d --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.9359, 10.93547, 10.94238, 10.88073, 10.75653, 10.66332, 10.1672, 10.27241, 10.19577, 9.86006]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727686.0, 23020980.0, 22501260.0, 22830024.0, 22739772.0, 22548148.0, 22955712.0, 22589816.0, 22660000.0, 22884332.0]}, "iteration_timing_avg": 0.12799705882352944} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G.json new file mode 100644 index 0000000..b87c0bc --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.88759, 10.90846, 10.88099, 10.84518, 10.69285, 10.6019, 10.09544, 10.18239, 10.08764, 9.76749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [578.0, 659.0, 683.0, 700.0, 697.0, 620.0, 572.0, 774.0, 807.0, 837.0]}, "iteration_timing_avg": 0.3462723529411765} diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G.json new file mode 100644 index 0000000..50f16e7 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79756, 10.86506, 10.87184, 10.80703, 10.71158, 10.63915, 10.1929, 10.30937, 10.21969, 9.91592]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31137.0, 37021.0, 37806.0, 36157.0, 33974.0, 34873.0, 30957.0, 35062.0, 36419.0, 37713.0]}, "iteration_timing_avg": 0.35529294117647064} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G.json new file mode 100644 index 0000000..cd90f50 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79756, 10.86462, 10.87239, 10.80678, 10.7118, 10.63911, 10.19319, 10.30944, 10.21988, 9.91603]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31137.0, 37033.0, 37783.0, 36040.0, 33452.0, 34761.0, 30933.0, 35487.0, 36392.0, 37655.0]}, "iteration_timing_avg": 0.3566726470588235} diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G.json new file mode 100644 index 0000000..f2d7111 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80355, 10.86508, 10.86367, 10.80237, 10.71665, 10.6452, 10.21186, 10.32279, 10.22474, 9.93034]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31271.0, 37782.0, 38080.0, 36072.0, 33389.0, 34302.0, 30262.0, 35071.0, 36081.0, 36818.0]}, "iteration_timing_avg": 0.2153429411764706} diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G.json new file mode 100644 index 0000000..01e0884 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80355, 10.86508, 10.86435, 10.80239, 10.7159, 10.6454, 10.21181, 10.32236, 10.22471, 9.92956]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31271.0, 37782.0, 38276.0, 36278.0, 32946.0, 34291.0, 30145.0, 35217.0, 36060.0, 37032.0]}, "iteration_timing_avg": 0.21900323529411767} diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G.json new file mode 100644 index 0000000..1c3ceb0 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80324, 10.86861, 10.87486, 10.7986, 10.66452, 10.58021, 10.05487, 10.18533, 10.097, 9.75749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [26144.0, 31960.0, 32510.0, 31451.0, 28954.0, 30872.0, 29506.0, 33312.0, 34558.0, 36855.0]}, "iteration_timing_avg": 0.28211852941176474} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G.json new file mode 100644 index 0000000..4c8008e --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.93292, 10.93657, 10.88788, 10.86131, 10.71505, 10.61066, 10.06697, 10.17616, 10.07539, 9.74965]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [607.0, 638.0, 643.0, 649.0, 648.0, 590.0, 548.0, 772.0, 834.0, 836.0]}, "iteration_timing_avg": 0.3993126470588235} diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G.json new file mode 100644 index 0000000..98ff45e --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93627, 10.89332, 10.87322, 10.74871, 10.65375, 10.15756, 10.24634, 10.15177, 9.83799]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 1885.0, 1986.0, 1760.0, 1773.0, 1859.0, 1598.0, 1965.0, 2199.0, 2316.0]}, "iteration_timing_avg": 0.20321264705882353} diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G.json new file mode 100644 index 0000000..265ad7c --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G.json new file mode 100644 index 0000000..517c935 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93628, 10.89335, 10.87322, 10.7487, 10.65379, 10.15754, 10.2464, 10.15175, 9.83801]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [68.0, 64.0, 61.0, 58.0, 55.0, 85.0, 77.0, 68.0, 78.0, 63.0]}} diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G.json new file mode 100644 index 0000000..265ad7c --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G.json new file mode 100644 index 0000000..265ad7c --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G.json new file mode 100644 index 0000000..265ad7c --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json new file mode 100644 index 0000000..196e4b2 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.2256223529411765} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json new file mode 100644 index 0000000..49917fe --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.22043823529411763} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G.json new file mode 100644 index 0000000..8718207 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86217, 10.88641, 10.8786, 10.83291, 10.72031, 10.6109, 10.1418, 10.23434, 10.16605, 9.84445]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1769.0, 2019.0, 2145.0, 2058.0, 2166.0, 2060.0, 1776.0, 2174.0, 2524.0, 2645.0]}, "iteration_timing_avg": 0.2256223529411765} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_resume_dgx_a100_1N8G_tp1_pp2.json b/tests/functional_tests/test_results/jet/gpt3_mr_resume_dgx_a100_1N8G_tp1_pp2.json new file mode 100644 index 0000000..624cd82 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_resume_dgx_a100_1N8G_tp1_pp2.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84008, 10.89053, 10.90905, 10.87934, 10.86562, 10.83752, 10.64582, 10.62396, 10.53554, 10.25187, 10.20873, 9.96714, 9.96605, 9.92367, 9.79179, 9.26742, 9.61926, 9.18974, 9.46019, 9.62277]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2078.0, 2328.0, 2420.0, 2256.0, 2180.0, 2078.0, 2313.0, 2933.0, 2712.0, 2270.0, 2872.0, 3003.0, 3555.0, 3066.0, 3103.0, 3098.0, 3762.0]}, "iteration_timing_avg": 0.13093716417910448} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G.json new file mode 100644 index 0000000..5c516f0 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86208, 10.89137, 10.86731, 10.81652, 10.70126, 10.60816, 10.11007, 10.21889, 10.1294, 9.80326]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1659.0, 1944.0, 1974.0, 1920.0, 1918.0, 1855.0, 1621.0, 2018.0, 2436.0, 2304.0]}, "iteration_timing_avg": 0.14203264705882354} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G.json new file mode 100644 index 0000000..68d9fe8 --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1211408823529412} diff --git a/tests/functional_tests/test_results/jet/gpt3_mr_tp2_pp2_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/gpt3_mr_tp2_pp2_dgx_a100_1N8G.json new file mode 100644 index 0000000..87df9ed --- /dev/null +++ b/tests/functional_tests/test_results/jet/gpt3_mr_tp2_pp2_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.14292588235294112} diff --git a/tests/functional_tests/test_results/jet/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G.json new file mode 100644 index 0000000..6478081 --- /dev/null +++ b/tests/functional_tests/test_results/jet/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.14052, 9.14041, 9.13223, 9.12307, 9.07696, 9.06413, 9.00897, 8.96969, 8.93509, 8.85701]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2557220.0, 2644506.0, 2554848.0, 2479331.0, 2739591.0, 2557907.0, 2491851.0, 2537345.0, 2513770.0, 2645270.0]}, "iteration_timing_avg": 0.21943264705882357} \ No newline at end of file diff --git a/tests/functional_tests/test_results/jet/t5_220m_mr_mcore_te_tp1_pp1_vp1_calculate_per_token_loss_dgx_a100_1N8G.json b/tests/functional_tests/test_results/jet/t5_220m_mr_mcore_te_tp1_pp1_vp1_calculate_per_token_loss_dgx_a100_1N8G.json new file mode 100644 index 0000000..7d87869 --- /dev/null +++ b/tests/functional_tests/test_results/jet/t5_220m_mr_mcore_te_tp1_pp1_vp1_calculate_per_token_loss_dgx_a100_1N8G.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.33692, 9.42684, 8.86347, 8.56218, 8.28402, 8.10585, 7.84893, 7.53544, 7.41091, 7.29556, 7.39322, 7.21918, 7.103, 7.04859, 6.90381, 6.96025, 6.96467, 7.03545, 6.70046, 6.96655]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43335.0, 41016.0, 44013.0, 41737.0, 44813.0, 43943.0, 41248.0, 42538.0, 44705.0, 43912.0, 41141.0, 43279.0, 39762.0, 45412.0, 43319.0, 43922.0, 45387.0, 45708.0, 46322.0, 44694.0]}, "iteration_timing_avg": 0.17640776119402987} diff --git a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh new file mode 100644 index 0000000..54090ae --- /dev/null +++ b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh @@ -0,0 +1,139 @@ +#! /bin/bash +echo "------ARGUMENTS LIST --------" +for ARGUMENT in "$@"; do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" +done +echo "---------------------------------" + +set -exo pipefail +if [[ -z $MBS ]]; then MBS=4; fi +if [[ -z $GBS ]]; then GBS=128; fi +if [[ -z $VOCAB_FILE ]]; then VOCAB_FILE="/workspace/data/bert_data/vocab.txt"; fi +if [[ -z $ALLOW_NONDETERMINISTIC ]]; then ALLOW_NONDETERMINISTIC=0; fi + +# Change for multinode config +GPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE * $NUM_NODES)) +command="export CUDA_DEVICE_MAX_CONNECTIONS=1;" + +TRAINING_DTYPE=fp16 +TRANSFORMER_IMPL=local + +if [[ $ALLOW_NONDETERMINISTIC -eq 1 ]]; then + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=1;" +else + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0; export NCCL_ALGO=^NVLS;" + ADDITIONAL_PARAMS+=" --deterministic-mode" +fi + +USE_LEGACY=1 +if [[ $USE_CORE -eq 1 ]]; then + echo "Running using megatron core" + TRANSFORMER_IMPL=local + TRAINING_DTYPE=bf16 + unset USE_LEGACY +fi +if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running checkpoint resume test..." + __SAVE_INTERVAL=50 + ADDITIONAL_PARAMS+=" --use-checkpoint-args --use-checkpoint-opt_param-scheduler" + if [[ $MAX_STEPS -ne 100 ]]; then + echo "Overriding MAX_STEPS=100" + MAX_STEPS=100 + fi +else + __SAVE_INTERVAL=10000 # inf +fi +# Runs the "345M" parameter model +DISTRIBUTED_ARGS="--max-restarts 3 --nproc_per_node $GPUS_PER_NODE --nnodes $NUM_NODES" + +torch_run_cmd="torchrun $DISTRIBUTED_ARGS \ + pretrain_bert.py \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --log-validation-ppl-to-tensorboard \ + --log-timers-to-tensorboard \ + --log-memory-to-tensorboard \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --micro-batch-size ${MBS:-4} \ + --global-batch-size ${GBS:-128} \ + --seq-length 512 \ + --max-position-embeddings 512 \ + --train-iters $MAX_STEPS \ + --timing-log-level 2 \ + --lr-decay-iters 990000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --vocab-file $VOCAB_FILE \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.0001 \ + --min-lr 0.00001 \ + --lr-warmup-fraction 0.01 \ + --log-interval 1 \ + --save-interval $__SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 10 \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ + ${USE_LEGACY:+--use-legacy-models} \ + ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \ + --no-gradient-accumulation-fusion \ + ${DATA_CACHE:+--data-cache-path "$DATA_CACHE"} \ + --${TRAINING_DTYPE}" + +if [[ "${TRAINING_DTYPE}" == "fp16" ]]; then + # Both NVTE_APPLY_QK_LAYER_SCALING and --apply-query-key-layer-scaling must be passed + # to enable feature and be backward compatible with TE<0.11 + export NVTE_APPLY_QK_LAYER_SCALING=1 + torch_run_cmd+=" --apply-query-key-layer-scaling" + # NVTE_APPLY_QK_LAYER_SCALING=1 is required if using: + # 1. --apply-query-key-layer-scaling + # 2. transformer_impl="transformer_engine" + # 3. TE >= 0.11 + # 4. fp16 + export NVTE_APPLY_QK_LAYER_SCALING=1 +fi + +command="$command $torch_run_cmd" +if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + command="$command; rm -rf $CHECKPOINT_PATH/iter_0000100; echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt; $torch_run_cmd" +fi +echo "-------------------- THE FINAL PRETRAIN SCRIPT COMMAND THAT WILL BE RUN ------------" +echo "$command" +echo "-----------------------------------------------------------------------------" + +echo "$command" >$SCRIPTS_DIR/pretrain_bert_distributed_command.sh +eval $command + +echo "Saving test results to $TENSORBOARD_DIR" +PYTHONPATH=$PWD python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | + tee ${TENSORBOARD_DIR}/results.json + +if [[ $SKIP_PYTEST != 1 ]]; then + echo "-----------------------------------------------------------------------------" + if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running pytest 1st vs 2nd run comparison" + export LOGS_DIR=$TENSORBOARD_DIR + pytest -s ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py + else + echo "Running pytest checks against golden values" + export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json" + export LOGS_DIR=$TENSORBOARD_DIR + pytest -s ./tests/functional_tests/python_test_utils/test_ci_pipeline.py + fi +fi diff --git a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh new file mode 100644 index 0000000..25976d2 --- /dev/null +++ b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh @@ -0,0 +1,206 @@ +#! /bin/bash +echo "------ARGUMENTS LIST --------" +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" +done +echo "---------------------------------" + +set -exo pipefail +if [[ -z $MBS ]]; then MBS=4; fi +if [[ -z $GBS ]]; then GBS=32; fi +if [[ -z $MOE_GROUPED_GEMM ]]; then MOE_GROUPED_GEMM=0; fi +if [[ -z $ALLOW_NONDETERMINISTIC ]]; then ALLOW_NONDETERMINISTIC=0; fi +if [[ -z $VOCAB_FILE ]]; then VOCAB_FILE="/workspace/data/gpt3_data/vocab.json" ; fi +if [[ -z $MERGE_FILE ]]; then MERGE_FILE="/workspace/data/gpt3_data/merges.txt" ; fi + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) + +command="export CUDA_DEVICE_MAX_CONNECTIONS=1;" + +TRAINING_DTYPE=fp16 +TRANSFORMER_IMPL=local + +if [[ $ALLOW_NONDETERMINISTIC -eq 1 ]]; then + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=1;" +else + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0; export NCCL_ALGO=Tree;" + ADDITIONAL_PARAMS+=" --deterministic-mode" +fi + +if [[ $USE_GA -eq 0 ]]; then + ADDITIONAL_PARAMS+=" --no-gradient-accumulation-fusion" +fi + +USE_LEGACY=1 +if [[ $USE_CORE -eq 1 ]]; then + echo "Running using megatron core" + unset USE_LEGACY +fi + +if [[ $USE_FP8 -eq 1 ]]; then + echo "Running FP8 Training using Transformer Engine ..." + ADDITIONAL_PARAMS+=" --fp8-format hybrid --fp8-amax-history-len 1024 --fp8-amax-compute-algo max" + USE_TE=1 +fi + +if [[ $MOE_GROUPED_GEMM -eq 1 ]]; then + echo "Running MoE with Grouped GEMM" + TRAINING_DTYPE=bf16 # Currently GroupedGEMM for MoE only supports bf16 dtype +fi + +if [[ $USE_TE -eq 1 ]]; then + echo "Running with TransformerEngine ..." + TRANSFORMER_IMPL=transformer_engine + TRAINING_DTYPE=bf16 + ADDITIONAL_PARAMS+=" --attention-softmax-in-fp32" +else + echo "Running with local transformer implementation ..." +fi +if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running checkpoint resume test..." + __SAVE_INTERVAL=50 + ADDITIONAL_PARAMS+=" --use-checkpoint-opt_param-scheduler" + if [[ $MAX_STEPS -ne 100 ]]; then + echo "Overriding MAX_STEPS=100" + MAX_STEPS=100 + fi +else + __SAVE_INTERVAL=${SAVE_INTERVAL:-10000} # inf +fi +if [[ -n "$CKPT_FORMAT" ]] && [[ "$CKPT_FORMAT" != 'torch' ]]; then + echo "Using distributed checkpoint format $CKPT_FORMAT..." + [[ "$CKPT_FORMAT" == 'zarr' ]] && command="$command pip install zarr tensorstore==0.1.45;" + ADDITIONAL_PARAMS+=" --use-dist-ckpt --dist-ckpt-format $CKPT_FORMAT --use-mcore-models" +fi +set +x +# Runs the "345M" parameter model + +build_torch_run_cmd() { + DISTRIBUTED_ARGS="--max-restarts 3 --nproc_per_node $GPUS_PER_NODE --nnodes $NUM_NODES" + [[ -n "$RUN_CMD" ]] && run_cmd=$RUN_CMD || run_cmd="torchrun $DISTRIBUTED_ARGS" + torch_run_cmd="$run_cmd \ + pretrain_gpt.py \ + --num-layers 12 \ + --hidden-size 512 \ + --num-attention-heads 8 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --log-validation-ppl-to-tensorboard \ + --log-timers-to-tensorboard \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --micro-batch-size ${MBS:-4} \ + --global-batch-size ${GBS:-32} \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --train-iters $MAX_STEPS \ + --timing-log-level 2 \ + --lr-decay-iters 320000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --lr-decay-style cosine \ + --min-lr 1.0e-5 \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --lr-warmup-fraction .01 \ + --log-interval 1 \ + --save-interval $__SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 10 \ + --transformer-impl $TRANSFORMER_IMPL \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ + ${EP_SIZE:+--expert-model-parallel-size "$EP_SIZE"} \ + ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \ + ${USE_LEGACY:+--use-legacy-models} \ + ${DATA_CACHE:+--data-cache-path "$DATA_CACHE"} \ + --${TRAINING_DTYPE}" + + if [[ "${TRAINING_DTYPE}" == "fp16" ]]; then + torch_run_cmd+=" --apply-query-key-layer-scaling" + # NVTE_APPLY_QK_LAYER_SCALING=1 is required if using: + # 1. --apply-query-key-layer-scaling + # 2. transformer_impl="transformer_engine" + # 3. TE >= 0.11 + # 4. fp16 + export NVTE_APPLY_QK_LAYER_SCALING=1 + fi +} + +build_torch_run_cmd +command="$command $torch_run_cmd" +if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "------RESUME OVERRIDES ARGS LIST --------" + # apply all env vars starting from 'RESUME_OVERRIDE_' (after removing prefix) + _OVERRIDE_PREFIX="RESUME_OVERRIDE_" + _OVERRIDE_PREFIX_LENGTH=${#_OVERRIDE_PREFIX} + _NONEMPTY_OVERRIDES=0 + for ARGUMENT in "$@" + do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + if [[ $KEY == ${_OVERRIDE_PREFIX}* ]]; then + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + KEY="${KEY:$_OVERRIDE_PREFIX_LENGTH}" + if [[ -n "${VALUE}" ]]; then + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" + _NONEMPTY_OVERRIDES=1 + fi + fi + done + echo "---------------------------------" + if [[ $_NONEMPTY_OVERRIDES == 1 ]]; then + ADDITIONAL_PARAMS+=" --no-load-rng" # assuming TPxPP mismatch + fi + + build_torch_run_cmd + command="$command; rm -rf $CHECKPOINT_PATH/iter_0000100; echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt; $torch_run_cmd" +fi +echo "-------------------- THE FINAL PRETRAIN SCRIPT COMMAND THAT WILL BE RUN ------------" +echo "$command" +echo "-----------------------------------------------------------------------------" + +echo "$command" > $SCRIPTS_DIR/pretrain_gpt3_distributed_command.sh +eval $command + +echo "Saving test results to $TENSORBOARD_DIR" +PYTHONPATH=$PWD python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ + tee ${TENSORBOARD_DIR}/results.json + +if [[ $SKIP_PYTEST != 1 ]]; then + echo "-----------------------------------------------------------------------------" + if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running pytest 1st vs 2nd run comparison" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py + else + echo "Running pytest checks against golden values" + export LOGS_DIR=$TENSORBOARD_DIR + if [[ $USE_FP8 -eq 1 ]]; then + export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/gpt3-345m-weekly-dgx-h100-1n8g-mcore-tp1-pp1-bf16-baseline.json" + pytest ./tests/functional_tests/python_test_utils/test_fp8_ci_pipeline.py + else + export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json" + pytest ./tests/functional_tests/python_test_utils/test_ci_pipeline.py + fi + fi +fi diff --git a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh new file mode 100644 index 0000000..7367b1d --- /dev/null +++ b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh @@ -0,0 +1,65 @@ +#! /bin/bash +echo "------ARGUMENTS LIST --------" +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" +done +echo "---------------------------------" + +set -exo pipefail + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) + +command="export CUDA_DEVICE_MAX_CONNECTIONS=1;" + +set +x +# Runs the "126m" parameter model + +build_run_cmd() { + #DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NUM_NODES" + [[ -n "$RUN_CMD" ]] && run_cmd=$RUN_CMD || run_cmd="python examples/nlp/language_modeling/megatron_gpt_pretraining.py" + nemo_run_cmd="$run_cmd \ + trainer.num_nodes=$NUM_NODES \ + trainer.devices=$GPUS_PER_NODE \ + trainer.max_steps=$MAX_STEPS \ + trainer.val_check_interval=$MAX_STEPS \ + trainer.limit_val_batches=50 \ + trainer.max_epochs=null \ + trainer.precision=bf16 \ + model.num_layers=12 \ + model.hidden_size=768 \ + model.num_attention_heads=12 \ + model.micro_batch_size=$MBS \ + model.global_batch_size=$GBS \ + model.tensor_model_parallel_size=$TP_SIZE \ + model.pipeline_model_parallel_size=$PP_SIZE \ + model.virtual_pipeline_model_parallel_size=${VP_SIZE:-null} \ + model.encoder_seq_length=2048 \ + model.max_position_embeddings=2048 \ + model.ffn_hidden_size=3072 \ + model.mcore_gpt=True \ + model.apply_query_key_layer_scaling=True \ + model.megatron_amp_O2=True \ + model.data.data_prefix=[] \ + model.data.data_impl=mock \ + model.data.splits_string=\'[99990,8,2]\' \ + model.optim.name=distributed_fused_adam \ + model.optim.weight_decay=0.1 \ + exp_manager.create_checkpoint_callback=False \ + ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS}" +} + +build_run_cmd +command="$command $nemo_run_cmd" +eval $command diff --git a/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh b/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh new file mode 100644 index 0000000..ca4cddb --- /dev/null +++ b/tests/functional_tests/test_scripts/multimodal/pretrain_llava_distributed_test.sh @@ -0,0 +1,194 @@ +#! /bin/bash +echo "------ARGUMENTS LIST --------" +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" +done +echo "---------------------------------" + +set -exo pipefail +if [[ -z $MBS ]]; then MBS=4; fi +if [[ -z $GBS ]]; then GBS=32; fi +if [[ -z $MOE_GROUPED_GEMM ]]; then MOE_GROUPED_GEMM=0; fi +if [[ -z $ALLOW_NONDETERMINISTIC ]]; then ALLOW_NONDETERMINISTIC=0; fi + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) + +command="export CUDA_DEVICE_MAX_CONNECTIONS=1;" + +TRAINING_DTYPE=fp16 +TRANSFORMER_IMPL=local + +if [[ $ALLOW_NONDETERMINISTIC -eq 1 ]]; then + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=1;" +else + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0; export NCCL_ALGO=Tree;" + ADDITIONAL_PARAMS+=" --deterministic-mode" +fi + +USE_LEGACY=1 +if [[ $USE_CORE -eq 1 ]]; then + echo "Running using megatron core" + TRANSFORMER_IMPL=local + TRAINING_DTYPE=bf16 + unset USE_LEGACY +fi + +if [[ $MOE_GROUPED_GEMM -eq 1 ]]; then + echo "Running MoE with Grouped GEMM" + TRAINING_DTYPE=bf16 # Currently GroupedGEMM for MoE only supports bf16 dtype +fi + +if [[ $USE_TE -eq 1 ]]; then + echo "Running with TransformerEngine ..." + TRANSFORMER_IMPL=transformer_engine + TRAINING_DTYPE=bf16 + ADDITIONAL_PARAMS+=" --attention-softmax-in-fp32" +else + echo "Running with local transformer implementation ..." +fi +if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running checkpoint resume test..." + __SAVE_INTERVAL=50 + ADDITIONAL_PARAMS+=" --use-checkpoint-opt_param-scheduler" + if [[ $MAX_STEPS -ne 100 ]]; then + echo "Overriding MAX_STEPS=100" + MAX_STEPS=100 + fi +else + __SAVE_INTERVAL=10000 # inf +fi +if [[ -n "$CKPT_FORMAT" ]] && [[ "$CKPT_FORMAT" != 'torch' ]]; then + echo "Using distributed checkpoint format $CKPT_FORMAT..." + [[ "$CKPT_FORMAT" == 'zarr' ]] && command="$command pip install zarr tensorstore==0.1.45;" + ADDITIONAL_PARAMS+=" --use-dist-ckpt --dist-ckpt-format $CKPT_FORMAT --use-mcore-models" +fi +set +x + +DISTRIBUTED_ARGS="--max-restarts 3 --nproc_per_node $GPUS_PER_NODE --nnodes $NUM_NODES" + +build_torch_run_cmd() { + torch_run_cmd="torchrun $DISTRIBUTED_ARGS \ + pretrain_vlm.py \ + --num-layers 12 \ + --hidden-size 512 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --num-attention-heads 8 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --log-validation-ppl-to-tensorboard \ + --log-timers-to-tensorboard \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --micro-batch-size ${MBS:-4} \ + --global-batch-size ${GBS:-32} \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --train-iters $MAX_STEPS \ + --timing-log-level 2 \ + --lr-decay-iters 320000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --split 949,50,1 \ + --tokenizer-type NullTokenizer \ + --vocab-size=8192 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --lr-decay-style cosine \ + --min-lr 1.0e-5 \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --lr-warmup-fraction .01 \ + --log-interval 1 \ + --save-interval $__SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 10 \ + --transformer-impl $TRANSFORMER_IMPL \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ + ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \ + ${USE_LEGACY:+--use-legacy-models} \ + --no-gradient-accumulation-fusion \ + --${TRAINING_DTYPE} \ + --img-h 336 \ + --img-w 336 \ + --patch-dim 14 \ + --mock-data" + + if [[ "${TRAINING_DTYPE}" == "fp16" ]]; then + torch_run_cmd+=" --apply-query-key-layer-scaling" + # NVTE_APPLY_QK_LAYER_SCALING=1 is required if using: + # 1. --apply-query-key-layer-scaling + # 2. transformer_impl="transformer_engine" + # 3. TE >= 0.11 + # 4. fp16 + export NVTE_APPLY_QK_LAYER_SCALING=1 + fi +} + +build_torch_run_cmd +command="$command $torch_run_cmd" +if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "------RESUME OVERRIDES ARGS LIST --------" + # apply all env vars starting from 'RESUME_OVERRIDE_' (after removing prefix) + _OVERRIDE_PREFIX="RESUME_OVERRIDE_" + _OVERRIDE_PREFIX_LENGTH=${#_OVERRIDE_PREFIX} + _NONEMPTY_OVERRIDES=0 + for ARGUMENT in "$@" + do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + if [[ $KEY == ${_OVERRIDE_PREFIX}* ]]; then + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + KEY="${KEY:$_OVERRIDE_PREFIX_LENGTH}" + if [[ -n "${VALUE}" ]]; then + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" + _NONEMPTY_OVERRIDES=1 + fi + fi + done + echo "---------------------------------" + if [[ $_NONEMPTY_OVERRIDES == 1 ]]; then + ADDITIONAL_PARAMS+=" --no-load-rng" # assuming TPxPP mismatch + fi + + build_torch_run_cmd + command="$command; rm -rf $CHECKPOINT_PATH/iter_0000100; echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt; $torch_run_cmd" +fi +echo "-------------------- THE FINAL PRETRAIN SCRIPT COMMAND THAT WILL BE RUN ------------" +echo "$command" +echo "-----------------------------------------------------------------------------" + +echo "$command" > $SCRIPTS_DIR/pretrain_llava_distributed_command.sh +eval $command + +echo "Saving test results to $TENSORBOARD_DIR" +PYTHONPATH=$PWD python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ + tee ${TENSORBOARD_DIR}/results.json + +if [[ $SKIP_PYTEST != 1 ]]; then + echo "-----------------------------------------------------------------------------" + if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running pytest 1st vs 2nd run comparison" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py + else + echo "Running pytest checks against golden values" + export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_ci_pipeline.py + fi +fi diff --git a/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh b/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh new file mode 100644 index 0000000..f9a3172 --- /dev/null +++ b/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh @@ -0,0 +1,168 @@ +#! /bin/bash + +echo "------ARGUMENTS LIST --------" +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" +done +echo "---------------------------------" + +set -exo pipefail +if [[ -z $MBS ]]; then MBS=4; fi + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) + +command="export CUDA_DEVICE_MAX_CONNECTIONS=1;" + +TRANSFORMER_IMPL=local +TRAINING_DTYPE=bf16 + +USE_LEGACY=1 +if [[ $USE_CORE -eq 1 ]]; then + echo "Running using megatron core" + TRANSFORMER_IMPL=local + TRAINING_DTYPE=bf16 + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0;" + unset USE_LEGACY + export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 +fi + +if [[ $USE_TE -eq 1 ]]; then + echo "Running with TransformerEngine ..." + TRANSFORMER_IMPL=transformer_engine + TRAINING_DTYPE=bf16 +else + echo "Running with local transformer implementation ..." +fi + +if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running checkpoint resume test..." + __SAVE_INTERVAL=50 + if [[ $MAX_STEPS -ne 100 ]]; then + echo "Overriding MAX_STEPS=100" + MAX_STEPS=100 + fi +else + __SAVE_INTERVAL=10000 # inf +fi +set +x +# Runs the "345M" parameter model +DISTRIBUTED_ARGS="--max-restarts 3 --nproc_per_node $GPUS_PER_NODE --nnodes $NUM_NODES" + +build_args() { + ARGS=" \ + --exit-interval $MAX_STEPS \ + \ + --recompute-activations \ + --use-flash-attn \ + --apply-layernorm-1p \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --no-position-embedding \ + --use-rotary-position-embeddings \ + --rotary-percent 0.5 \ + --swiglu \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --exit-duration-in-mins 220 \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --micro-batch-size $MBS \ + --global-batch-size 256 \ + --train-samples 100000 \ + --lr-decay-samples 99000 \ + --lr-warmup-samples 1000 \ + --lr 2.5e-5 \ + --min-lr 2.5e-6 \ + --lr-decay-style cosine \ + --log-interval 5 \ + --eval-iters 100 \ + --eval-interval 2000 \ + --tokenizer-type GPT2BPETokenizer \ + --vocab-file /workspace/data/retro_data/vocab/gpt2-vocab.json \ + --merge-file /workspace/data/retro_data/vocab/gpt2-merges.txt \ + --data-path /workspace/data/retro_data/inputs/wiki-200k_text_document \ + --split 98,2,0 \ + --clip-grad 1.0 \ + --weight-decay 0.1 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --init-method-std 0.007 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --log-validation-ppl-to-tensorboard \ + --log-timers-to-tensorboard \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --save-interval $__SAVE_INTERVAL \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --bf16 \ + --transformer-impl $TRANSFORMER_IMPL \ + --${TRAINING_DTYPE} \ + ${USE_LEGACY:+--use-legacy-models} \ + ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \ + --retro-workdir /workspace/data/retro_data/neighbors + --retro-add-retriever \ + --num-workers 32 \ +" +} + +build_args +torch_run_cmd="torchrun $DISTRIBUTED_ARGS \ + pretrain_retro.py \ + ${ARGS}" + +command="$command $torch_run_cmd" + +if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + MAX_STEPS=50 + build_args + torch_run_cmd="torchrun $DISTRIBUTED_ARGS \ + pretrain_retro.py \ + ${ARGS}" + command="$command; rm -rf $CHECKPOINT_PATH/iter_0000100; echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt; $torch_run_cmd" +fi +echo "-------------------- THE FINAL PRETRAIN SCRIPT COMMAND THAT WILL BE RUN ------------" +echo "$command" +echo "-----------------------------------------------------------------------------" + +pip install h5py +pip install transformers +pip install faiss-gpu + +echo "$command" > $SCRIPTS_DIR/pretrain_retro_distributed_command.sh +eval $command + +echo "Saving test results to $TENSORBOARD_DIR" +PYTHONPATH=$PWD python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ + tee ${TENSORBOARD_DIR}/results.json + +if [[ $SKIP_PYTEST != 1 ]]; then + echo "-----------------------------------------------------------------------------" + if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running pytest 1st vs 2nd run comparison" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py + else + echo "Running pytest checks against golden values" + export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_ci_pipeline.py + fi +fi diff --git a/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh b/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh new file mode 100644 index 0000000..5c297ed --- /dev/null +++ b/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh @@ -0,0 +1,155 @@ +#! /bin/bash +echo "------ARGUMENTS LIST --------" +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" +done +echo "---------------------------------" + +set -exo pipefail +if [[ -z $MBS ]]; then MBS=4; fi +if [[ -z $GBS ]]; then GBS=32; fi +if [[ -z $VOCAB_PATH ]]; then VOCAB_PATH="/workspace/data/t5_data/bert-large-cased-vocab.txt"; fi +if [[ -z $ALLOW_NONDETERMINISTIC ]]; then ALLOW_NONDETERMINISTIC=0; fi + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) + +command="export CUDA_DEVICE_MAX_CONNECTIONS=1;" + +TRAINING_DTYPE=fp16 +TRANSFORMER_IMPL=local + +if [[ $ALLOW_NONDETERMINISTIC -eq 1 ]]; then + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=1;" +else + command="$command export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0; export NCCL_ALGO=^NVLS;" + ADDITIONAL_PARAMS+=" --deterministic-mode" +fi + +USE_LEGACY=1 +if [[ $USE_CORE -eq 1 ]]; then + echo "Running using megatron core" + TRANSFORMER_IMPL=local + TRAINING_DTYPE=bf16 + unset USE_LEGACY +fi + +if [[ $NO_FA -eq 1 ]]; then + echo "Turn off flash attention environment variable" + export NVTE_FLASH_ATTN=0 + export NVTE_FUSED_ATTN=0 +fi + +if [[ $USE_TE -eq 1 ]]; then + echo "Running with TransformerEngine ..." + TRANSFORMER_IMPL=transformer_engine + TRAINING_DTYPE=bf16 +else + echo "Running with local transformer implementation ..." +fi + +if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running checkpoint resume test..." + __SAVE_INTERVAL=50 + if [[ $MAX_STEPS -ne 100 ]]; then + echo "Overriding MAX_STEPS=100" + MAX_STEPS=100 + fi +else + __SAVE_INTERVAL=10000 # inf +fi +set +x + +# install neccessary library +pip install pydantic==2.2.1 + +# Runs the "220M" parameter model +DISTRIBUTED_ARGS="--max-restarts 3 --nproc_per_node $GPUS_PER_NODE --nnodes $NUM_NODES" + +torch_run_cmd="torchrun $DISTRIBUTED_ARGS \ + pretrain_t5.py \ + --encoder-num-layers 12 \ + --decoder-num-layers 12 \ + --hidden-size 768 \ + --num-attention-heads 12 \ + --kv-channels 64 \ + --ffn-hidden-size 3072 \ + --encoder-seq-length 512 \ + --decoder-seq-length 128 \ + --max-position-embeddings 512 \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size ${MBS:-4} \ + --global-batch-size ${GBS:-32} \ + --lr 0.0001 \ + --train-iters $MAX_STEPS \ + --lr-decay-iters $MAX_STEPS \ + --lr-decay-style linear \ + --min-lr 0.00001 \ + --weight-decay 1e-2 \ + --lr-warmup-fraction .01 \ + --clip-grad 1.0 \ + --${TRAINING_DTYPE} \ + --vocab-extra-ids 100 \ + --init-method-std 0.015 \ + --transformer-impl $TRANSFORMER_IMPL \ + --data-path $DATA_PATH \ + --vocab-file $VOCAB_PATH \ + --tokenizer-type BertWordPieceCase \ + --split 99982,9,9 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --log-validation-ppl-to-tensorboard \ + --log-timers-to-tensorboard \ + --timing-log-level 2 \ + --log-interval 1 \ + --save-interval $__SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 10 \ + --distributed-backend nccl \ + ${DATA_CACHE:+--data-cache-path "$DATA_CACHE"} \ + ${USE_LEGACY:+--use-legacy-models} \ + ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS}" + +command="$command $torch_run_cmd" +if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + command="$command; rm -rf $CHECKPOINT_PATH/iter_0000100; echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt; $torch_run_cmd" +fi +echo "-------------------- THE FINAL PRETRAIN SCRIPT COMMAND THAT WILL BE RUN ------------" +echo "$command" +echo "-----------------------------------------------------------------------------" + +echo "$command" > $SCRIPTS_DIR/pretrain_t5_distributed_command.sh +eval $command + +echo "Saving test results to $TENSORBOARD_DIR" +PYTHONPATH=$PWD python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ + tee ${TENSORBOARD_DIR}/results.json + +if [[ $SKIP_PYTEST != 1 ]]; then + echo "-----------------------------------------------------------------------------" + if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running pytest 1st vs 2nd run comparison" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py + else + echo "Running pytest checks against golden values" + export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_ci_pipeline.py + fi +fi diff --git a/tests/unit_tests/__init__.py b/tests/unit_tests/__init__.py new file mode 100644 index 0000000..1d3c586 --- /dev/null +++ b/tests/unit_tests/__init__.py @@ -0,0 +1,2 @@ +import torch._dynamo +torch._dynamo.config.suppress_errors = True \ No newline at end of file diff --git a/tests/unit_tests/data/__init__.py b/tests/unit_tests/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/data/test_bin_reader.py b/tests/unit_tests/data/test_bin_reader.py new file mode 100644 index 0000000..d1ea7ee --- /dev/null +++ b/tests/unit_tests/data/test_bin_reader.py @@ -0,0 +1,162 @@ +import os +import random +import sys +import tempfile +from types import ModuleType, SimpleNamespace +from typing import Any, Dict + +import nltk + +try: + import boto3 + import botocore.exceptions as exceptions +except ModuleNotFoundError: + boto3 = ModuleType("boto3") + sys.modules[boto3.__name__] = boto3 + exceptions = ModuleType("botocore.exceptions") + sys.modules[exceptions.__name__] = exceptions + +from megatron.core.datasets.indexed_dataset import ( + IndexedDataset, + S3Config, + _FileBinReader, + _MMapBinReader, + _S3BinReader, +) +from megatron.core.datasets.utils_s3 import S3_PREFIX, S3Client +from tests.unit_tests.data.test_preprocess_data import ( + build_datasets, + dummy_jsonl, + gpt2_merge, + gpt2_vocab, +) + +## +# Overload client from boto3 +## + + +class _LocalClient(S3Client): + """Local test client""" + + def __init__(self, *args: Any) -> None: + pass + + def download_file(self, Bucket: str, Key: str, Filename: str) -> None: + os.system(f"cp {os.path.join('/', Bucket, Key)} {Filename}") + assert os.path.exists(Filename) + + def upload_file(self, Filename: str, Bucket: str, Key: str) -> None: + raise NotImplementedError + + def head_object(self, Bucket: str, Key: str) -> Dict[str, Any]: + assert os.path.exists(os.path.join("/", Bucket, Key)) + return {} + + def get_object(self, Bucket: str, Key: str, Range: str) -> Dict[str, Any]: + _, _range = Range.split("=") + _range_beg, _range_end = tuple(map(int, _range.split("-"))) + + filename = os.path.join("/", Bucket, Key) + + with open(filename, mode='rb', buffering=0) as bin_buffer_file: + bin_buffer_file.seek(_range_beg) + _bytes = bin_buffer_file.read(_range_end - _range_beg) + + response = {"Body": SimpleNamespace(read=lambda: _bytes)} + + return response + + def close(self) -> None: + pass + + +setattr(boto3, "client", _LocalClient) + + +## +# Overload ClientError from botocore.exceptions +## + + +class _LocalClientError(Exception): + """ "Local test client error""" + + pass + + +setattr(exceptions, "ClientError", _LocalClientError) + + +def test_bin_reader(): + with tempfile.TemporaryDirectory() as temp_dir: + # set the default nltk data path + os.environ["NLTK_DATA"] = os.path.join(temp_dir, "nltk_data") + nltk.data.path.append(os.environ["NLTK_DATA"]) + + path_to_raws = os.path.join(temp_dir, "sample_raws") + path_to_data = os.path.join(temp_dir, "sample_data") + path_to_s3_cache = os.path.join(temp_dir, "s3_cache") + os.mkdir(path_to_raws) + os.mkdir(path_to_data) + os.mkdir(path_to_s3_cache) + + # create the dummy resources + dummy_jsonl(path_to_raws) + + # build the datasets + build_datasets( + path_to_raws, + path_to_data, + extra_args=[ + "--tokenizer-type", + "GPT2BPETokenizer", + "--vocab-file", + gpt2_vocab(temp_dir), + "--merge-file", + gpt2_merge(temp_dir), + "--append-eod", + "--workers", + "10", + "--log-interval", + "1", + ], + ) + + prefixes = set( + [ + os.path.join(temp_dir, "sample_data", path.split(".")[0]) + for path in os.listdir(path_to_data) + if path.endswith(".bin") or path.endswith(".idx") + ] + ) + + for prefix in prefixes: + indexed_dataset_file = IndexedDataset(prefix, multimodal=False, mmap=False) + assert isinstance(indexed_dataset_file.bin_reader, _FileBinReader) + + indexed_dataset_mmap = IndexedDataset(prefix, multimodal=False, mmap=True) + assert isinstance(indexed_dataset_mmap.bin_reader, _MMapBinReader) + + indexed_dataset_s3 = IndexedDataset( + S3_PREFIX + prefix, + multimodal=False, + mmap=False, + s3_config=S3Config(path_to_idx_cache=path_to_s3_cache), + ) + assert isinstance(indexed_dataset_s3.bin_reader, _S3BinReader) + + assert len(indexed_dataset_s3) == len(indexed_dataset_file) + assert len(indexed_dataset_s3) == len(indexed_dataset_mmap) + + indices = random.sample( + list(range(len(indexed_dataset_s3))), min(100, len(indexed_dataset_s3)) + ) + + for idx in indices: + assert (indexed_dataset_s3[idx] == indexed_dataset_file[idx]).all() + assert (indexed_dataset_s3[idx] == indexed_dataset_mmap[idx]).all() + + +if __name__ == "__main__": + test_bin_reader() diff --git a/tests/unit_tests/data/test_builder.py b/tests/unit_tests/data/test_builder.py new file mode 100644 index 0000000..5675259 --- /dev/null +++ b/tests/unit_tests/data/test_builder.py @@ -0,0 +1,367 @@ +## +# Compile megatron.core.datasets.helpers dependencies before BlendedDataset import +## + +import torch + +from megatron.core.datasets.utils import compile_helpers +from tests.unit_tests.test_utilities import Utils + +if torch.distributed.is_available(): + Utils.initialize_distributed() + if torch.distributed.get_rank() == 0: + compile_helpers() + torch.distributed.barrier() +else: + compile_helpers() + +## +# Done +## + +import os +import tempfile +from collections import defaultdict +from typing import Dict, Optional + +import numpy +import torch + +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset +from megatron.core.datasets.utils import Split, get_blend_from_list + +_NUM_DATASETS = 10 + +_SEQUENCE_LENGTH = 10 + +_SIZES = {} +for split in Split: + _SIZES[split] = [] + for i in range(_NUM_DATASETS): + _SIZES[split].append({Split.train: 1000, Split.valid: 100, Split.test: 10}[split] * (i + 1)) + +_MARGIN = 0.005 + + +def do_setup(odir): + paths = defaultdict(list) + + for i in range(_NUM_DATASETS): + path_to_data = os.path.join(odir, str(i)) + os.mkdir(path_to_data) + + for split in _SIZES: + data = numpy.zeros((_SIZES[split][i], _SEQUENCE_LENGTH)) + path = os.path.join(path_to_data, f"{split.name}.npy") + numpy.save(path, data) + paths[split].append(path) + + return paths + + +def test_builder(): + + # Define the class here to avoid pytest warnings + + class TestDataset(MegatronDataset): + def __init__( + self, + dataset: LowLevelDataset, + dataset_path: Optional[str], + indices: numpy.ndarray, + num_samples: Optional[int], + index_split: Split, + config: BlendedMegatronDatasetConfig, + ) -> None: + super().__init__(dataset, dataset_path, indices, num_samples, index_split, config) + + if self.num_samples is None: + self.num_samples = len(self.indices) + + self.sample_index = numpy.random.choice(self.indices, size=self.num_samples) + + @staticmethod + def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int: + return len(low_level_dataset) + + @staticmethod + def build_low_level_dataset( + dataset_path: str, config: BlendedMegatronDatasetConfig + ) -> LowLevelDataset: + return numpy.load(dataset_path) + + def __len__(self) -> int: + return len(self.sample_index) + + def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]: + return {"text": self.dataset[self.sample_index[idx]]} + + with tempfile.TemporaryDirectory() as temp_dir: + + paths = do_setup(temp_dir) + + blends = { + split: get_blend_from_list( + [ + weight_or_path + for pair in zip(list(range(1, len(paths[split]) + 1, 1)), paths[split]) + for weight_or_path in pair + ] + ) + for split in Split + } + + blends_unweighted = {split: (blends[split][0], None) for split in blends} + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[blends[Split.train], None, None,], + ) + try: + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [None, None, None], lambda: True, config + ).build() + raise RuntimeError + except AssertionError: + pass + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[get_blend_from_list([paths[Split.train][0]]), None, None,], + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [1000, None, None], lambda: True, config + ).build() + assert len(datasets[0]) == 1000 and isinstance(datasets[0], TestDataset) + assert datasets[1] is None + assert datasets[2] is None + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[ + blends_unweighted[Split.train], + blends_unweighted[Split.valid], + blends_unweighted[Split.test], + ], + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [1000, 1000, 1000], lambda: True, config + ).build() + assert len(datasets[0]) == 1000 + assert len(datasets[1]) == 1000 + assert len(datasets[2]) == sum(_SIZES[Split.test]) + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[ + blends_unweighted[Split.train], + blends_unweighted[Split.valid], + blends_unweighted[Split.test], + ], + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [None, None, None], lambda: True, config + ).build() + assert len(datasets[0]) == sum(_SIZES[Split.train]) + assert numpy.all( + numpy.array(datasets[0].weights) + == numpy.unique(datasets[0].dataset_index, return_counts=True)[1] + ) + assert len(datasets[1]) == sum(_SIZES[Split.valid]) + assert numpy.all( + numpy.array(datasets[1].weights) + == numpy.unique(datasets[1].dataset_index, return_counts=True)[1] + ) + assert len(datasets[2]) == sum(_SIZES[Split.test]) + assert numpy.all( + numpy.array(datasets[2].weights) + == numpy.unique(datasets[2].dataset_index, return_counts=True)[1] + ) + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[blends_unweighted[Split.train], None, None,], + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [1000, None, None], lambda: True, config + ).build() + assert len(datasets[0]) == 1000 + for i in range(_NUM_DATASETS): + assert len(datasets[0].datasets[i]) == _SIZES[Split.train][i] + assert datasets[1] is None + assert datasets[2] is None + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[blends[Split.train], None, None], + ) + try: + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [1000, None, None], lambda: True, config + ).build() + raise RuntimeError + except IndexError: + ## + # + # The size per dataset is a function of the requested size, the weight per dataset, + # and a constant coefficient. The sizes, and consequently the total size to request, + # are modified such that the weights may or may not be sufficiently representative. + # To fix this, the weights should be reset according to the new sizes: + # + # S := size + # W := weights + # + # S = func(S, W) + # + # W = S / sum(S) + # + ## + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[blends[Split.train], blends[Split.valid], blends[Split.test],], + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [100, 100, 100], lambda: True, config + ).build() + assert ( + len(datasets[0]) >= 100 and len(datasets[0]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS + ) + assert ( + len(datasets[1]) >= 100 and len(datasets[1]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS + ) + assert ( + len(datasets[2]) >= 100 and len(datasets[2]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS + ) + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends_unweighted[Split.train], + split="100,0,0", + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [None, None, None], lambda: True, config + ).build() + assert len(datasets[0]) == sum(_SIZES[Split.train]) + assert numpy.all( + numpy.array(datasets[0].weights) + == numpy.unique(datasets[0].dataset_index, return_counts=True)[1] + ) + assert datasets[1] is None + assert datasets[2] is None + + if torch.distributed.is_initialized(): + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends_unweighted[Split.train], + split="100,0,0", + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, + [None, None, None], + lambda: torch.distributed.get_rank() % 2 == 0, + config, + ).build() + if torch.distributed.get_rank() % 2 == 0: + assert len(datasets[0]) == sum(_SIZES[Split.train]) + assert numpy.all( + numpy.array(datasets[0].weights) + == numpy.unique(datasets[0].dataset_index, return_counts=True)[1] + ) + else: + assert datasets[0] is None + assert datasets[1] is None + assert datasets[2] is None + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends_unweighted[Split.train], + split="50,50,0", + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [1000, 0, None], lambda: True, config + ).build() + assert len(datasets[0]) == 1000 + assert sum(map(len, datasets[0].datasets)) == sum(_SIZES[Split.train]) / 2 + assert sum(map(len, datasets[1].datasets)) == sum(_SIZES[Split.train]) / 2 + assert datasets[1] is not None and len(datasets[1]) == 0 + assert datasets[2] is None + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends_unweighted[Split.train], + split="50,50,0", + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, + [int(sum(_SIZES[Split.train]) / 4), int(sum(_SIZES[Split.train])), None], + lambda: True, + config, + ).build() + assert len(datasets[0]) == sum(_SIZES[Split.train]) / 4 + assert len(datasets[1]) == sum(_SIZES[Split.train]) / 2 + assert datasets[2] is None + + # 990 9 1 + # 100000 1000 1 + # [] + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends[Split.train], + split="990,9,1", + ) + try: + # All three of 100000, 1000, and 1 result in error, yet 10000 and 100 do not + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [100000, 1000, 1], lambda: True, config + ).build() + except IndexError: + ## + # + # The size per dataset is a function of the requested size, the weight per dataset, + # and a constant coefficient. The sizes, and consequently the total size to request, + # are modified such that the weights may or may not be sufficiently representative. + # To fix this, the weights should be reset according to the new sizes: + # + # S := size + # W := weights + # + # S = func(S, W) + # + # W = S / sum(S) + # + ## + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends[Split.train], + split="990,9,1", + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [10000, 100, 0], lambda: True, config + ).build() + assert ( + len(datasets[0]) >= 10000 + and len(datasets[0]) <= 10000 * (1 + _MARGIN) + _NUM_DATASETS + ) + assert ( + len(datasets[1]) >= 100 and len(datasets[1]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS + ) + assert len(datasets[2]) == 0 + + +if __name__ == "__main__": + test_builder() diff --git a/tests/unit_tests/data/test_gpt_dataset.py b/tests/unit_tests/data/test_gpt_dataset.py new file mode 100644 index 0000000..a53854f --- /dev/null +++ b/tests/unit_tests/data/test_gpt_dataset.py @@ -0,0 +1,116 @@ +## +# Compile megatron.core.datasets.helpers dependencies before BlendedDataset import +## + +import torch + +from megatron.core.datasets.utils import compile_helpers +from tests.unit_tests.test_utilities import Utils + +if torch.distributed.is_available(): + Utils.initialize_distributed() + if torch.distributed.get_rank() == 0: + compile_helpers() + torch.distributed.barrier() +else: + compile_helpers() + +## +# Done +## + +import random + +import numpy + +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset +from megatron.training.tokenizer.tokenizer import _NullTokenizer + +_MOCK_VOCAB_SIZE = 8192 + + +def sample_N(dataset, N, randomize): + if randomize: + indices = [random.randint(0, len(dataset) - 1) for _ in range(N)] + else: + indices = list(range(N)) + samples = [dataset[index]["tokens"].numpy() for index in indices] + return samples + + +def test_mock_gpt_dataset(): + tokenizer = _NullTokenizer(vocab_size=_MOCK_VOCAB_SIZE) + + config = GPTDatasetConfig( + random_seed=1234, + sequence_length=1024, + split="990,9,1", + reset_position_ids=True, + reset_attention_mask=True, + eod_mask_loss=True, + tokenizer=tokenizer, + ) + + datasets = BlendedMegatronDatasetBuilder( + MockGPTDataset, [100, 100, 100], lambda: True, config + ).build() + + N = 10 + + # Check iso-index variance by split + subsets = [sample_N(dataset, N, randomize=False) for dataset in datasets] + assert not numpy.allclose(subsets[0], subsets[1]) + assert not numpy.allclose(subsets[0], subsets[2]) + assert not numpy.allclose(subsets[1], subsets[2]) + + # Check iso-split / iso-index identity + subset_1A = sample_N(datasets[0], N, randomize=False) + subset_1B = sample_N(datasets[0], N, randomize=False) + assert numpy.allclose(subset_1A, subset_1B) + + # Check iso-split variance by index + subset_1A = sample_N(datasets[0], N, randomize=True) + subset_1B = sample_N(datasets[0], N, randomize=True) + assert not numpy.allclose(subset_1A, subset_1B) + + config = GPTDatasetConfig( + random_seed=1234, + sequence_length=1024, + split="990,10,0", + reset_position_ids=True, + reset_attention_mask=True, + eod_mask_loss=True, + drop_last_partial_validation_sequence=False, + add_extra_token_to_sequence=False, + tokenizer=tokenizer, + ) + + datasets = BlendedMegatronDatasetBuilder( + MockGPTDataset, [0, None, 0], lambda: True, config + ).build() + + sample = datasets[1][datasets[1].shuffle_index.argmax()] + argmax = sample['labels'].shape[0] - torch.flip(sample['labels'], [0]).argmax() - 1 + + # Test add_extra_token_to_sequence + assert sample['tokens'][argmax] != tokenizer.eod + assert sample['labels'][argmax] == tokenizer.eod + + # Test eod_mask_loss, drop_last_partial_validation_sequence + assert argmax < sample['labels'].shape[0] - 1 + assert torch.all(sample['labels'][argmax + 1 :] == 0) + assert not torch.any( + sample['loss_mask'][ + torch.logical_and(sample['labels'] == tokenizer.eod, sample['labels'] == 0,) + ] + ) + + sample = datasets[1][None] + + # Check handling of None index + assert not torch.any(sample['loss_mask']) + + +if __name__ == "__main__": + test_mock_gpt_dataset() diff --git a/tests/unit_tests/data/test_multimodal_dataset.py b/tests/unit_tests/data/test_multimodal_dataset.py new file mode 100644 index 0000000..4eeb157 --- /dev/null +++ b/tests/unit_tests/data/test_multimodal_dataset.py @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +## +# Compile megatron.core.datasets.helpers dependencies before BlendedDataset import +## + +import torch + +from megatron.core.datasets.utils import compile_helpers +from tests.unit_tests.test_utilities import Utils + +if torch.distributed.is_available(): + Utils.initialize_distributed() + if torch.distributed.get_rank() == 0: + compile_helpers() + torch.distributed.barrier() +else: + compile_helpers() + +## +# Done +## + +from types import SimpleNamespace + +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.multimodal_dataset import MockMultimodalDataset, MultimodalDatasetConfig +from megatron.training.tokenizer.tokenizer import _NullTokenizer + +_MOCK_VOCAB_SIZE = 8192 + + +def test_mock_multimodal_dataset(): + config = MultimodalDatasetConfig( + random_seed=1234, + sequence_length=1024, + reset_position_ids=False, + reset_attention_mask=False, + eod_mask_loss=True, + image_h=336, + image_w=336, + split="990,9,1", + tokenizer=_NullTokenizer(vocab_size=_MOCK_VOCAB_SIZE), + ) + + datasets = BlendedMegatronDatasetBuilder( + MockMultimodalDataset, [100, 100, 100], lambda: True, config + ).build() + + for ds in datasets: + sample = ds[0] + assert "image" in sample + assert sample["image"].shape == torch.Size([3, 336, 336]) + assert "tokens" in sample + + +if __name__ == "__main__": + test_mock_multimodal_dataset() diff --git a/tests/unit_tests/data/test_preprocess_data.py b/tests/unit_tests/data/test_preprocess_data.py new file mode 100644 index 0000000..6865096 --- /dev/null +++ b/tests/unit_tests/data/test_preprocess_data.py @@ -0,0 +1,241 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import json +import os +import sys +import tempfile + +import nltk +import requests + +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.training.tokenizer.gpt2_tokenization import ( + PRETRAINED_MERGES_ARCHIVE_MAP, + PRETRAINED_VOCAB_ARCHIVE_MAP, +) +from tools.merge_datasets import main as merge_main +from tools.preprocess_data import Encoder +from tools.preprocess_data import get_args as build_args +from tools.preprocess_data import main as build_main + +__HUGGINGFACE_BERT_BASE_UNCASED_VOCAB = ( + "https://huggingface.co/bert-base-uncased/raw/main/vocab.txt" +) + +__LOCAL_BERT_VOCAB = "/home/gitlab-runner/data/bert_data/vocab.txt" + +__LOCAL_GPT2_MERGE = "/home/gitlab-runner/data/gpt3_data/gpt2-merges.txt" + +__LOCAL_GPT2_VOCAB = "/home/gitlab-runner/data/gpt3_data/gpt2-vocab.json" + + +def dummy_jsonl(odir): + # numbers + list_numbers = [json.dumps({"text": str(i + 1)}) + "\n" for i in range(100)] + with open(os.path.join(odir, "numbers.jsonl"), "w") as writer: + writer.writelines(list_numbers) + # numbers ascending + list_numbers_ascending = [ + json.dumps({"text": " ".join([str(j + 1) for j in range(i + 1)])}) + "\n" + for i in range(100) + ] + with open(os.path.join(odir, "numbers_ascending.jsonl"), "w") as writer: + writer.writelines(list_numbers_ascending) + # test + list_test = [] + with open(__file__) as reader: + for line in reader: + list_test.append(json.dumps({"text": line}) + "\n") + with open(os.path.join(odir, "test.jsonl"), "w") as writer: + writer.writelines(list_test) + + +def build_datasets(idir, odir, extra_args=[]): + for name in os.listdir(idir): + sys.argv = [ + sys.argv[0], + "--input", + os.path.join(idir, name), + "--output-prefix", + os.path.join(odir, os.path.splitext(name)[0]), + ] + extra_args + build_main() + + +def merge_datasets(idir): + sys.argv = [sys.argv[0], "--input", idir, "--output-prefix", os.path.join(idir, "merge")] + merge_main() + + +def do_test_preprocess_data(temp_dir, extra_args=[]): + # set the default nltk data path + os.environ["NLTK_DATA"] = os.path.join(temp_dir, "nltk_data") + nltk.data.path.append(os.environ["NLTK_DATA"]) + + path_to_raws = os.path.join(temp_dir, "sample_raws") + path_to_data = os.path.join(temp_dir, "sample_data") + os.mkdir(path_to_raws) + os.mkdir(path_to_data) + + # create the dummy resources + dummy_jsonl(path_to_raws) + + # build the datasets + build_datasets( + path_to_raws, path_to_data, extra_args=extra_args, + ) + + # merge the datasets + merge_datasets(path_to_data) + + sys.argv = [sys.argv[0], "--input", None, "--output-prefix", None,] + extra_args + encoder = Encoder(build_args()) + encoder.initializer() + + def tokens_to_string(toks): + for option in ["decode", "detokenize"]: + try: + return getattr(encoder.tokenizer, option)(toks) + except: + continue + raise RuntimeError(f"{type(encoder.tokenizer)} tokenizer cannot decode or detokenize") + + merged_index = 0 + merged_dataset = IndexedDataset(os.path.join(path_to_data, "merge")) + + # sorted to ensure ordering matches merged dataset + basenames = sorted( + [ + name + for name in os.listdir(path_to_data) + if name.endswith(".idx") and not name.startswith("merge") + ] + ) + + # index into the merged document index + merged_doc_index_index = 0 + + for basename in basenames: + realpath_raw = f"{os.path.join(path_to_raws, '_'.join(basename.split('_')[:-2]))}.jsonl" + realpath_doc = os.path.join(path_to_data, basename.split(".")[-2]) + + dataset_index = 0 + dataset = IndexedDataset(realpath_doc) + + merged_doc_idx = merged_dataset.document_indices[ + merged_doc_index_index : merged_doc_index_index + len(dataset.document_indices) + ] + merged_doc_idx = merged_doc_idx - merged_doc_idx[0] + + assert ( + dataset.document_indices == merged_doc_idx + ).all(), f"ERROR: {basename.split('_')[:-2]}: merged dataset document indices mismatch" + + merged_doc_index_index += len(dataset.document_indices) - 1 + + with open(realpath_raw, "rt") as reader: + for json_line in reader: + toks = encoder.encode(json_line)[0]["text"] + + raw = tokens_to_string(toks) + + processed_toks = [] + while len(processed_toks) < len(toks): + processed_toks.extend(dataset[dataset_index]) + dataset_index += 1 + processed = tokens_to_string(processed_toks) + + assert ( + raw == processed + ), f"ERROR: {basename.split('_')[:-2]}: raw and processed documents do not match" + + merged_toks = [] + while len(merged_toks) < len(toks): + merged_toks.extend(merged_dataset[merged_index]) + merged_index += 1 + merged = tokens_to_string(merged_toks) + + assert ( + raw == merged + ), f"ERROR: {basename.split('_')[:-2]}: raw and merged documents do not match" + + print( + f"INFO: {''.join(basename.split('_')[:-2])}: raw, processed, and merged documents match!" + ) + + print("INFO: Success!") + + +def gpt2_vocab(odir): + if os.path.exists(__LOCAL_GPT2_VOCAB): + return __LOCAL_GPT2_VOCAB + path = os.path.join(odir, "vocab.json") + with open(path, "wb") as writer: + writer.write(requests.get(PRETRAINED_VOCAB_ARCHIVE_MAP['gpt2']).content) + return path + + +def gpt2_merge(odir): + if os.path.exists(__LOCAL_GPT2_MERGE): + return __LOCAL_GPT2_MERGE + path = os.path.join(odir, "merge.txt") + with open(path, "wb") as writer: + writer.write(requests.get(PRETRAINED_MERGES_ARCHIVE_MAP['gpt2']).content) + return path + + +def test_preprocess_data_gpt(): + with tempfile.TemporaryDirectory() as temp_dir: + + # gpt specific args + gpt_args = [ + "--tokenizer-type", + "GPT2BPETokenizer", + "--vocab-file", + gpt2_vocab(temp_dir), + "--merge-file", + gpt2_merge(temp_dir), + "--append-eod", + "--workers", + "10", + "--log-interval", + "1", + ] + + do_test_preprocess_data(temp_dir, extra_args=gpt_args) + + +def bert_vocab(odir): + if os.path.exists(__LOCAL_BERT_VOCAB): + return __LOCAL_BERT_VOCAB + path = os.path.join(odir, "vocab.txt") + with open(path, "wb") as writer: + writer.write(requests.get(__HUGGINGFACE_BERT_BASE_UNCASED_VOCAB).content) + return path + + +def test_preprocess_data_bert(): + with tempfile.TemporaryDirectory() as temp_dir: + + # bert specific args + bert_args = [ + "--tokenizer-type", + "BertWordPieceLowerCase", + "--vocab-file", + bert_vocab(temp_dir), + "--split-sentences", + "--workers", + "10", + "--log-interval", + "1", + "--partitions", + "2", + "--keep-sequential-samples", + ] + + do_test_preprocess_data(temp_dir, extra_args=bert_args) + + +if __name__ == "__main__": + test_preprocess_data_gpt() + test_preprocess_data_bert() \ No newline at end of file diff --git a/tests/unit_tests/data/test_preprocess_mmdata.py b/tests/unit_tests/data/test_preprocess_mmdata.py new file mode 100644 index 0000000..8aab96e --- /dev/null +++ b/tests/unit_tests/data/test_preprocess_mmdata.py @@ -0,0 +1,221 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import os +import random +import sys +import tempfile + +import nltk +import numpy + +from megatron.core.datasets.indexed_dataset import IndexedDataset +from tests.unit_tests.data.test_preprocess_data import dummy_jsonl, gpt2_merge, gpt2_vocab +from tools.merge_datasets import main as merge_main +from tools.preprocess_mmdata import Encoder +from tools.preprocess_mmdata import get_args as build_args +from tools.preprocess_mmdata import main as build_main + + +def dummy_img(odir_txt, odir_img): + for name in os.listdir(odir_txt): + with open(os.path.join(odir_txt, name), "rt") as reader_txt: + length = sum(1 for _ in reader_txt) + os.makedirs(os.path.join(odir_img, os.path.splitext(name)[0]), exist_ok=False) + for i in range(length): + with open( + os.path.join(odir_img, os.path.splitext(name)[0], f"{str(i).zfill(4)}.img"), "wb" + ) as writer_img: + # 32 * 32 - 1 to induce preprocessing 0-index padding + writer_img.write(bytes([random.randint(0, 255) for _ in range(32 * 32 - 1)])) + + +def build_datasets(idir_txt, idir_img, odir, extra_args=[]): + for name in os.listdir(idir_txt): + sys.argv = [ + sys.argv[0], + "--input", + os.path.join(idir_txt, name), + "--input-image", + os.path.join(idir_img, os.path.splitext(name)[0]), + "--output-prefix", + os.path.join(odir, os.path.splitext(name)[0]), + ] + extra_args + build_main() + + +def merge_datasets(idir): + sys.argv = [ + sys.argv[0], + "--input", + idir, + "--output-prefix", + os.path.join(idir, "merge"), + "--multimodal", + ] + merge_main() + + +def do_test_preprocess_mmdata(temp_dir, extra_args=[]): + # set the default nltk data path + os.environ["NLTK_DATA"] = os.path.join(temp_dir, "nltk_data") + nltk.data.path.append(os.environ["NLTK_DATA"]) + + path_to_raws_txt = os.path.join(temp_dir, "sample_raws_txt") + path_to_raws_img = os.path.join(temp_dir, "sample_raws_img") + path_to_data = os.path.join(temp_dir, "sample_data") + os.mkdir(path_to_raws_txt) + os.mkdir(path_to_raws_img) + os.mkdir(path_to_data) + + # create the dummy text resources + dummy_jsonl(path_to_raws_txt) + + # create the dummy image resources + dummy_img(path_to_raws_txt, path_to_raws_img) + + # build the datasets + build_datasets( + path_to_raws_txt, path_to_raws_img, path_to_data, extra_args=extra_args, + ) + + # merge the datasets + merge_datasets(path_to_data) + + sys.argv = [ + sys.argv[0], + "--input", + None, + "--input-image", + None, + "--output-prefix", + None, + ] + extra_args + encoder = Encoder(build_args()) + encoder.initializer() + + def tokens_to_string(toks): + for option in ["decode", "detokenize"]: + try: + return getattr(encoder.tokenizer, option)(toks) + except AttributeError: + continue + raise RuntimeError(f"{type(encoder.tokenizer)} tokenizer cannot `decode` or `detokenize`.") + + merged_index = 0 + merged_dataset = IndexedDataset(os.path.join(path_to_data, "merge"), multimodal=True) + + # sorted to ensure ordering matches merged dataset + basenames = sorted( + [ + name + for name in os.listdir(path_to_data) + if name.endswith(".idx") and not name.startswith("merge") + ] + ) + + # index into the merged document index + merged_doc_index_index = 0 + + for basename in basenames: + realpath_raw_txt = os.path.join(path_to_raws_txt, f"{os.path.splitext(basename)[0]}.jsonl") + realpath_raw_img = os.path.join(path_to_raws_img, os.path.splitext(basename)[0]) + realpath_doc = os.path.join(path_to_data, os.path.splitext(basename)[0]) + + dataset_index = 0 + dataset = IndexedDataset(realpath_doc, multimodal=True) + + merged_doc_idx = merged_dataset.document_indices[ + merged_doc_index_index : merged_doc_index_index + len(dataset.document_indices) + ] + merged_doc_idx = merged_doc_idx - merged_doc_idx[0] + + assert ( + dataset.document_indices == merged_doc_idx + ).all(), f"ERROR: {basename.split('_')[:-2]}: merged dataset document indices mismatch" + + merged_doc_index_index += len(dataset.document_indices) - 1 + + with open(realpath_raw_txt, "rt") as reader: + for json_line, image_path in zip( + reader, + [ + os.path.join(realpath_raw_img, basename) + for basename in os.listdir(realpath_raw_img) + ], + ): + toks, image, length = encoder.encode((json_line, image_path)) + + raw_text = tokens_to_string(toks) + # reverse to account for preprocessing 0-index padding + raw_image = image[::-1] + + processed_toks = dataset[dataset_index][0] + assert dataset[dataset_index][1] == 0 + processed_text = tokens_to_string(processed_toks) + + processed_image = dataset[dataset_index + 1][0] + assert dataset[dataset_index + 1][1] == 1 + # reverse to account for preprocessing 0-index padding + processed_image = processed_image[::-1][0 : raw_image.size] + + assert ( + raw_text == processed_text + ), f"ERROR: {basename.split('_')[:-2]}: raw and processed documents (text) do not match" + + assert numpy.allclose( + raw_image, processed_image + ), f"ERROR: {basename.split('_')[:-2]}: raw and processed documents (image) do not match" + + dataset_index += 2 + + merged_toks = merged_dataset[merged_index][0] + assert merged_dataset[merged_index][1] == 0 + merged_text = tokens_to_string(merged_toks) + + merged_image = merged_dataset[merged_index + 1][0] + assert merged_dataset[merged_index + 1][1] == 1 + # reverse to account for preprocessing 0-index padding + merged_image = merged_image[::-1][0 : raw_image.size] + + assert ( + raw_text == merged_text + ), f"ERROR: {basename.split('_')[:-2]}: raw and merged documents (text) do not match" + + assert numpy.allclose( + raw_image, merged_image + ), f"ERROR: {basename.split('_')[:-2]}: raw and merged documents (image) do not match" + + merged_index += 2 + + print( + f"INFO: {''.join(basename.split('_')[:-2])}: raw, processed, and merged documents match!" + ) + + print("INFO: Success!") + + +def test_preprocess_mmdata(): + with tempfile.TemporaryDirectory() as temp_dir: + + # gpt specific args + gpt_args = [ + "--pad-length", + "1024", + "--tokenizer-type", + "GPT2BPETokenizer", + "--vocab-file", + gpt2_vocab(temp_dir), + "--merge-file", + gpt2_merge(temp_dir), + "--append-eod", + "--workers", + "10", + "--log-interval", + "1", + ] + + do_test_preprocess_mmdata(temp_dir, extra_args=gpt_args) + + +if __name__ == "__main__": + test_preprocess_mmdata() diff --git a/tests/unit_tests/dist_checkpointing/__init__.py b/tests/unit_tests/dist_checkpointing/__init__.py new file mode 100644 index 0000000..4cf102b --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/__init__.py @@ -0,0 +1,57 @@ +import os +import weakref +from pathlib import Path +from shutil import rmtree +from tempfile import TemporaryDirectory +from typing import Union, Optional + +from tests.unit_tests.test_utilities import Utils + + +def empty_dir(path: Path): + if Utils.rank > 0: + return + for p in path.iterdir(): + if p.is_dir(): + rmtree(p) + else: + p.unlink() + + + +class TempNamedDir(TemporaryDirectory): + """ TemporaryDirectory with a fully named directory. Empties the dir if not empty. """ + def __init__(self, name: Union[str, Path], sync=True, + ignore_cleanup_errors=False) -> None: + self.name = str(name) + if Utils.rank == 0: + os.makedirs(name, exist_ok=True) + empty_dir(Path(name)) + + self._ignore_cleanup_errors = ignore_cleanup_errors + self._finalizer = weakref.finalize( + self, self._cleanup, self.name, + warn_message="Implicitly cleaning up {!r}".format(self)) + self.sync = sync + + def cleanup(self, override_sync: Optional[bool] = None) -> None: + sync = self.sync if override_sync is None else override_sync + if sync : + import torch + torch.distributed.barrier() + + if Utils.rank == 0: + super().cleanup() + + def __enter__(self): + path = Path(super().__enter__()) + if self.sync: + import torch + torch.distributed.barrier() + return path + + def __exit__(self, exc_type, exc_val, exc_tb): + raised = exc_type is not None + if not raised: + self.cleanup() + diff --git a/tests/unit_tests/dist_checkpointing/conftest.py b/tests/unit_tests/dist_checkpointing/conftest.py new file mode 100644 index 0000000..62392e4 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/conftest.py @@ -0,0 +1,37 @@ +from pathlib import Path +from unittest import mock + +import pytest + +from megatron.core.dist_checkpointing.strategies.base import StrategyAction, get_default_strategy +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +@pytest.fixture(scope="session") +def tmp_path_dist_ckpt(tmp_path_factory) -> Path: + """ Common directory for saving the checkpoint. + + Can't use pytest `tmp_path_factory` directly because directory must be shared between processes. """ + + tmp_dir = tmp_path_factory.mktemp('ignored', numbered=False) + tmp_dir = tmp_dir.parent.parent / 'tmp_dist_ckpt' + + if Utils.rank == 0: + with TempNamedDir(tmp_dir, sync=False): + yield tmp_dir + + else: + yield tmp_dir + + +@pytest.fixture(scope='session', autouse=True) +def set_default_dist_ckpt_strategy(): + def get_pyt_dist_save_sharded_strategy(): + return get_default_strategy(StrategyAction.SAVE_SHARDED, 'torch_dist', 1) + + with mock.patch( + 'megatron.core.dist_checkpointing.serialization.get_default_save_sharded_strategy', + new=get_pyt_dist_save_sharded_strategy, + ) as _fixture: + yield _fixture diff --git a/tests/unit_tests/dist_checkpointing/models/__init__.py b/tests/unit_tests/dist_checkpointing/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/dist_checkpointing/models/common.py b/tests/unit_tests/dist_checkpointing/models/common.py new file mode 100644 index 0000000..3dd4518 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/common.py @@ -0,0 +1,160 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import math + +import torch + +from megatron.core.dist_checkpointing import save, load, load_plain_tensors +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.serialization import \ + get_default_save_sharded_strategy, get_default_load_sharded_strategy +from megatron.core.dist_checkpointing.strategies.fully_parallel import \ + FullyParallelSaveStrategyWrapper, FullyParallelLoadStrategyWrapper +from megatron.core.dist_checkpointing.validation import StrictHandling +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +def common_test_simple_sharded_state_dict_save_load(initialize_model_fn, tmp_path_dist_ckpt, + src_layer_spec_fn, dst_layer_spec_fn): + """ Simple save and load sanity check, without any equality tests. """ + Utils.initialize_model_parallel(2,4) + gpt_model = initialize_model_fn(1, src_layer_spec_fn) + with TempNamedDir(tmp_path_dist_ckpt / 'test_gpt_model') as ckpt_dir: + # Save + sharded_state_dict = gpt_model.sharded_state_dict() + save(sharded_state_dict, ckpt_dir) + + # Load + gpt_model = initialize_model_fn(2, dst_layer_spec_fn) + sharded_state_dict = gpt_model.sharded_state_dict() + state_dict, missing_keys, unexpected_keys = load(sharded_state_dict, ckpt_dir, strict=StrictHandling.RETURN_ALL) + # Potential mismatch is because of extra states which is ok + assert all('_extra_state' in k for k in missing_keys) + assert all('_extra_state' in k for k in unexpected_keys) + gpt_model.load_state_dict(state_dict) + Utils.destroy_model_parallel() + + +def common_test_parallel_reconfiguration_e2e(initialize_model_fn, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, + src_layer_spec_fn, dst_layer_spec_fn, use_fpsl, + load_order="tp-dp-pp", store_order="tp-dp-pp"): + """ Test model saving and loading with different TP/PP """ + with TempNamedDir(tmp_path_dist_ckpt / 'test_gpt_model_reconfiguration_model_A') as ckpt_dir_A, \ + TempNamedDir(tmp_path_dist_ckpt / 'test_gpt_model_reconfiguration_model_B') as ckpt_dir_B: + # Save checkpoint A + Utils.initialize_model_parallel(*src_tp_pp, order=load_order) + gpt_model_A = initialize_model_fn(1, src_layer_spec_fn) + save_strategy = get_default_save_sharded_strategy() + if use_fpsl: + save_strategy = FullyParallelSaveStrategyWrapper( + save_strategy, + parallel_state.get_data_parallel_group(with_context_parallel=True), + True + ) + save(gpt_model_A.sharded_state_dict(), ckpt_dir_A, save_strategy) + regular_state_dict_A = gpt_model_A.state_dict() + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP and save as checkpoint B + # No FPS this time, only FPL + Utils.initialize_model_parallel(*dest_tp_pp, order=store_order) + gpt_model_B = initialize_model_fn(2, dst_layer_spec_fn) + if use_fpsl: + load_strategy = get_default_load_sharded_strategy(ckpt_dir_A) + load_strategy = FullyParallelLoadStrategyWrapper(load_strategy) + else: + load_strategy = None + state_dict, missing_keys, unexpected_keys = load(gpt_model_B.sharded_state_dict(), ckpt_dir_A, load_strategy, strict=StrictHandling.RETURN_ALL) + # Potential mismatch is because of extra states which is ok + assert all('_extra_state' in k for k in missing_keys) + assert all('_extra_state' in k for k in unexpected_keys) + gpt_model_B.load_state_dict(state_dict) + save(gpt_model_B.sharded_state_dict(), ckpt_dir_B) + regular_state_dict_B = gpt_model_A.state_dict() + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + plain_state_dict_A = load_plain_tensors(ckpt_dir_A) + plain_state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(plain_state_dict_A, plain_state_dict_B) + assert not any(map(bool, diffs)), diffs + + # Test both regular state dicts are equal, turning FP8 states to bytes first + regular_state_dict_A = {k: v for k, v in regular_state_dict_A.items() + if not k.endswith('_extra_state')} + regular_state_dict_B = {k: v for k, v in regular_state_dict_B.items() + if not k.endswith('_extra_state')} + diffs = diff(regular_state_dict_A, regular_state_dict_B) + assert not any(map(bool, diffs)), diffs + Utils.destroy_model_parallel() + + +def common_test_state_dict_comparison(initialize_model_fn, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 4) + with TempNamedDir(tmp_path_dist_ckpt / 'test_state_dict_comparison_A') as ckpt_dir_A, \ + TempNamedDir(tmp_path_dist_ckpt / 'test_state_dict_comparison_B') as ckpt_dir_B: + gpt_model_A = initialize_model_fn(1) + save(gpt_model_A.sharded_state_dict(), ckpt_dir_A) + gpt_model_B = initialize_model_fn(2) + save(gpt_model_B.sharded_state_dict(), ckpt_dir_B) + + state_dict_A = load_plain_tensors(ckpt_dir_A) + state_dict_A_dup = load_plain_tensors(ckpt_dir_A) + state_dict_B = load_plain_tensors(ckpt_dir_B) + + # Test that A matches A + diffs = diff(state_dict_A, state_dict_A_dup) + assert not any(map(bool, diffs)), diffs + + # Test that A *keys* match B *keys*, but the tensors content is different + only_left, only_right, mismatch = diff(state_dict_A, state_dict_B) + assert (not only_left and not only_right), (only_left, only_right) + assert len(mismatch) == len(state_dict_A), (len(mismatch), (len(state_dict_A))) + Utils.destroy_model_parallel() + + +def common_test_vocab_size_padding_change(initialize_model_fn, tmp_path_dist_ckpt, vocab_size_base, src_tp_pp, dest_tp_pp): + """ Test model loading with different vocab size (caused by TP padding). """ + def get_test_vocab_size(make_divisible_by=128): + divisor = make_divisible_by * parallel_state.get_tensor_model_parallel_world_size() + return int(math.ceil(vocab_size_base / divisor)) * divisor + + vocab_size_dependent_keys = { + 'output_layer.weight', + 'output_layer.bias', + 'embedding.word_embeddings.weight', + } + + with TempNamedDir(tmp_path_dist_ckpt / 'test_vocab_size_padding_change_A') as ckpt_dir_A, \ + TempNamedDir(tmp_path_dist_ckpt / 'test_vocab_size_padding_change_B') as ckpt_dir_B: + # Save checkpoint A + Utils.initialize_model_parallel(*src_tp_pp) + gpt_model_A = initialize_model_fn(1, vocab_size=get_test_vocab_size()) + save(gpt_model_A.sharded_state_dict(), ckpt_dir_A) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP and save as checkpoint B + Utils.initialize_model_parallel(*dest_tp_pp) + gpt_model_B = initialize_model_fn(2, vocab_size=get_test_vocab_size()) + state_dict = load(gpt_model_B.sharded_state_dict(), ckpt_dir_A) + gpt_model_B.load_state_dict(state_dict) + save(gpt_model_B.sharded_state_dict(), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test equality + Utils.initialize_model_parallel(1, 1) + plain_state_dict_A = load_plain_tensors(ckpt_dir_A) + plain_state_dict_B = load_plain_tensors(ckpt_dir_B) + # Test vocab size dependent keys are equal up to `vocab_size_base` + for vocab_layer_key in vocab_size_dependent_keys: + if vocab_layer_key in plain_state_dict_A: + ten_A = plain_state_dict_A.pop(vocab_layer_key) + ten_B = plain_state_dict_B.pop(vocab_layer_key) + assert torch.all(ten_A[:vocab_size_base] == ten_B[:vocab_size_base]), vocab_layer_key + + # Test other tensors are equal + diffs = diff(plain_state_dict_A, plain_state_dict_B) + assert not any(map(bool, diffs)), diffs + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/models/test_bert_model.py b/tests/unit_tests/dist_checkpointing/models/test_bert_model.py new file mode 100644 index 0000000..0748296 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_bert_model.py @@ -0,0 +1,86 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.models.bert.bert_model import BertModel +import pytest + +import os +import torch +from torch.distributed._tensor import DeviceMesh + +from megatron.core.dist_checkpointing import save, load, load_plain_tensors +from megatron.core import parallel_state as ps +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.dist_checkpointing.models.common import \ + common_test_simple_sharded_state_dict_save_load, \ + common_test_parallel_reconfiguration_e2e, common_test_state_dict_comparison, \ + common_test_vocab_size_padding_change +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.models.bert.bert_layer_specs import bert_layer_local_spec, bert_layer_with_transformer_engine_spec + + +def initialize_bert_model(seed, layer_spec_fn=bert_layer_with_transformer_engine_spec, vocab_size=128, **config_kwargs): + os.environ['NVTE_ALLOW_NONDETERMINISTIC_ALGO'] = '0' + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + layer_spec = layer_spec_fn() if callable(layer_spec_fn) else layer_spec_fn + + default_config_kwargs=dict(num_layers=8, hidden_size=16, num_attention_heads=8, use_cpu_initialization=True) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs) + pre_process = ps.is_pipeline_first_stage() + post_process = ps.is_pipeline_last_stage() + model = BertModel(config=transformer_config, transformer_layer_spec=layer_spec, vocab_size=vocab_size, max_sequence_length=4, + pre_process=pre_process, post_process=post_process, num_tokentypes=0) + + with torch.no_grad(): + for p in model.parameters(): + p.random_() + return model + + +class TestBertModel: + @pytest.mark.parametrize('src_layer_spec', [bert_layer_with_transformer_engine_spec, bert_layer_local_spec]) + @pytest.mark.parametrize('dst_layer_spec', [bert_layer_with_transformer_engine_spec, bert_layer_local_spec]) + def test_sharded_state_dict_save_load(self, tmp_path_dist_ckpt, + src_layer_spec, dst_layer_spec): + common_test_simple_sharded_state_dict_save_load(initialize_bert_model, tmp_path_dist_ckpt, + src_layer_spec, dst_layer_spec) + + +class TestBERTModelReconfiguration: + @pytest.mark.parametrize( + ('use_fpsl', 'src_tp_pp', 'dest_tp_pp', 'src_layer_spec', 'dst_layer_spec'), + [ + (False, (2, 4), (4, 2), bert_layer_with_transformer_engine_spec, bert_layer_with_transformer_engine_spec), + (False, (1, 8), (8, 1), bert_layer_with_transformer_engine_spec, bert_layer_with_transformer_engine_spec), + (True, (2, 1), (1, 8), bert_layer_with_transformer_engine_spec, bert_layer_with_transformer_engine_spec), + (False, (1, 1), (2, 2), bert_layer_with_transformer_engine_spec, bert_layer_with_transformer_engine_spec), + (True, (2, 1), (1, 8), bert_layer_local_spec, bert_layer_local_spec), + (True, (1, 1), (2, 4), bert_layer_with_transformer_engine_spec, bert_layer_local_spec), + (False, (1, 8), (2, 1), bert_layer_local_spec, bert_layer_with_transformer_engine_spec), + ] + ) + def test_parallel_reconfiguration_e2e(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, + src_layer_spec, dst_layer_spec, use_fpsl): + """ Test model saving and loading with different TP/PP """ + common_test_parallel_reconfiguration_e2e(initialize_bert_model, tmp_path_dist_ckpt, src_tp_pp, + dest_tp_pp, src_layer_spec, dst_layer_spec, use_fpsl) + + def test_state_dict_comparison(self, tmp_path_dist_ckpt): + common_test_state_dict_comparison(initialize_bert_model, tmp_path_dist_ckpt) + + @pytest.mark.parametrize("vocab_size_base,src_tp_pp,dest_tp_pp", [ + (128, (2, 4), (4, 2)), + (17, (1, 8), (8, 1)), + (127, (1, 8), (8, 1)), + (31123, (1, 1), (1, 8)), + (17, (1, 1), (1, 8)), + ]) + def test_vocab_size_padding_change(self, tmp_path_dist_ckpt, vocab_size_base, src_tp_pp, dest_tp_pp): + """ Test model loading with different vocab size (caused by TP padding). """ + common_test_vocab_size_padding_change(initialize_bert_model, tmp_path_dist_ckpt, vocab_size_base, + src_tp_pp, dest_tp_pp) diff --git a/tests/unit_tests/dist_checkpointing/models/test_gpt_model.py b/tests/unit_tests/dist_checkpointing/models/test_gpt_model.py new file mode 100644 index 0000000..0e95026 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_gpt_model.py @@ -0,0 +1,80 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import pytest + +import torch + +from megatron.core import parallel_state as ps +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from tests.unit_tests.dist_checkpointing.models.common import \ + common_test_simple_sharded_state_dict_save_load, \ + common_test_parallel_reconfiguration_e2e, \ + common_test_state_dict_comparison, common_test_vocab_size_padding_change +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.models.gpt.gpt_layer_specs import \ + get_gpt_layer_with_transformer_engine_spec as gpt_te_spec, get_gpt_layer_local_spec as gpt_local_spec + + +def initialize_gpt_model(seed, layer_spec_fn=gpt_te_spec, vocab_size=128, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + default_config_kwargs=dict(num_layers=8, hidden_size=16, num_attention_heads=8, use_cpu_initialization=True) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs) + pre_process = ps.is_pipeline_first_stage() + post_process = ps.is_pipeline_last_stage() + model = GPTModel(config=transformer_config, transformer_layer_spec=layer_spec_fn(), vocab_size=vocab_size, max_sequence_length=4, + pre_process=pre_process, post_process=post_process) + + with torch.no_grad(): + for p in model.parameters(): + p.random_() + return model + + +class TestGPTModel: + @pytest.mark.parametrize('src_layer_spec_fn', [gpt_te_spec, gpt_local_spec]) + @pytest.mark.parametrize('dst_layer_spec_fn', [gpt_te_spec, gpt_local_spec]) + def test_sharded_state_dict_save_load(self, tmp_path_dist_ckpt, + src_layer_spec_fn, dst_layer_spec_fn): + common_test_simple_sharded_state_dict_save_load(initialize_gpt_model, tmp_path_dist_ckpt, + src_layer_spec_fn, dst_layer_spec_fn) + + +class TestGPTModelReconfiguration: + @pytest.mark.parametrize( + ('use_fpsl', 'load_order', 'store_order', 'src_tp_pp', 'dest_tp_pp', 'src_layer_spec_fn', 'dst_layer_spec_fn'), + [ + (False, 'tp-dp-pp', 'tp-dp-pp', (2, 4), (4, 2), gpt_te_spec, gpt_te_spec), + (False, 'tp-pp-dp', 'tp-pp-dp', (1, 8), (8, 1), gpt_te_spec, gpt_te_spec), + (True, 'tp-dp-pp', 'tp-pp-dp', (2, 1), (1, 8), gpt_te_spec, gpt_te_spec), + (False, 'tp-dp-pp', 'tp-dp-pp', (1, 1), (2, 2), gpt_te_spec, gpt_te_spec), + (True, 'tp-pp-dp', 'tp-pp-dp', (2, 1), (1, 8), gpt_local_spec, gpt_local_spec), + (False, 'tp-dp-pp', 'tp-pp-dp', (1, 1), (2, 4), gpt_te_spec, gpt_local_spec), + (True, 'tp-dp-pp', 'tp-dp-pp', (2, 4), (4, 2), gpt_local_spec, gpt_te_spec), + (False, 'tp-pp-dp', 'tp-pp-dp', (2, 1), (1, 8), gpt_te_spec, gpt_local_spec), + (False, 'tp-dp-pp', 'tp-pp-dp', (2, 4), (2, 4), gpt_local_spec, gpt_local_spec), + ] + ) + def test_parallel_reconfiguration_e2e(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, + src_layer_spec_fn, dst_layer_spec_fn, use_fpsl, load_order, store_order): + """ Test model saving and loading with different TP/PP """ + common_test_parallel_reconfiguration_e2e(initialize_gpt_model, tmp_path_dist_ckpt, src_tp_pp, + dest_tp_pp, src_layer_spec_fn, dst_layer_spec_fn, use_fpsl, load_order, store_order) + + + def test_state_dict_comparison(self, tmp_path_dist_ckpt): + common_test_state_dict_comparison(initialize_gpt_model, tmp_path_dist_ckpt) + + @pytest.mark.parametrize("vocab_size_base,src_tp_pp,dest_tp_pp", [ + (128, (2, 4), (4, 2)), + (17, (1, 8), (8, 1)), + (127, (1, 8), (8, 1)), + (31123, (1, 1), (1, 8)), + (17, (1, 1), (1, 8)), + ]) + def test_vocab_size_padding_change(self, tmp_path_dist_ckpt, vocab_size_base, src_tp_pp, dest_tp_pp): + """ Test model loading with different vocab size (caused by TP padding). """ + common_test_vocab_size_padding_change(initialize_gpt_model, tmp_path_dist_ckpt, vocab_size_base, + src_tp_pp, dest_tp_pp) diff --git a/tests/unit_tests/dist_checkpointing/models/test_grouped_mlp.py b/tests/unit_tests/dist_checkpointing/models/test_grouped_mlp.py new file mode 100644 index 0000000..aef8640 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_grouped_mlp.py @@ -0,0 +1,165 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import save, load, load_plain_tensors +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.serialization import \ + get_default_save_sharded_strategy, get_default_load_sharded_strategy +from megatron.core.dist_checkpointing.strategies.fully_parallel import \ + FullyParallelSaveStrategyWrapper, FullyParallelLoadStrategyWrapper +from megatron.core.models.gpt.gpt_layer_specs import \ + get_gpt_layer_with_transformer_engine_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.moe.experts import GroupedMLP +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.dist_checkpointing.models.test_sequential_mlp import initialize_expert_layer +from tests.unit_tests.test_utilities import Utils + + +def initialize_grouped_mlp(seed, glu=True, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + num_moe_experts = 8 + num_local_experts = num_moe_experts // parallel_state.get_expert_model_parallel_world_size() + default_config_kwargs = dict(num_layers=pp_size, hidden_size=12, num_attention_heads=4, num_moe_experts=num_moe_experts, use_cpu_initialization=True, + gated_linear_unit=glu, add_bias_linear=False) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs) + model = GroupedMLP(num_local_experts, transformer_config) + return model + + +def get_pp_offsets(): + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + return ((0, pp_rank, pp_size),) + + +class TestGroupedMLPReconfiguration: + @pytest.mark.parametrize("use_fpsl,src_tp_pp_exp,dest_tp_pp_exp,use_glu", [ + # changing PP is impossible because the number of layers must be the same + (False, (2, 4, 1), (2, 4, 1), False), + (True, (2, 4, 1), (2, 4, 1), False), + (False, (1, 1, 1), (1, 1, 1), False), + (True, (1, 1, 1), (1, 1, 4), False), + (False, (1, 1, 8), (1, 1, 2), False), + (False, (2, 2, 2), (4, 2, 1), False), + (True, (1, 1, 4), (8, 1, 1), False), + (False, (1, 8, 1), (1, 8, 1), False), + (False, (1, 1, 4), (2, 1, 1), False), + (False, (1, 1, 1), (1, 1, 1), True), + (False, (1, 1, 1), (1, 1, 4), True), + (True, (1, 1, 1), (2, 1, 1), True), + (False, (1, 1, 4), (8, 1, 1), True), + (True, (2, 1, 4), (1, 1, 8), True), + (False, (2, 1, 4), (1, 1, 8), True), + ]) + def test_parallel_reconfiguration_e2e(self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, use_fpsl): + """ Test model saving and loading with different TP/PP/expert parallelism """ + src_tp, src_pp, src_exp = src_tp_pp_exp + dest_tp, dest_pp, dest_exp = dest_tp_pp_exp + with TempNamedDir(tmp_path_dist_ckpt / 'test_grouped_mlp_reconfiguration_model_A') as ckpt_dir_A, \ + TempNamedDir(tmp_path_dist_ckpt / 'test_grouped_mlp_reconfiguration_model_B') as ckpt_dir_B: + # Save checkpoint A + Utils.initialize_model_parallel(src_tp, src_pp, expert_model_parallel_size=src_exp) + model_A = initialize_grouped_mlp(1, use_glu) + sharded_state_dict = model_A.sharded_state_dict(sharded_offsets=get_pp_offsets()) + + save_strategy = get_default_save_sharded_strategy() + if use_fpsl: + save_strategy = FullyParallelSaveStrategyWrapper( + save_strategy, + parallel_state.get_data_parallel_group(with_context_parallel=True), + True + ) + save(sharded_state_dict, ckpt_dir_A, save_strategy) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP/expert and save as checkpoint B + # No FPS this time, only FPL + Utils.initialize_model_parallel(dest_tp, dest_pp, expert_model_parallel_size=dest_exp) + model_B = initialize_grouped_mlp(2, use_glu) + if use_fpsl: + load_strategy = get_default_load_sharded_strategy(ckpt_dir_A) + load_strategy = FullyParallelLoadStrategyWrapper(load_strategy, + parallel_state.get_data_parallel_group(with_context_parallel=True)) + else: + load_strategy = None + state_dict = load(model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_A, load_strategy) + model_B.load_state_dict(state_dict) + save(model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + state_dict_A = load_plain_tensors(ckpt_dir_A) + state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(state_dict_A, state_dict_B) + assert not any(map(bool, diffs)), diffs + Utils.destroy_model_parallel() + + @pytest.mark.parametrize("src_module,src_tp_pp_exp,dest_tp_pp_exp,use_glu", [ + # changing PP is impossible because the number of layers must be the same + ('sequential', (2, 4, 1), (2, 4, 1), False), + ('sequential', (1, 1, 1), (1, 1, 4), False), + ('sequential', (2, 2, 2), (4, 2, 1), False), + ('sequential', (1, 1, 4), (8, 1, 1), False), + ('sequential', (2, 1, 4), (1, 1, 8), False), + ('sequential', (2, 4, 1), (2, 4, 1), True), + ('sequential', (1, 1, 1), (1, 1, 4), True), + ('sequential', (2, 2, 2), (4, 2, 1), True), + ('sequential', (1, 1, 4), (8, 1, 1), True), + ('sequential', (2, 1, 4), (1, 1, 8), True), + ('grouped', (2, 4, 1), (2, 4, 1), False), + ('grouped', (1, 1, 1), (1, 1, 4), False), + ('grouped', (2, 2, 2), (4, 2, 1), False), + ('grouped', (1, 1, 4), (8, 1, 1), False), + ('grouped', (2, 1, 4), (1, 1, 8), False), + ('grouped', (2, 4, 1), (2, 4, 1), True), + ('grouped', (1, 1, 1), (1, 1, 4), True), + ('grouped', (2, 2, 2), (4, 2, 1), True), + ('grouped', (1, 1, 4), (8, 1, 1), True), + ('grouped', (2, 1, 4), (1, 1, 8), True), + ]) + def test_sequential_grouped_mlp_interchangeable(self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, src_module): + """ Test model saving and loading with different TP/PP/expert parallelism """ + src_tp, src_pp, src_exp = src_tp_pp_exp + dest_tp, dest_pp, dest_exp = dest_tp_pp_exp + with TempNamedDir(tmp_path_dist_ckpt / 'test_sequential_grouped_mlp_interchangeable_model_A') as ckpt_dir_A, \ + TempNamedDir(tmp_path_dist_ckpt / 'test_sequential_grouped_mlp_interchangeable_model_B') as ckpt_dir_B: + # Save checkpoint A + Utils.initialize_model_parallel(src_tp, src_pp, expert_model_parallel_size=src_exp) + if src_module == 'sequential': + model_A = initialize_expert_layer(1, use_glu, add_bias_linear=False, moe_grouped_gemm=False) + else: + model_A = initialize_grouped_mlp(1, use_glu) + sharded_state_dict = model_A.sharded_state_dict(sharded_offsets=get_pp_offsets()) + + save_strategy = get_default_save_sharded_strategy() + save(sharded_state_dict, ckpt_dir_A, save_strategy) + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(dest_tp, dest_pp, expert_model_parallel_size=dest_exp) + if src_module == 'sequential': + model_B = initialize_grouped_mlp(1, use_glu) + else: + model_B = initialize_expert_layer(1, use_glu, add_bias_linear=False, moe_grouped_gemm=False) + load_strategy = None + state_dict = load(model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_A, load_strategy) + model_B.load_state_dict(state_dict) + save(model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + state_dict_A = load_plain_tensors(ckpt_dir_A) + state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(state_dict_A, state_dict_B) + assert not any(map(bool, diffs)), diffs + Utils.destroy_model_parallel() \ No newline at end of file diff --git a/tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py b/tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py new file mode 100644 index 0000000..16243a5 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch +from torch.optim import Adam + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.dict_utils import diff, nested_values +from megatron.core.dist_checkpointing.optimizer import \ + get_param_id_to_sharded_param_map, optim_state_to_sharding_state +from megatron.core.transformer.mlp import MLP +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils +from megatron.core.dist_checkpointing import save, load, load_plain_tensors, \ + ShardedTensor +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + + +def initialize_mlp(glu=True): + model_parallel_cuda_manual_seed(123) + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + transformer_config = TransformerConfig(num_layers=pp_size, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True, + gated_linear_unit=glu) + return MLP(transformer_config, get_gpt_layer_with_transformer_engine_spec().submodules.mlp.submodules) + + +def get_pp_offsets(): + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + return ((0, pp_rank, pp_size),) + + +class TestParallelMLPWithGLU: + @pytest.mark.parametrize("src_tp_pp,dest_tp_pp", [ + # changing PP is impossible because the number of layers must be the same + ((2, 2), (4, 2)), + ((1, 1), (8, 1)), + ((1, 8), (1, 8)), + ((1, 1), (2, 1)), + ]) + def test_parallel_reconfiguration_e2e(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp): + """ Test module saving and loading with different TP/PP """ + with TempNamedDir(tmp_path_dist_ckpt / 'test_mlp_glu_reconfiguration_model_A') as ckpt_dir_A, \ + TempNamedDir(tmp_path_dist_ckpt / 'test_mlp_glu_reconfiguration_model_B') as ckpt_dir_B: + # Save checkpoint A + Utils.initialize_model_parallel(*src_tp_pp) + mlp_A = initialize_mlp() + save(mlp_A.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_A) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP and save as checkpoint B + Utils.initialize_model_parallel(*dest_tp_pp) + mlp_B = initialize_mlp() + state_dict = load(mlp_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_A) + mlp_B.load_state_dict(state_dict) + save(mlp_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + state_dict_A = load_plain_tensors(ckpt_dir_A) + state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(state_dict_A, state_dict_B) + assert not any(map(bool, diffs)), diffs diff --git a/tests/unit_tests/dist_checkpointing/models/test_retro_model.py b/tests/unit_tests/dist_checkpointing/models/test_retro_model.py new file mode 100644 index 0000000..be2f9ba --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_retro_model.py @@ -0,0 +1,76 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import types + +import pytest + +import torch + +from megatron.core.dist_checkpointing import save, load, load_plain_tensors +from megatron.core import parallel_state as ps +from megatron.core.dist_checkpointing.validation import StrictHandling +from megatron.core.models.retro import get_retro_decoder_block_spec, RetroConfig, RetroModel +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed + + +def initialize_retro_model(seed, decoder_spec_fn, spec_type, num_layers=9, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + default_config_kwargs=dict( + num_layers=num_layers, + hidden_size=16, + num_attention_heads=12, + kv_channels=64, + ffn_hidden_size=64, + use_cpu_initialization=True, + retro_num_neighbors=2, + retro_chunk_length=4, + retro_retrieved_length=8, + retro_split_preprocessing="98,2,0", + ) + default_config_kwargs.update(**config_kwargs) + retro_config = RetroConfig(**default_config_kwargs) + pre_process = ps.is_pipeline_first_stage() + post_process = ps.is_pipeline_last_stage() + + + de_block_spec = decoder_spec_fn(retro_config, use_transformer_engine=True if spec_type=="te" else False) + model = RetroModel(config=retro_config, transformer_layer_spec=de_block_spec, + pre_process=pre_process, post_process=post_process, + vocab_size=29184, max_sequence_length=4) + + with torch.no_grad(): + for p in model.parameters(): + p.random_() + return model + + +class TestRetroModel: + @pytest.mark.parametrize('src_spec_type', ['te', 'local']) + @pytest.mark.parametrize('dst_spec_type', ['te', 'local']) + @pytest.mark.parametrize('model_type', ['retro']) + def test_sharded_state_dict_save_load(self, tmp_path_dist_ckpt, src_spec_type, dst_spec_type, model_type): + decoder_spec_fn = get_retro_decoder_block_spec + + Utils.initialize_model_parallel(1, 1) + gpt_model = initialize_retro_model(2, decoder_spec_fn, src_spec_type) + with TempNamedDir(tmp_path_dist_ckpt / 'test_gpt_model') as ckpt_dir: + # Save + sharded_state_dict = gpt_model.sharded_state_dict() + save(sharded_state_dict, ckpt_dir) + + # Load + gpt_model = initialize_retro_model(2, decoder_spec_fn, dst_spec_type) + sharded_state_dict = gpt_model.sharded_state_dict() + + state_dict, missing_keys, unexpected_keys = load(sharded_state_dict, ckpt_dir, strict=StrictHandling.RETURN_ALL) + # Potential mismatch is because of extra states which is ok + assert all('_extra_state' in k for k in missing_keys) + assert all('_extra_state' in k for k in unexpected_keys) + gpt_model.load_state_dict(state_dict) + gpt_model.load_state_dict(state_dict) + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py b/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py new file mode 100644 index 0000000..f98d503 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py @@ -0,0 +1,215 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +from pkg_resources import packaging +from importlib.metadata import version +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import save, load, load_plain_tensors +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.serialization import ( + get_default_save_sharded_strategy, + get_default_load_sharded_strategy, +) +from megatron.core.dist_checkpointing.strategies.fully_parallel import ( + FullyParallelSaveStrategyWrapper, + FullyParallelLoadStrategyWrapper, +) +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.moe.experts import SequentialMLP, TEGroupedMLP +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + +_te_version = packaging.version.Version(version("transformer-engine")) + +def initialize_expert_layer(seed, glu=True, moe_grouped_gemm=False, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + num_moe_experts = 8 + num_local_experts = num_moe_experts // parallel_state.get_expert_model_parallel_world_size() + default_config_kwargs = dict( + num_layers=pp_size, + hidden_size=12, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + use_cpu_initialization=True, + gated_linear_unit=glu, + ) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, moe_grouped_gemm=moe_grouped_gemm + ) + if moe_grouped_gemm: + model = TEGroupedMLP( + num_local_experts, transformer_config, transformer_layer_spec.submodules.mlp.submodules + ) + else: + model = SequentialMLP( + num_local_experts, transformer_config, transformer_layer_spec.submodules.mlp.submodules + ) + return model + + +def get_pp_offsets(): + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + return ((0, pp_rank, pp_size),) + +moe_grouped_gemm_options = [False] +if _te_version >= packaging.version.Version("1.9.0.dev0"): + moe_grouped_gemm_options.append(True) + +class TestExpertLayerReconfiguration: + @pytest.mark.parametrize( + "use_fpsl,src_tp_pp_exp,dest_tp_pp_exp,use_glu", + [ + # changing PP is impossible because the number of layers must be the same + (False, (2, 4, 1), (2, 4, 1), False), + (True, (2, 4, 1), (2, 4, 1), False), + (False, (1, 1, 1), (1, 1, 1), False), + (True, (1, 1, 1), (1, 1, 4), False), + (False, (1, 1, 8), (1, 1, 2), False), + (False, (2, 2, 2), (4, 2, 1), False), + (True, (1, 1, 4), (8, 1, 1), False), + (False, (1, 8, 1), (1, 8, 1), False), + (False, (1, 1, 4), (2, 1, 1), False), + (False, (1, 1, 1), (1, 1, 1), True), + (False, (1, 1, 1), (1, 1, 4), True), + (True, (1, 1, 1), (2, 1, 1), True), + (False, (1, 1, 4), (8, 1, 1), True), + ], + ) + @pytest.mark.parametrize("moe_grouped_gemm", moe_grouped_gemm_options) + def test_parallel_reconfiguration_e2e( + self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, use_fpsl, moe_grouped_gemm + ): + """ Test model saving and loading with different TP/PP/expert parallelism """ + src_tp, src_pp, src_exp = src_tp_pp_exp + dest_tp, dest_pp, dest_exp = dest_tp_pp_exp + with TempNamedDir( + tmp_path_dist_ckpt / 'test_expert_layer_reconfiguration_model_A' + ) as ckpt_dir_A, TempNamedDir( + tmp_path_dist_ckpt / 'test_expert_layer_reconfiguration_model_B' + ) as ckpt_dir_B: + # Save checkpoint A + Utils.initialize_model_parallel(src_tp, src_pp, expert_model_parallel_size=src_exp) + model_A = initialize_expert_layer(1, use_glu, moe_grouped_gemm) + sharded_state_dict = model_A.sharded_state_dict(sharded_offsets=get_pp_offsets()) + + save_strategy = get_default_save_sharded_strategy() + if use_fpsl: + save_strategy = FullyParallelSaveStrategyWrapper( + save_strategy, + parallel_state.get_data_parallel_group(with_context_parallel=True), + True, + ) + save(sharded_state_dict, ckpt_dir_A, save_strategy) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP/expert and save as checkpoint B + # No FPS this time, only FPL + Utils.initialize_model_parallel(dest_tp, dest_pp, expert_model_parallel_size=dest_exp) + model_B = initialize_expert_layer(1, use_glu, moe_grouped_gemm) + if use_fpsl: + load_strategy = get_default_load_sharded_strategy(ckpt_dir_A) + load_strategy = FullyParallelLoadStrategyWrapper( + load_strategy, + parallel_state.get_data_parallel_group(with_context_parallel=True), + ) + else: + load_strategy = None + state_dict = load( + model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), + ckpt_dir_A, + load_strategy, + ) + model_B.load_state_dict(state_dict) + save(model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + state_dict_A = load_plain_tensors(ckpt_dir_A) + state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(state_dict_A, state_dict_B) + assert not any(map(bool, diffs)), diffs + + @pytest.mark.skipif( + _te_version < packaging.version.Version("1.9.0.dev0"), + reason="TE Grouped MLP is only supported in TE 1.9.0.dev0 and later.", + ) + @pytest.mark.parametrize( + "src_module,src_tp_pp_exp,dest_tp_pp_exp,use_glu", + [ + # changing PP is impossible because the number of layers must be the same + ('sequential', (2, 4, 1), (2, 4, 1), False), + ('sequential', (1, 1, 1), (1, 1, 4), False), + ('sequential', (2, 2, 2), (4, 2, 1), False), + ('sequential', (1, 1, 4), (8, 1, 1), False), + ('sequential', (2, 1, 4), (1, 1, 8), False), + ('sequential', (2, 4, 1), (2, 4, 1), True), + ('sequential', (1, 1, 1), (1, 1, 4), True), + ('sequential', (2, 2, 2), (4, 2, 1), True), + ('sequential', (1, 1, 4), (8, 1, 1), True), + ('sequential', (2, 1, 4), (1, 1, 8), True), + ('grouped', (2, 4, 1), (2, 4, 1), False), + ('grouped', (1, 1, 1), (1, 1, 4), False), + ('grouped', (2, 2, 2), (4, 2, 1), False), + ('grouped', (1, 1, 4), (8, 1, 1), False), + ('grouped', (2, 1, 4), (1, 1, 8), False), + ('grouped', (2, 4, 1), (2, 4, 1), True), + ('grouped', (1, 1, 1), (1, 1, 4), True), + ('grouped', (2, 2, 2), (4, 2, 1), True), + ('grouped', (1, 1, 4), (8, 1, 1), True), + ('grouped', (2, 1, 4), (1, 1, 8), True), + ], + ) + def test_sequential_grouped_mlp_interchangeable( + self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, src_module + ): + """ Test model saving and loading with different TP/PP/expert parallelism """ + src_tp, src_pp, src_exp = src_tp_pp_exp + dest_tp, dest_pp, dest_exp = dest_tp_pp_exp + with TempNamedDir( + tmp_path_dist_ckpt / 'test_sequential_grouped_mlp_interchangeable_model_A' + ) as ckpt_dir_A, TempNamedDir( + tmp_path_dist_ckpt / 'test_sequential_grouped_mlp_interchangeable_model_B' + ) as ckpt_dir_B: + # Save checkpoint A + Utils.initialize_model_parallel(src_tp, src_pp, expert_model_parallel_size=src_exp) + model_A = initialize_expert_layer( + 1, use_glu, moe_grouped_gemm=src_module != 'sequential' + ) + sharded_state_dict = model_A.sharded_state_dict(sharded_offsets=get_pp_offsets()) + + save_strategy = get_default_save_sharded_strategy() + save(sharded_state_dict, ckpt_dir_A, save_strategy) + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(dest_tp, dest_pp, expert_model_parallel_size=dest_exp) + model_B = initialize_expert_layer( + 1, use_glu, moe_grouped_gemm=src_module == 'sequential' + ) + load_strategy = None + state_dict = load( + model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), + ckpt_dir_A, + load_strategy, + ) + model_B.load_state_dict(state_dict) + save(model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + state_dict_A = load_plain_tensors(ckpt_dir_A) + state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(state_dict_A, state_dict_B) + assert not any(map(bool, diffs)), diffs + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/models/test_t5_model.py b/tests/unit_tests/dist_checkpointing/models/test_t5_model.py new file mode 100644 index 0000000..c2db5e6 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_t5_model.py @@ -0,0 +1,85 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.dist_checkpointing import save, load, load_plain_tensors +from megatron.core import parallel_state as ps +from megatron.core.dist_checkpointing.validation import StrictHandling +from megatron.core.models.T5 import T5Model +from megatron.core.models.T5.t5_spec import \ + encoder_model_with_transformer_engine_default_spec as t5_encoder_te_spec, \ + decoder_model_with_transformer_engine_default_spec as t5_decoder_te_spec, \ + encoder_model_with_local_spec as t5_encoder_local_spec, \ + decoder_model_with_local_spec as t5_decoder_local_spec +from megatron.core.models.retro.decoder_spec import \ + get_retro_decoder_layer_te_spec, get_retro_decoder_layer_local_spec +from megatron.core.models.retro.encoder_spec import \ + get_retro_encoder_layer_te_spec, get_retro_encoder_layer_local_spec +from megatron.core.transformer.transformer_block import \ + TransformerBlockSubmodules +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed + + +def initialize_t5_model(seed, encoder_spec_fn, decoder_spec_fn, num_layers=2, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + default_config_kwargs=dict(num_layers=num_layers, hidden_size=16, num_attention_heads=12, kv_channels=64, ffn_hidden_size=64, use_cpu_initialization=True) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs) + pre_process = ps.is_pipeline_first_stage() + post_process = ps.is_pipeline_last_stage() + + en_block_spec = TransformerBlockSubmodules([encoder_spec_fn()] * num_layers) + de_block_spec = TransformerBlockSubmodules([decoder_spec_fn()] * num_layers) + model = T5Model(config=transformer_config, transformer_encoder_layer_spec=en_block_spec, transformer_decoder_layer_spec=de_block_spec, + pre_process=False, post_process=False, + vocab_size=29184, max_sequence_length=4) + + with torch.no_grad(): + for p in model.parameters(): + p.random_() + return model + + +class TestT5Model: + @pytest.mark.parametrize('src_spec_type', ['te', 'local']) + @pytest.mark.parametrize('dst_spec_type', ['te', 'local']) + @pytest.mark.parametrize('model_type', ['t5']) + def test_sharded_state_dict_save_load(self, tmp_path_dist_ckpt, src_spec_type, dst_spec_type, model_type): + enc_dec_spec_fn = { + 'te': { + 't5': (t5_encoder_te_spec, t5_decoder_te_spec), + 'retro': (get_retro_encoder_layer_te_spec, get_retro_decoder_layer_te_spec), + }, + 'local': { + 't5': (t5_encoder_local_spec, t5_decoder_local_spec), + 'retro': (get_retro_encoder_layer_local_spec, get_retro_decoder_layer_local_spec), + } + } + src_encoder_spec_fn, src_decoder_spec_fn = enc_dec_spec_fn[src_spec_type][model_type] + dst_encoder_spec_fn, dst_decoder_spec_fn = enc_dec_spec_fn[dst_spec_type][model_type] + + Utils.initialize_model_parallel(1, 1) + gpt_model = initialize_t5_model(1, src_encoder_spec_fn, src_decoder_spec_fn) + with TempNamedDir(tmp_path_dist_ckpt / 'test_gpt_model') as ckpt_dir: + # Save + sharded_state_dict = gpt_model.sharded_state_dict() + save(sharded_state_dict, ckpt_dir) + + # Load + gpt_model = initialize_t5_model(2, dst_encoder_spec_fn, dst_decoder_spec_fn) + sharded_state_dict = gpt_model.sharded_state_dict() + + state_dict, missing_keys, unexpected_keys = load(sharded_state_dict, ckpt_dir, strict=StrictHandling.RETURN_ALL) + # Potential mismatch is because of extra states which is ok + assert all('_extra_state' in k for k in missing_keys) + assert all('_extra_state' in k for k in unexpected_keys) + gpt_model.load_state_dict(state_dict) + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/test_async_save.py b/tests/unit_tests/dist_checkpointing/test_async_save.py new file mode 100644 index 0000000..fb73a96 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_async_save.py @@ -0,0 +1,97 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from unittest import mock + +import pytest +import torch + +from megatron.core.dist_checkpointing import ShardedTensor, load, save +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.strategies.async_utils import AsyncCallsQueue +from megatron.core.dist_checkpointing.strategies.filesystem_async import FileSystemWriterAsync +from megatron.core.dist_checkpointing.strategies.torch import TorchDistSaveShardedStrategy +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + + +def write_data_os_err_mock_fn(local_proc_idx, write_bucket, results_queue, count_queue, use_fsync): + """Raises an error on worker #2 during storage save""" + try: + if local_proc_idx == 2: + raise OSError('worker #2 critical failure') + output = (local_proc_idx, []) + except Exception as e: + output = (local_proc_idx, e) + results_queue.put(output) + count_queue.get() + count_queue.task_done() + + +class TestAsyncSave: + def test_async_is_equivalent_to_sync(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 4) + + sharded_state_dict = { + 'sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', torch.ones(2, 4), replica_id=Utils.rank + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', torch.ones(3, 5, 7), replica_id=Utils.world_size - Utils.rank - 1 + ), + } + + with TempNamedDir( + tmp_path_dist_ckpt / 'test_equivalence_async' + ) as async_ckpt_dir, TempNamedDir( + tmp_path_dist_ckpt / 'test_equivalence_sync' + ) as sync_ckpt_dir: + # async + async_calls = AsyncCallsQueue() + async_request = save(sharded_state_dict, async_ckpt_dir, async_sharded_save=True) + async_calls.schedule_async_request(async_request) + + # sync + save(sharded_state_dict, sync_ckpt_dir, async_sharded_save=False) + + # finalize async + async_calls.maybe_finalize_async_calls(blocking=True) + + # load and compare + loaded_async_state_dict = load(sharded_state_dict, async_ckpt_dir) + loaded_sync_state_dict = load(sharded_state_dict, sync_ckpt_dir) + diffs = diff(loaded_async_state_dict, loaded_sync_state_dict) + assert not any(map(bool, diffs)), diffs + + Utils.destroy_model_parallel() + + @pytest.mark.parametrize('async_save', [False, True]) + @pytest.mark.parametrize('worker_fn', [write_data_os_err_mock_fn]) + def test_errors_are_reported(self, tmp_path_dist_ckpt, async_save, worker_fn): + Utils.initialize_model_parallel(2, 4) + sharded_state_dict = { + f'key{i}': ShardedTensor.from_rank_offsets(f'key{i}_rank{Utils.rank}', torch.ones(2, 4)) + for i in range(4) # make sure there is enough non-empty saving workers + } + + with TempNamedDir(tmp_path_dist_ckpt / 'test_errors_are_reported') as ckpt_dir: + async_calls = AsyncCallsQueue() + save_strategy = TorchDistSaveShardedStrategy('torch_dist', 1, thread_count=8) + + try: + orig_fn = FileSystemWriterAsync.write_preloaded_data + FileSystemWriterAsync.write_preloaded_data = worker_fn + with pytest.raises(RuntimeError) as exc_info: + if async_save: + async_request = save( + sharded_state_dict, ckpt_dir, save_strategy, async_sharded_save=True + ) + async_calls.schedule_async_request(async_request) + async_calls.maybe_finalize_async_calls(blocking=True) + else: + save(sharded_state_dict, ckpt_dir, save_strategy) + assert 'Worker failure' in str(exc_info.value) + + finally: + FileSystemWriterAsync.write_preloaded_data = orig_fn + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/test_cached_metadata.py b/tests/unit_tests/dist_checkpointing/test_cached_metadata.py new file mode 100644 index 0000000..c933a3a --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_cached_metadata.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import pickle +from copy import deepcopy + +from dataclasses import fields + +import torch + +from megatron.core.dist_checkpointing import ShardedTensor, load, save +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.serialization import get_default_save_sharded_strategy +from megatron.core.dist_checkpointing.strategies.async_utils import AsyncCallsQueue +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +class TestCachedMetadata: + def test_cached_metadata(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 4) + + sharded_state_dict_non_cached = { + 'sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', torch.ones(2, 4), replica_id=Utils.rank + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', torch.ones(3, 5, 7), replica_id=Utils.world_size - Utils.rank - 1 + ), + } + + sharded_state_dict_cached = { + 'sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', torch.ones(2, 4), replica_id=Utils.rank + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', torch.ones(3, 5, 7), replica_id=Utils.world_size - Utils.rank - 1 + ), + } + + loaded_non_cached, loaded_cached = None, None + md_non_cached, md_cached = None, None + with TempNamedDir(tmp_path_dist_ckpt / 'ckpt_dir') as ckpt_dir: + save(sharded_state_dict_non_cached, ckpt_dir, async_sharded_save=False) + loaded_non_cached = load(sharded_state_dict_non_cached, ckpt_dir) + md_path = ckpt_dir / '.metadata' + with md_path.open('rb') as f: + md_non_cached = pickle.load(f) + + save_strategy = deepcopy(get_default_save_sharded_strategy()) + save_strategy.use_cached_ckpt_structure = True + # Run over 3 iterations with cached metadata enabled + # The 3rd iteration will run with cached metadata + # `ckpt_dir` at the 3rd iteration 2 will be maintained for comparison + ckpt_dir = None + for i in range(3): + ckpt_dir = TempNamedDir(tmp_path_dist_ckpt / f'ckpt_dir_${i}_cached') + save( + sharded_state_dict_cached, + ckpt_dir.__enter__(), + save_strategy, + async_sharded_save=False, + ) + if i < 2: + ckpt_dir.cleanup() + loaded_cached = load(sharded_state_dict_cached, ckpt_dir.__enter__()) + md_path = ckpt_dir.__enter__() / '.metadata' + + with md_path.open('rb') as f: + md_cached = pickle.load(f) + + # Check loaded state dict + diffs = diff(loaded_non_cached, loaded_cached) + + assert not any( + len(x) for x in diffs + ), 'Cached metadata doesn\'t produce the same state_dict in loading' + # Check metadata recorded in .metadata, torch.distributed.metadata.Metadata + for field in fields(md_non_cached): + if field.name not in ['storage_data', 'storage_meta']: + diffs = diff(getattr(md_non_cached, field.name), getattr(md_cached, field.name)) + assert not any( + len(x) for x in diffs + ), f'{field.name} is different in metadata from non-cached, cached metadata impls' + ckpt_dir.cleanup() + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py b/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py new file mode 100644 index 0000000..3d131da --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py @@ -0,0 +1,190 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import io + +import numpy as np +import pytest +import torch +from torch.distributed.checkpoint import CheckpointException + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ShardedTensor, save, load +from megatron.core.dist_checkpointing.core import CheckpointingException, \ + maybe_load_config +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.mapping import ShardedTensorFactory, \ + ShardedObject +from megatron.core.dist_checkpointing.serialization import load_tensors_metadata +from megatron.core.dist_checkpointing.strategies.resharding import \ + apply_nd_flattened_tensors_reformulation, restore_nd_flattened_tensors_formulation +from megatron.core.dist_checkpointing.strategies.torch import \ + get_reformulation_metadata + +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +class TestFlattenedResharding: + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp',), + [ + ((2, 4), (2, 4)), + ((2, 4), (2, 2)), + ((2, 4), (4, 2)), + ((8, 1), (1, 2)), + ] + ) + def test_partition_change_save_load(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp): + with TempNamedDir(tmp_path_dist_ckpt / 'test_flattened_partition_change_save_load') as ckpt_dir: + Utils.initialize_model_parallel(*src_tp_pp) + state_dict = self._build_state_dict() + + save(state_dict, ckpt_dir) + + # change TPxPP + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(*dest_tp_pp) + loaded_state_dict = load(self._build_state_dict(random=True), ckpt_dir) + expected_state_dict = {k: v.data for k, v in self._build_state_dict().items()} + + diffs = diff(expected_state_dict, loaded_state_dict) + assert not any(diffs), diffs + + Utils.destroy_model_parallel() + + + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp', 'expected_ckpt_offsets_by_rank'), + [ + ((2, 4), (2, 2), { + 0: [(0, 0, 0), (0, 0, 10)], # TP 0, DP 0, PP 0 + 1: [(4, 0, 0), (4, 0, 10)], # TP 1, DP 0, PP 0 + 2: [(0, 0, 0), (0, 0, 10)], # TP 0, DP 1, PP 0 + 3: [(4, 0, 0), (4, 0, 10)], # TP 1, DP 1, PP 0 + 4: [(0, 0, 20), (0, 0, 30)], # TP 0, DP 0, PP 1 + 5: [(4, 0, 20), (4, 0, 30)], # TP 1, DP 0, PP 1 + 6: [(0, 0, 20), (0, 0, 30)], # TP 0, DP 1, PP 1 + 7: [(4, 0, 20), (4, 0, 30)], # TP 1, DP 1, PP 1 + }), + ((8, 1), (1, 2), { + rank: [(tp, 0, 0) for tp in range(8)] + for rank in range(8) + }) + ] + ) + def test_reformulate_nd_flattened_tensors(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, expected_ckpt_offsets_by_rank): + with TempNamedDir(tmp_path_dist_ckpt / 'test_reformulate_nd_flattened_tensors') as ckpt_dir: + Utils.initialize_model_parallel(*src_tp_pp, order='tp-dp-pp') + state_dict = self._build_state_dict() + + ckpt_local_shape = state_dict['sd_key_flat'].local_shape + + save(state_dict, ckpt_dir) + + # change TPxPP + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(*dest_tp_pp, order='tp-dp-pp') + load_state_dict = self._build_state_dict(random=True) + + reformulation_metadata = get_reformulation_metadata(load_state_dict, ckpt_dir) + reformulated_state_dict, formulation_restore_data = apply_nd_flattened_tensors_reformulation(load_state_dict, reformulation_metadata) + assert isinstance(reformulated_state_dict['sd_key_unflat'], ShardedTensor) + assert isinstance(reformulated_state_dict['sd_key_flat'], dict) + + assert reformulated_state_dict['sd_key_flat'].keys() == set((offset, ckpt_local_shape) for offset in expected_ckpt_offsets_by_rank[Utils.rank]), \ + (reformulated_state_dict['sd_key_flat'].keys(), ckpt_local_shape, expected_ckpt_offsets_by_rank[Utils.rank]) + + # We can even load the reformulated state dict with a high-level API + loaded_state_dict = load(reformulated_state_dict, ckpt_dir, validate_access_integrity=False) + loaded_state_dict = restore_nd_flattened_tensors_formulation(loaded_state_dict, formulation_restore_data) + expected_state_dict = {k: v.data for k, v in self._build_state_dict().items()} + diffs = diff(expected_state_dict, loaded_state_dict) + assert not any(diffs), diffs + + Utils.destroy_model_parallel() + + + @pytest.mark.parametrize( + ('src_tp_pp',), + [ + ((2, 4),), + ((8, 1),), + ((1, 1),), + ((1, 4),), + ] + ) + def test_load_tensor_metadata(self, tmp_path_dist_ckpt, src_tp_pp): + with TempNamedDir(tmp_path_dist_ckpt / 'test_reformulate_nd_flattened_tensors') as ckpt_dir: + Utils.initialize_model_parallel(*src_tp_pp, order='tp-dp-pp') + state_dict = self._build_state_dict() + + save(state_dict, ckpt_dir) + + # change TPxPP + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(1, 1) + + sharded_metadata = load_tensors_metadata(ckpt_dir) + + for attr_name in ('local_shape', 'global_shape'): + flat_val = getattr(sharded_metadata['flat'], attr_name) + unflat_val = getattr(sharded_metadata['unflat'], attr_name) + assert flat_val == unflat_val, (attr_name, flat_val, unflat_val) + + for sh_ten in sharded_metadata.values(): + sh_ten.replica_id = Utils.rank + loaded_state_dict = load(sharded_metadata, ckpt_dir) + assert torch.all(loaded_state_dict['unflat'] == torch.arange(8 * 5 * 40).reshape(8, 5, 40)) + assert torch.all(loaded_state_dict['flat'] == torch.arange(8 * 5 * 40)) + + Utils.destroy_model_parallel() + + def _build_state_dict(self, random=False): + tp_rank = parallel_state.get_tensor_model_parallel_rank() + tp_size = parallel_state.get_tensor_model_parallel_world_size() + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + dp_rank = parallel_state.get_data_parallel_rank() + dp_size = parallel_state.get_data_parallel_world_size() + + init_fn = torch.rand if random else torch.arange + global_ten = init_fn(8 * 5 * 40).reshape(8, 5, 40) + local_ten = global_ten + local_ten = local_ten.chunk(tp_size, dim=0)[tp_rank] + local_ten = local_ten.chunk(pp_size, dim=2)[pp_rank] + assert local_ten.shape == (8 // tp_size, 5, 40 // pp_size) + + local_ten_size_by_dp = local_ten.numel() + assert local_ten_size_by_dp % dp_size == 0, (local_ten_size_by_dp, dp_size) + local_ten_size_by_dp = local_ten_size_by_dp // dp_size + # make a bit shifted DP slices so that they are not equal + start_jitter = dp_rank + end_jitter = dp_rank + 1 if dp_rank + 1 < dp_size else 0 + local_dp_slice = slice( + local_ten_size_by_dp * dp_rank + start_jitter, + local_ten_size_by_dp * (dp_rank + 1) + end_jitter + ) + local_flat_ten = local_ten.flatten()[local_dp_slice] + if dp_rank == dp_size - 1: + assert local_flat_ten.numel() == local_ten_size_by_dp - dp_rank + else: + assert local_flat_ten.numel() == local_ten_size_by_dp + 1 + + state_dict = { + 'sd_key_unflat': ShardedTensor.from_rank_offsets( + 'unflat', + local_ten, + (0, tp_rank, tp_size), + (2, pp_rank, pp_size), + replica_id=dp_rank, + ), + 'sd_key_flat': ShardedTensor.from_rank_offsets_flat( + 'flat', + local_flat_ten, + local_ten.shape, + (0, tp_rank, tp_size), + (2, pp_rank, pp_size), + flattened_range=local_dp_slice + ), + } + return state_dict diff --git a/tests/unit_tests/dist_checkpointing/test_fully_parallel.py b/tests/unit_tests/dist_checkpointing/test_fully_parallel.py new file mode 100644 index 0000000..7a0984e --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_fully_parallel.py @@ -0,0 +1,220 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from pathlib import Path +from typing import Dict + +import numpy as np +import pytest + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.dict_utils import nested_values, \ + map_reduce, dict_list_map_outplace +from megatron.core.dist_checkpointing.mapping import is_main_replica +from megatron.core.dist_checkpointing.strategies.base import \ + SaveShardedStrategy, LoadShardedStrategy +from megatron.core.dist_checkpointing.strategies.fully_parallel import \ + FullyParallelSaveStrategyWrapper, _sharded_tensor_shard_id, \ + FullyParallelLoadStrategyWrapper, _ShardId +from tests.unit_tests.test_utilities import Utils + + +class MockSaveStrategy(SaveShardedStrategy): + def __init__(self): + super().__init__('mock', 1) + self.save_keys = set() + + def save(self, sharded_state_dict, ckpt_dir): + self.save_keys = {sh_ten.key for sh_ten in nested_values(sharded_state_dict) + if is_main_replica(sh_ten.replica_id)} + + +class MockLoadStrategy(LoadShardedStrategy): + def __init__(self, device='cpu'): + super().__init__() + self.device = device + self.load_keys = set() + + def load(self, sharded_state_dict, ckpt_dir): + self.load_keys = {sh_ten.key for sh_ten in nested_values(sharded_state_dict) + if is_main_replica(sh_ten.replica_id)} + + def load_rand(x): + assert isinstance(x, ShardedTensor) + x.init_data(self.device) + x.data.fill_(Utils.rank) + return x.data + + return dict_list_map_outplace(load_rand, sharded_state_dict) + + def load_tensors_metadata(self, checkpoint_dir: Path): + pass + + def check_backend_compatibility(self, loaded_version): + pass + + def check_version_compatibility(self, loaded_version): + pass + + +class TestFullyParallelSaveAndLoad: + @staticmethod + def get_sharded_state_dict(): + return { + 'sd_key_tp_repl1': ShardedTensor.from_rank_offsets('key_TP_repl1', torch.ones(10), + (0, parallel_state.get_tensor_model_parallel_rank(), parallel_state.get_tensor_model_parallel_world_size()), + replica_id=parallel_state.get_data_parallel_rank(with_context_parallel=True)), + 'sd_key_tp_repl2': ShardedTensor.from_rank_offsets('key_TP_repl2', torch.ones(10), + (0, parallel_state.get_tensor_model_parallel_rank(), parallel_state.get_tensor_model_parallel_world_size()), + replica_id=parallel_state.get_data_parallel_rank(with_context_parallel=True)), + 'sd_keyB': ShardedTensor.from_rank_offsets('keyB', torch.ones(20), (0, Utils.rank, Utils.world_size)), + 'sd_keyE_no_C': ShardedTensor.from_rank_offsets('keyC', torch.ones(100), replica_id=Utils.rank), + 'sd_keyX_no_D': ShardedTensor.from_rank_offsets('keyD', torch.ones(1000), replica_id=Utils.rank), + 'sd_keyC_no_E': ShardedTensor.from_rank_offsets('keyE', torch.ones(100), replica_id=Utils.rank), + } + + @pytest.mark.parametrize("parallelization_along_dp", [False, True]) + def test_save_distribution(self, parallelization_along_dp): + Utils.initialize_model_parallel(2, 1) + state_dict = self.get_sharded_state_dict() + + # Ranks assignment: + # 1. Lowest coverage + # 2. Largest tensor + # 3. Shard id (key) + if not parallelization_along_dp: + expected_key_to_saving_ranks = { + 'keyB': list(range(Utils.world_size)), # everyone must save (disjoint shards, coverage == 1) + 'key_TP_repl1': [0, 1], # lowest coverage (4), first TP domain + 'key_TP_repl2': [2, 3], # lowest coverage (4), second TP domain + 'keyD': [4], # largest tensor + 'keyC': [5], # second largest tensor + 'keyE': [6], # second largest tensor + } + else: + if parallel_state.get_tensor_model_parallel_rank() == 0: + expected_key_to_saving_ranks = { + # everyone must save (disjoint shards, coverage == 1): + 'keyB': list(range(parallel_state.get_data_parallel_world_size(with_context_parallel=True))), + # this time, TP sharded tensors have the same coverage as fully replicated! + 'keyD': [0], # largest tensor + 'keyC': [1], # second largest tensor + 'keyE': [2], # second largest tensor + 'key_TP_repl1': [3], # smallest tensor + 'key_TP_repl2': [3], # smallest tensor, last rank is the least occupied + } + else: + expected_key_to_saving_ranks = { + # everyone must save (disjoint shards, coverage == 1): + 'keyB': list(range(parallel_state.get_data_parallel_world_size(with_context_parallel=True))), + # tensors C, D, E are absent in this DP group + 'key_TP_repl1': [0], # smallest tensor + 'key_TP_repl2': [1], # smallest tensor, last rank is the least occupied + } + + parallelization_group = parallel_state.get_data_parallel_group(with_context_parallel=True) if parallelization_along_dp else None + dp_rank = torch.distributed.get_rank(parallelization_group) + expected_keys_saved_by_current_rank = {k for k, v in expected_key_to_saving_ranks.items() if dp_rank in v} + + # Run save and tests + mock_strategy = MockSaveStrategy() + save_strategy = FullyParallelSaveStrategyWrapper(mock_strategy, + parallelization_group, + do_cache_distribution=True) + save_strategy.save(state_dict, Path('mock_dir')) + key_to_saving_rank = dict(map_reduce(save_strategy.cached_distribution.main_rank_for_shard.items(), lambda shard_rank: shard_rank[0][0], lambda shard_rank: shard_rank[1])) + assert expected_key_to_saving_ranks == key_to_saving_rank + + for k, sh_ten in state_dict.items(): + if _sharded_tensor_shard_id(sh_ten) in save_strategy.cached_distribution.shards_in_this_group: + is_expected_to_be_saved_by_this_rank = dp_rank in expected_key_to_saving_ranks.get(sh_ten.key, []) + assert sh_ten.replica_id == int(not is_expected_to_be_saved_by_this_rank), expected_key_to_saving_ranks + + assert mock_strategy.save_keys == expected_keys_saved_by_current_rank, (Utils.rank, mock_strategy.save_keys, expected_keys_saved_by_current_rank) + + @pytest.mark.parametrize("parallelization_along_dp", [False, True]) + def test_load_distribution(self, parallelization_along_dp): + Utils.initialize_model_parallel(2, 1) + + state_dict = self.get_sharded_state_dict() + + # Ranks assignment: + # 1. Lowest coverage + # 2. Largest tensor + # 3. Shard id (key) + if not parallelization_along_dp: + expected_key_to_saving_ranks = { + 'keyB': list(range(Utils.world_size)), # everyone must save (disjoint shards, coverage == 1) + 'key_TP_repl1': [0, 1], # lowest coverage (4), first TP domain + 'key_TP_repl2': [2, 3], # lowest coverage (4), second TP domain + 'keyD': [4], # largest tensor + 'keyC': [5], # second largest tensor + 'keyE': [6], # second largest tensor + } + else: + # When loading, expected key distribution is the same across TP, because every replica needs to be loaded + expected_key_to_saving_ranks = { + # everyone must load (disjoint shards, coverage == 1): + 'keyB': list(range(parallel_state.get_data_parallel_world_size(with_context_parallel=True))), + # this time, TP sharded tensors have the same coverage as fully replicated! + 'keyD': [0], # largest tensor + 'keyC': [1], # second largest tensor + 'keyE': [2], # second largest tensor + 'key_TP_repl1': [3], # smallest tensor + 'key_TP_repl2': [3], # smallest tensor, last rank is the least occupied + } + + parallelization_group = parallel_state.get_data_parallel_group(with_context_parallel=True) if parallelization_along_dp else None + dp_rank = torch.distributed.get_rank(parallelization_group) + expected_keys_saved_by_current_rank = {k for k, v in expected_key_to_saving_ranks.items() if dp_rank in v} + + # Run save and tests + mock_strategy = MockLoadStrategy() + load_strategy = FullyParallelLoadStrategyWrapper(mock_strategy, + parallelization_group, + do_cache_distribution=True) + loaded_state_dict = load_strategy.load(state_dict, Path('mock_dir')) + key_to_saving_rank = dict(map_reduce(load_strategy.cached_distribution.main_rank_for_shard.items(), lambda shard_rank: shard_rank[0][0], lambda shard_rank: shard_rank[1])) + assert expected_key_to_saving_ranks == key_to_saving_rank + + assert mock_strategy.load_keys == expected_keys_saved_by_current_rank, (Utils.rank, mock_strategy.load_keys, expected_keys_saved_by_current_rank) + + assert loaded_state_dict.keys() == state_dict.keys() + + @pytest.mark.parametrize('state_dict_device', ['cpu', 'cuda']) + def test_memory_usage(self, state_dict_device): + Utils.initialize_model_parallel(2, 1) + + megabytes = 1024 * 1024 + mock_strategy = MockLoadStrategy(state_dict_device) + + mem_alloc = [] + + class ParallelLoadWithMemUsage(FullyParallelLoadStrategyWrapper): + def _get_empty_tensor_for_exchange(self, *args, **kwargs) -> torch.Tensor: + ret = super()._get_empty_tensor_for_exchange(*args, **kwargs) + mem_alloc.append(torch.cuda.memory_allocated()) + return ret + + load_strategy = ParallelLoadWithMemUsage(mock_strategy) + torch.distributed.barrier() + + # Each tensor is 4MB, 40MB in total. + # We expect extra memory usage peak at ~32MB, not 1GB + sharded_state_dict = { + f'ten_{i}': ShardedTensor.from_rank_offsets(f'ten_{i}', torch.rand(megabytes, dtype=torch.float, device=state_dict_device), + (0, Utils.rank, Utils.world_size)) + for i in range(10) + } + + mem_alloc_start = torch.cuda.memory_allocated() + + loaded_state_dict = load_strategy.load(sharded_state_dict, Path('mock_dir')) + + # Each rank is expected to do 7 * 10 empty allocations + assert len(mem_alloc) == 7 * 10 + # Peak mem usage should be within 4MB (single tensor) + assert max(mem_alloc) - mem_alloc_start < 4.01 * megabytes, (max(mem_alloc), mem_alloc_start) + + Utils.destroy_model_parallel() \ No newline at end of file diff --git a/tests/unit_tests/dist_checkpointing/test_mapping.py b/tests/unit_tests/dist_checkpointing/test_mapping.py new file mode 100644 index 0000000..ebd0d1e --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_mapping.py @@ -0,0 +1,126 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.core import CheckpointingException +from megatron.core.dist_checkpointing.mapping import is_main_replica, \ + ShardedTensorFactory, ShardedObject, apply_factories, apply_factory_merges +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + +class TestShardedTensor: + + # def setup_method(self, method): + # Utils.initialize_model_parallel(1,1) + # transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) + # self.gpt_embedding = GPTEmbedding(config=transformer_config, vocab_size=100, max_sequence_length=4, add_position_embedding=True) + # + # def teardown_method(self, method): + # Utils.destroy_model_parallel() + + def test_from_rank_offsets_constructor(self, dtype=torch.float, device='cuda'): + data = torch.ones((1, 3, 7, 9), dtype=dtype, device=device) + shape = data.shape + rank_offsets = [ + (0, 0, 10), + (2, 3, 6) + ] + sh_ten = ShardedTensor.from_rank_offsets('keyA', data, *rank_offsets) + + assert isinstance(sh_ten, ShardedTensor) + assert sh_ten.dtype is dtype + assert sh_ten.local_shape == shape + assert sh_ten.global_shape == (shape[0] * 10, shape[1], shape[2] * 6, shape[3]) + assert sh_ten.global_offset == (0, 0, shape[2] * 3, 0) + assert sh_ten.axis_fragmentations == (10, 1, 6, 1) + + def test_from_rank_offsets_flat_constructor(self, dtype=torch.float, device='cuda'): + data = torch.arange(28, dtype=dtype, device=device).reshape((1, 4, 7)) + shape = data.shape + rank_offsets = [ + (1, 0, 2), + (2, 3, 5) + ] + flattened_range = slice(4, 9) + flat_data = data.flatten()[flattened_range] + sh_ten = ShardedTensor.from_rank_offsets_flat('keyA', flat_data, data.shape, *rank_offsets, flattened_range=flattened_range) + + # The main attributes properties are unchanged + assert isinstance(sh_ten, ShardedTensor) + assert sh_ten.dtype is dtype + assert sh_ten.local_shape == shape + assert sh_ten.global_shape == (shape[0], shape[1] * 2, shape[2] * 5) + assert sh_ten.global_offset == (0, 0, shape[2] * 3) + assert sh_ten.axis_fragmentations == (1, 2, 5) + + assert torch.all(sh_ten.data == torch.arange(4, 9, device=device)) + + def test_metadata_integrity_violation(self): + data = torch.ones((1, 3, 7, 9), device='meta') + rank_offsets = [ + (0, 0, 10), + (2, 3, 6) + ] + sh_ten = ShardedTensor.from_rank_offsets('keyA', data, *rank_offsets) + sh_ten.validate_metadata_integrity() + with pytest.raises(CheckpointingException): + sh_ten.local_shape = (1, 2, 7, 9) + sh_ten.validate_metadata_integrity() + + sh_ten = ShardedTensor.from_rank_offsets('keyA', data, *rank_offsets) + with pytest.raises(CheckpointingException): + sh_ten.global_offset = (0, 1, 0) + sh_ten.validate_metadata_integrity() + + with pytest.raises(CheckpointingException): + sh_ten = ShardedTensor.from_rank_offsets_flat('keyA', data, data.shape, *rank_offsets, + flattened_range=slice(4, 9)) + + sh_ten = ShardedTensor.from_rank_offsets_flat('keyA', data.flatten()[4:9], data.shape, *rank_offsets, + flattened_range=slice(4, 9)) + assert sh_ten.local_shape == (1, 3, 7, 9) + with pytest.raises(CheckpointingException): + sh_ten.local_shape = (5,) + sh_ten.validate_metadata_integrity() + + + +class TestShardedTensorFactory: + def test_build_and_merge(self): + def build_fn(key, tensor, replica_id, flattened_range): + assert flattened_range is None + return { + 'level2_a': ShardedTensor.from_rank_offsets(key + 'part1', tensor + 1, replica_id=replica_id), + 'level2_b': ShardedTensor.from_rank_offsets(key + 'part2', tensor + 2, replica_id=replica_id) + } + + # state_dict will be modified in-place + def get_state_dict(): + return { + 'level1': ShardedTensorFactory('a', torch.arange(3), build_fn, lambda x: x['level2_b']) + } + state_dict = get_state_dict() + apply_factories(state_dict) + assert torch.allclose(state_dict['level1']['level2_a'].data, torch.tensor([1, 2, 3])) + assert torch.allclose(state_dict['level1']['level2_b'].data, torch.tensor([2, 3, 4])) + + # Simulate loading + state_dict['level1']['level2_a'] = state_dict['level1']['level2_a'].data + state_dict['level1']['level2_b'] = state_dict['level1']['level2_b'].data + + loaded_state_dict = apply_factory_merges(state_dict, get_state_dict()) + assert torch.allclose(loaded_state_dict['level1'], torch.tensor([2, 3, 4])) + + +def test_is_main_replica(): + assert is_main_replica(0) + assert is_main_replica((0,)) + assert is_main_replica((0, 0)) + assert not is_main_replica(1) + assert not is_main_replica(2) + assert not is_main_replica((1,)) + assert not is_main_replica((1, 0)) + assert not is_main_replica((1, 1, 1)) diff --git a/tests/unit_tests/dist_checkpointing/test_optimizer.py b/tests/unit_tests/dist_checkpointing/test_optimizer.py new file mode 100644 index 0000000..1616c7d --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_optimizer.py @@ -0,0 +1,513 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from copy import deepcopy +from functools import partial +from time import sleep +from types import MethodType, SimpleNamespace +from unittest import mock +from unittest.mock import MagicMock + +import numpy as np +import pytest +import torch +from torch.optim import Adam + +from megatron.core import parallel_state, DistributedDataParallel as DDP +from megatron.core.dist_checkpointing import ShardedTensor, save, load, \ + load_tensors_metadata, load_plain_tensors +from megatron.core.dist_checkpointing.dict_utils import nested_values, diff +from megatron.core.dist_checkpointing.optimizer import \ + get_param_id_to_sharded_param_map, optim_state_to_sharding_state +from megatron.core.dist_checkpointing.serialization import \ + get_default_save_sharded_strategy +from megatron.core.dist_checkpointing.strategies.fully_parallel import \ + FullyParallelSaveStrategyWrapper +from megatron.core.dist_checkpointing.utils import extract_sharded_tensors +from megatron.core.models.gpt import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.optimizer import DistributedOptimizer, OptimizerConfig, \ + get_megatron_optimizer +from megatron.core.tensor_parallel import model_parallel_cuda_manual_seed +from megatron.core.transformer import TransformerConfig +from megatron.core.transformer.mlp import apply_swiglu_sharded_factory +from megatron.core.utils import get_model_config +from megatron.training.checkpointing import load_checkpoint, save_checkpoint +from megatron.training.training import get_model +from megatron.training.utils import unwrap_model +from pretrain_gpt import model_provider + +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv1d(8, 16, 3) + self.proj = torch.nn.Linear(8, 5) + self.config = TransformerConfig(hidden_size=8, num_attention_heads=1, num_layers=1) + + def sharded_state_dict(self): + sharded_state_dict = self.state_dict(keep_vars=True) + # conv + sharded_state_dict['conv.weight'] = ShardedTensor.from_rank_offsets( + 'conv.weight', sharded_state_dict['conv.weight'], + (1, parallel_state.get_tensor_model_parallel_rank(), parallel_state.get_tensor_model_parallel_world_size()) + ) + # bias is non-sharded + sharded_state_dict['conv.bias'] = ShardedTensor.from_rank_offsets('conv.bias', sharded_state_dict['conv.bias']) + + # proj + sharded_state_dict['proj.weight'] = ShardedTensor.from_rank_offsets( + 'proj.weight', sharded_state_dict['proj.weight'], + (0, Utils.rank, Utils.world_size) + ) + sharded_state_dict['proj.bias'] = ShardedTensor.from_rank_offsets( + 'proj.bias', sharded_state_dict['proj.bias'], + (0, Utils.rank, Utils.world_size) + ) + return sharded_state_dict + + +class SwigluFactoryModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(5, 64 // parallel_state.get_tensor_model_parallel_world_size(), bias=False) + self.config = TransformerConfig(hidden_size=8, num_attention_heads=1, num_layers=1) + + def sharded_state_dict(self): + sharded_state_dict = self.state_dict(keep_vars=True) + sharded_state_dict['linear.weight'] = ShardedTensor.from_rank_offsets( + 'linear.weight', sharded_state_dict['linear.weight'], + ((0, parallel_state.get_tensor_model_parallel_rank(), parallel_state.get_tensor_model_parallel_world_size())), + replica_id=((parallel_state.get_pipeline_model_parallel_rank(), 0, parallel_state.get_data_parallel_rank(with_context_parallel=True))) + ) + sharded_state_dict['linear.weight'] = apply_swiglu_sharded_factory(sharded_state_dict['linear.weight'], ()) + return sharded_state_dict + + +class SwigluFactoryModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(5, 64 // parallel_state.get_tensor_model_parallel_world_size(), bias=False) + self.config = TransformerConfig(hidden_size=8, num_attention_heads=1, num_layers=1) + + def sharded_state_dict(self): + sharded_state_dict = self.state_dict(keep_vars=True) + sharded_state_dict['linear.weight'] = ShardedTensor.from_rank_offsets( + 'linear.weight', sharded_state_dict['linear.weight'], + ((0, parallel_state.get_tensor_model_parallel_rank(), parallel_state.get_tensor_model_parallel_world_size())), + replica_id=((parallel_state.get_pipeline_model_parallel_rank(), 0, parallel_state.get_data_parallel_rank(with_context_parallel=True))) + ) + sharded_state_dict['linear.weight'] = apply_swiglu_sharded_factory(sharded_state_dict['linear.weight'], ()) + return sharded_state_dict + + +class TestOptimizer: + def setup_class(cls): + Utils.initialize_distributed() + + @pytest.fixture(scope='function', autouse=True) + def cleanup_model_parallel(self): + # pass for initialize + yield + Utils.destroy_model_parallel() + + def test_optimizer_params(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(1,1) + model = Model() + # Force optimizer state initialization + for p in model.parameters(): + p.grad = torch.ones_like(p.data) + optim = Adam(model.parameters()) + optim.step() + + model_state_dict = model.sharded_state_dict() + param_map = get_param_id_to_sharded_param_map(model_state_dict, optim.param_groups[0]['params']) + optim_state_dict = optim.state_dict() + optim_state_to_sharding_state(optim_state_dict, param_map, exclude_keys=('step',)) + + optim_sharded_tensors = nested_values(extract_sharded_tensors(optim_state_dict)[0]) + optim_sharded_keys = {sh_ten.key for sh_ten in optim_sharded_tensors} + assert len(optim_sharded_keys) == 2 * len(model_state_dict) + assert optim_sharded_keys == set([ + f'optimizer.state.{state_key}.{layer_name}' + for state_key in ['exp_avg', 'exp_avg_sq'] + for layer_name in model_state_dict + ]) + + +def initialize_gpt_model(pre_process=True, post_process=True, seed=0, use_glu=True, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + default_config_kwargs=dict(num_layers=8, hidden_size=16, num_attention_heads=8, use_cpu_initialization=True) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs, gated_linear_unit=use_glu) + model = GPTModel(config=transformer_config, transformer_layer_spec=get_gpt_layer_local_spec(), vocab_size=128, max_sequence_length=4, + pre_process=pre_process, post_process=post_process) + + model.bfloat16() + with torch.no_grad(): + for p in model.parameters(): + p.random_() + return model + + +def initialize_small_model(pre_process=True, post_process=True, seed=0, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + return SwigluFactoryModel() + + +def init_basic_mock_args(args, bf16=True): + args.data_parallel_random_init = False + args.virtual_pipeline_model_parallel_size = None + args.fp16 = False + args.bf16 = bf16 + args.accumulate_allreduce_grads_in_fp32 = False + args.overlap_grad_reduce = False + args.use_distributed_optimizer = True + args.ddp_bucket_size = None + args.check_for_nan_in_loss_and_grad = False + args.ddp_average_in_collective = False + return args + + +def init_checkpointing_mock_args(args, ckpt_dir, fully_parallel=False): + args.save = ckpt_dir + args.load = ckpt_dir + args.pretrained_checkpoint = None + args.ckpt_fully_parallel_save = fully_parallel + args.ckpt_fully_parallel_load = fully_parallel + args.async_save = False + args.use_dist_ckpt = True + args.dist_ckpt_format = 'torch_dist' + args.no_save_optim = False + args.no_save_rng = False + args.ckpt_assume_constant_structure = False + args.log_progress = False + args.auto_detect_ckpt_format = False + args.exit_on_missing_checkpoint = False + args.finetune = False + args.consumed_train_samples = 0 + args.consumed_valid_samples = 0 + args.retro_add_retriever = False + args.no_load_optim = False + args.no_load_rng = False + args.dist_ckpt_strictness = 'assume_ok_unexpected' + + +def load_checkpoint_no_arg_checks(*args, **kwargs): + with mock.patch('megatron.training.checkpointing.check_checkpoint_args'): + with mock.patch('megatron.training.checkpointing.update_num_microbatches'): + return load_checkpoint(*args, **kwargs) + + +def setup_model_and_optimizer(seed, initialize_fn=initialize_gpt_model, bf16=True, dist_opt=True): + mock_args = SimpleNamespace() + with mock.patch('megatron.training.training.get_args', new=lambda: mock_args): + init_basic_mock_args(mock_args, bf16=bf16) + model = get_model(partial(initialize_fn, seed=seed)) + + config = OptimizerConfig(bf16=bf16, params_dtype=torch.bfloat16 if bf16 else torch.float, use_distributed_optimizer=dist_opt) + optimizer = get_megatron_optimizer(config, model) + + torch.manual_seed(seed + 1) + model_parallel_cuda_manual_seed(seed + 1) + + for group in optimizer.optimizer.param_groups: + for p in group['params']: + if len(optimizer.optimizer.state[p]) == 0: + optimizer.optimizer.state[p]['exp_avg'] = torch.rand_like(p.data) + optimizer.optimizer.state[p]['exp_avg_sq'] = torch.rand_like(p.data) + + optimizer.reload_model_params() + + return unwrap_model(model), optimizer + + +class TestDistributedOptimizer: + def setup_class(cls): + Utils.initialize_distributed() + + @pytest.fixture(scope='function', autouse=True) + def cleanup_model_parallel(self): + # pass for initialize + yield + Utils.destroy_model_parallel() + + @pytest.mark.parametrize("initialize_fn", [initialize_small_model, initialize_gpt_model]) + @pytest.mark.parametrize("use_fpsl", [False, True]) + @pytest.mark.parametrize("tp_pp,src_dp,dest_dp", [ + ((4, 1), 2, 2), + # ((1, 1), 8, 1), # TODO: changing DP doesn't work in unit tests because of NCCL crashes + # ((1, 1), 1, 8), + # ((2, 1), 2, 1), + # ((2, 1), 2, 2), + ]) + def test_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_fpsl, initialize_fn): + src_world_size = tp_pp[0] * tp_pp[1] * src_dp + dest_world_size = tp_pp[0] * tp_pp[1] * dest_dp + assert src_world_size <= Utils.world_size, (tp_pp, src_dp) + assert dest_world_size <= Utils.world_size, (tp_pp, dest_dp) + + sharding_type = 'fully_sharded_model_space' if use_fpsl else 'dp_zero_gather_scatter' + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_dp_sharding', sync=True) as ckpt_dir: + try: + Utils.set_world_size(src_world_size) + if Utils.rank >= 0: + # Save checkpoint A + Utils.initialize_model_parallel(*tp_pp) + model, optimizer_A = setup_model_and_optimizer(seed=2, initialize_fn=initialize_fn) + + save_strategy = get_default_save_sharded_strategy() + if use_fpsl: + save_strategy = FullyParallelSaveStrategyWrapper( + save_strategy, + parallel_state.get_data_parallel_group(with_context_parallel=True), + True + ) + save(optimizer_A.sharded_state_dict(model[0].sharded_state_dict(), sharding_type=sharding_type), ckpt_dir, save_strategy) + optim_param_state_A = optimizer_A.get_parameter_state_dp_zero() + Utils.destroy_model_parallel() + else: + # this prevents NCCL errors when changing DP. TODO: fix it properly + sleep(20) + + # Load checkpoint A with different TP/PP and save as checkpoint B + Utils.set_world_size(dest_world_size) + if Utils.rank == 0: + print('_____________________') + if Utils.rank >= 0: + Utils.initialize_model_parallel(*tp_pp) + + model, optimizer_B = setup_model_and_optimizer(seed=3, initialize_fn=initialize_fn) + optim_param_state_B = optimizer_B.get_parameter_state_dp_zero() + diffs = diff(optim_param_state_A, optim_param_state_B) + # Expect a mismatch in values - diffs[2] nonempty + if parallel_state.get_data_parallel_rank(with_context_parallel=True) == 0: + assert not diffs[0] and not diffs[1] and diffs[2], diffs + + sharded_state_dict = optimizer_B.sharded_state_dict( + model[0].sharded_state_dict(), + is_loading=True, + sharding_type=sharding_type, + ) + optim_state_dict = load(sharded_state_dict, ckpt_dir) + optimizer_B.load_state_dict(optim_state_dict) + optim_param_state_B = optimizer_B.get_parameter_state_dp_zero() + + # Test both param state dicts are equal + diffs = diff(optim_param_state_A, optim_param_state_B) + assert not any(map(bool, diffs)), diffs + + else: + # this prevents NCCL errors when changing DP. TODO: fix it properly + sleep(20) + finally: + Utils.set_world_size() + + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp', 'use_glu'), + [ + ((2, 2), (2, 4), False,), + ((1, 8), (4, 1), True), + ((2, 4), (4, 2), False), + ] + ) + def test_finetune_doesnt_load_optimizer(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_glu): + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_finetune_doesnt_load_optimizer', sync=True) as ckpt_dir: + mock_args = SimpleNamespace() + with mock.patch('megatron.training.checkpointing.get_args', new=lambda: mock_args): + init_basic_mock_args(mock_args) + init_checkpointing_mock_args(mock_args, ckpt_dir, False) + + Utils.initialize_model_parallel(*src_tp_pp) + model, optimizer = setup_model_and_optimizer(seed=2, initialize_fn=partial(initialize_gpt_model, use_glu=use_glu)) + + # We need to save the TPxPP of the source model + mock_args.tensor_model_parallel_size = src_tp_pp[0] + mock_args.pipeline_model_parallel_size = src_tp_pp[1] + save_checkpoint(10, model, optimizer, None, 0) + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(*dest_tp_pp) + model, optimizer = setup_model_and_optimizer(seed=3, initialize_fn=partial(initialize_gpt_model, use_glu=use_glu)) + model_unloaded_state_dict = deepcopy(model[0].state_dict()) + optim_unloaded_state_dict = deepcopy(optimizer.state_dict()) + + # Load with different TPxPP should raise DistributeOptimizer error + with pytest.raises(RuntimeError) as exc_info: + load_checkpoint_no_arg_checks(model, optimizer, None) + assert "(TP, PP) mismatch" in str(exc_info.value) + + ## Check that the state didn't change + assert not any(diff(model[0].state_dict(), model_unloaded_state_dict)) + assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict)) + + # Now test the same with a `finetune` flag + mock_args.finetune = True + load_checkpoint_no_arg_checks(model, optimizer, None) + + ## Model weights should be different, but optimizer state is unchanged + diffs = diff(model[0].state_dict(), model_unloaded_state_dict) + # diffs[0] and diffs[1] is structural diff, diffs[2] is values diff - we expect only values diff + assert not diffs[0] and not diffs[1] and diffs[2] + assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict)) + + # ... or `no_load_optim` flag + model, optimizer = setup_model_and_optimizer(seed=3, initialize_fn=partial(initialize_gpt_model, use_glu=use_glu)) + mock_args.finetune = False + mock_args.no_load_optim = True + mock_args.no_load_rng = True + load_checkpoint_no_arg_checks(model, optimizer, None) + + ## Model weights should be different, but optimizer state is unchanged + diffs = (diff(model[0].state_dict(), model_unloaded_state_dict)) + # diffs[0] and diffs[1] is structural diff, diffs[2] is values diff - we expect only values diff + assert not diffs[0] and not diffs[1] and diffs[2] + assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict)) + + + def test_can_load_deprecated_bucket_space_format(self, tmp_path_dist_ckpt): + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_can_load_deprecated_bucket_space_format', sync=True) as ckpt_dir: + mock_args = SimpleNamespace() + with mock.patch('megatron.training.checkpointing.get_args', new=lambda: mock_args): + init_basic_mock_args(mock_args) + init_checkpointing_mock_args(mock_args, ckpt_dir, True) + + Utils.initialize_model_parallel(4, 2) + model, optimizer = setup_model_and_optimizer(seed=2, initialize_fn=initialize_gpt_model) + + mock_args.tensor_model_parallel_size = 4 + mock_args.pipeline_model_parallel_size = 2 + + # Mock optimizer sharded_state_dict so that it ignores the externally passed sharding_type and uses 'fully_sharded_bucket_space' instead + orig_optim_sharded_state_dict_fn = optimizer.sharded_state_dict + def sharded_state_dict_bucket_space(self, *args, sharding_type: str = 'fully_sharded_model_space', **kwargs): + return orig_optim_sharded_state_dict_fn(*args, sharding_type='fully_sharded_bucket_space', **kwargs) + + optimizer.sharded_state_dict = MethodType(sharded_state_dict_bucket_space, optimizer) + save_checkpoint(10, model, optimizer, None, 0) + + flag = 0 + key_list = [] + torch.distributed.barrier() + if Utils.rank == 0: + sharded_metadata = load_tensors_metadata(ckpt_dir / 'iter_0000010') + key_list = list(sharded_metadata.keys()) + # Check if actually using `fully_parallel_bucket_space` format. + key = 'optimizer.distributed.dp_group_idx_0.gbuf_idx_0.dtype_(torch.bfloat16, torch.bfloat16).bucket_idx_0.exp_avg_sq' + if key in key_list: + flag = 1 + + tensor = torch.tensor([flag], dtype=torch.long, device='cuda') + torch.distributed.broadcast(tensor, 0) + flag = tensor[0].item() + assert flag == 1, key_list + + optimizer.sharded_state_dict = orig_optim_sharded_state_dict_fn + load_checkpoint_no_arg_checks(model, optimizer, None) + + + +class TestFP32Optimizer: + def setup_class(cls): + Utils.initialize_distributed() + + @pytest.fixture(scope='function', autouse=True) + def cleanup_model_parallel(self): + # pass for initialize + yield + Utils.destroy_model_parallel() + + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp'), + [ + ((2, 4), (2, 4)), + ((2, 4), (4, 2)), + ((8, 1), (1, 2)), + ] + ) + def test_fp32_optimizer_resharding(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp): + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_A', sync=True) as ckpt_dir_A: + with TempNamedDir(tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_B', sync=True) as ckpt_dir_B: + Utils.initialize_model_parallel(*src_tp_pp) + model_A, optimizer_A = setup_model_and_optimizer(seed=2, initialize_fn=initialize_small_model, bf16=False) + + save(optimizer_A.sharded_state_dict(model_A[0].sharded_state_dict()), ckpt_dir_A) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP and save as checkpoint B + Utils.initialize_model_parallel(*dest_tp_pp) + model_B, optimizer_B = setup_model_and_optimizer(seed=3, initialize_fn=initialize_small_model, bf16=False) + load_sharded_state_dict = optimizer_B.sharded_state_dict(model_B[0].sharded_state_dict()) + state_dict = load(load_sharded_state_dict, ckpt_dir_A) + + optimizer_B.load_state_dict(state_dict) + save(optimizer_B.sharded_state_dict(model_B[0].sharded_state_dict()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + plain_state_dict_A = load_plain_tensors(ckpt_dir_A) + plain_state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(plain_state_dict_A, plain_state_dict_B) + assert not any(map(bool, diffs)), diffs + + +class TestOptimizerResharding: + @pytest.fixture(scope='function', autouse=True) + def cleanup_model_parallel(self): + # pass for initialize + yield + Utils.destroy_model_parallel() + + @pytest.mark.parametrize( + ('use_dist_opt', 'bf16'), + ( + (False, True), # regular BF16 + (True, True), # DistOpt BF16 + # (False, False), # FP32 + ) + ) + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp',), + [ + ((2, 4), (2, 4)), + ((2, 4), (2, 2)), + ((2, 4), (4, 2)), + ((8, 1), (1, 2)), + ] + ) + def test_optimizer_resharding(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_dist_opt, bf16): + with TempNamedDir(tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_A', sync=False) as ckpt_dir_A: + with TempNamedDir(tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_B', sync=False) as ckpt_dir_B: + Utils.initialize_model_parallel(*src_tp_pp) + model_A, optimizer_A = setup_model_and_optimizer(seed=2, bf16=bf16, dist_opt=use_dist_opt) + + save(optimizer_A.sharded_state_dict(model_A[0].sharded_state_dict()), ckpt_dir_A) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP and save as checkpoint B + Utils.initialize_model_parallel(*dest_tp_pp) + model_B, optimizer_B = setup_model_and_optimizer(seed=3, bf16=bf16, dist_opt=use_dist_opt) + load_sharded_state_dict = optimizer_B.sharded_state_dict(model_B[0].sharded_state_dict()) + state_dict = load(load_sharded_state_dict, ckpt_dir_A) + + optimizer_B.load_state_dict(state_dict) + save(optimizer_B.sharded_state_dict(model_B[0].sharded_state_dict()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + plain_state_dict_A = load_plain_tensors(ckpt_dir_A) + plain_state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(plain_state_dict_A, plain_state_dict_B) + assert not any(map(bool, diffs)), diffs diff --git a/tests/unit_tests/dist_checkpointing/test_serialization.py b/tests/unit_tests/dist_checkpointing/test_serialization.py new file mode 100644 index 0000000..e06699f --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_serialization.py @@ -0,0 +1,554 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import io +import logging + +import numpy as np +import pytest +import torch +from torch.distributed.checkpoint import CheckpointException as PyTCheckpointingException + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ShardedTensor, save, load +from megatron.core.dist_checkpointing.core import CheckpointingException, \ + maybe_load_config +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.mapping import ShardedTensorFactory, \ + ShardedObject +from megatron.core.dist_checkpointing.serialization import \ + load_tensors_metadata, load_sharded_metadata +from megatron.core.dist_checkpointing.strategies.base import StrategyAction, \ + get_default_strategy +from megatron.core.dist_checkpointing.validation import StrictHandling + +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +class TestSerialization: + def setup_class(cls): + Utils.initialize_distributed() + + @pytest.fixture(scope='function', autouse=True) + def cleanup_model_parallel(self): + # pass for initialize + yield + Utils.destroy_model_parallel() + + def test_single_process_save_load(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(1,1) + + sharded_state_dict = { + 'sd_keyA': ShardedTensor.from_rank_offsets('keyA', torch.ones(2, 4), replica_id=Utils.rank), + 'sd_keyB': ShardedTensor.from_rank_offsets('keyB', torch.ones(3, 5, 7), replica_id=Utils.rank), + } + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_single_process_save_load', sync=True) as ckpt_dir: + save(sharded_state_dict, ckpt_dir) + torch.distributed.barrier() + + saved_config = maybe_load_config(ckpt_dir) + if saved_config.sharded_backend == 'zarr': + assert (ckpt_dir / 'keyA').is_dir() + assert (ckpt_dir / 'keyB').is_dir() + assert not (ckpt_dir / 'keyC').exists() + assert not (ckpt_dir / 'sd_keyA').is_dir() + + load_ssd = { + 'load_sd_keyA': ShardedTensor.from_rank_offsets('keyA', torch.ones(2, 4), replica_id=Utils.rank), + } + loaded_state_dict = load(load_ssd, ckpt_dir) + + assert set(loaded_state_dict.keys()) == {'load_sd_keyA'} + assert isinstance(loaded_state_dict['load_sd_keyA'], torch.Tensor) + assert loaded_state_dict['load_sd_keyA'].shape == (2, 4) + + Utils.destroy_model_parallel() + + + def test_multi_process_save(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2,4) + + state_dict = { + 'sd_keyA': ShardedTensor.from_rank_offsets('keyA', torch.ones(2, 4), (0, Utils.rank, Utils.world_size)), + 'sd_keyB': ShardedTensor.from_rank_offsets('keyB', torch.ones(3, 5, 7), (2, Utils.rank, Utils.world_size)), + } + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_multi_process_save', sync=True) as ckpt_dir: + save(state_dict, ckpt_dir) + + saved_config = maybe_load_config(ckpt_dir) + if saved_config.sharded_backend == 'zarr': + assert (ckpt_dir / 'keyA').is_dir() + assert (ckpt_dir / 'keyB').is_dir() + assert not (ckpt_dir / 'keyC').exists() + assert not (ckpt_dir / 'sd_keyA').is_dir() + + Utils.destroy_model_parallel() + + + def test_partition_change_save_load(self, tmp_path_dist_ckpt, strategy=None): + Utils.initialize_model_parallel(2,4) + + # ten_a: global shape (2, 4): + ten_a_global = torch.tensor([[0, 1, 2, 3], [10, 11, 12, 13]]) + ten_a = torch.zeros(1, 1) + 10 * parallel_state.get_tensor_model_parallel_rank() + parallel_state.get_pipeline_model_parallel_rank() + assert ten_a.shape == (1, 1) + + # ten_b: global shape (4, 5, 80), where (x, y, z) is (100x + z) + ten_b = torch.zeros(4, 5, 10) + (torch.arange(10) + 10 * Utils.rank) + ten_b += torch.arange(4).unsqueeze(-1).unsqueeze(-1) * 100 + assert ten_b.shape == (4, 5, 10) + + state_dict = { + 'sd_keyA': ShardedTensor.from_rank_offsets('keyA', ten_a, + (0, parallel_state.get_tensor_model_parallel_rank(), parallel_state.get_tensor_model_parallel_world_size()), + (1, parallel_state.get_pipeline_model_parallel_rank(), parallel_state.get_pipeline_model_parallel_world_size()), + replica_id=0), + 'sd_keyB': ShardedTensor.from_rank_offsets('keyB', ten_b, (2, Utils.rank, Utils.world_size)), + } + + ten_a_global_shape = ten_a_global.shape + ten_b_global_shape = (4, 5, 10 * 8) + + assert state_dict['sd_keyA'].local_shape == (1, 1) + assert state_dict['sd_keyA'].global_shape == ten_a_global_shape + assert state_dict['sd_keyB'].global_shape == ten_b_global_shape + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_partition_change_save_load', sync=True) as ckpt_dir: + save(state_dict, ckpt_dir, strategy) + + del ten_a, ten_b + + # without changing TPxPP, load tensors without any sharding + load_sd = { + 'sd_keyA': ShardedTensor.from_rank_offsets('keyA', + torch.empty(ten_a_global_shape), + replica_id=Utils.rank), + 'sd_keyB': ShardedTensor.from_rank_offsets('keyB', + torch.empty(ten_b_global_shape), + replica_id=Utils.rank), + } + loaded_state_dict = load(load_sd, ckpt_dir) + + ten_a = loaded_state_dict['sd_keyA'] + ten_b = loaded_state_dict['sd_keyB'] + assert isinstance(ten_a, torch.Tensor) + assert ten_a.shape == ten_a_global_shape + assert torch.all(ten_a == ten_a_global) + + assert isinstance(ten_b, torch.Tensor) + assert ten_b.shape == ten_b_global_shape + assert np.all([ + val == 100 * x + z + for x, x_row in enumerate(ten_b) + for y, y_row in enumerate(x_row) + for z, val in enumerate(y_row) + ]) + + del ten_a, ten_b + + # change TPxPP + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(1,2) + + load_sd = { + 'sd_keyA': ShardedTensor.from_rank_offsets('keyA', torch.empty(2, 1), + (1, parallel_state.get_data_parallel_rank(), parallel_state.get_data_parallel_world_size()), + replica_id=parallel_state.get_pipeline_model_parallel_rank()), + 'sd_keyB': ShardedTensor.from_rank_offsets('keyB', torch.empty(5, 80), + (0, Utils.rank // 2, 4), + prepend_axis_num=1, + replica_id=Utils.rank % 2), + } + + loaded_state_dict = load(load_sd, ckpt_dir) + ten_a = loaded_state_dict['sd_keyA'] + ten_b = loaded_state_dict['sd_keyB'] + + assert isinstance(ten_a, torch.Tensor) + assert ten_a.shape == (2, 1) + assert torch.all(ten_a[:, 0] == ten_a_global[:, parallel_state.get_data_parallel_rank()]) + + assert isinstance(ten_b, torch.Tensor) + assert ten_b.shape == (5, 10 * 8) + assert torch.all(ten_b == torch.arange(80).unsqueeze(0).expand(5, 80) + Utils.rank // 2 * 100) + + def test_load_tensors_metadata(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2,4) + + state_dict = { + 'sd_keyA': ShardedTensor.from_rank_offsets('keyA', torch.arange(10) + Utils.rank * 10, (0, Utils.rank, Utils.world_size)), + 'sd_keyB': ShardedTensor.from_rank_offsets('keyB', torch.ones(3, 5, 7), (2, Utils.rank, Utils.world_size)), + } + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_load_tensors_metadata', sync=True) as ckpt_dir: + save(state_dict, ckpt_dir) + + del state_dict + sharded_state_dict = load_tensors_metadata(ckpt_dir) + # loaded dict keys are ShardedTensor keys! + assert 'keyA' in sharded_state_dict + assert 'sd_keyA' not in sharded_state_dict + + # Check metadata + assert sharded_state_dict['keyA'].global_shape == (10 * Utils.world_size,) + assert sharded_state_dict['keyB'].global_shape == (3, 5, 7 * Utils.world_size) + assert sharded_state_dict['keyA'].local_shape == sharded_state_dict['keyA'].global_shape + assert sharded_state_dict['keyB'].local_shape == sharded_state_dict['keyB'].global_shape + assert sharded_state_dict['keyA'].global_offset == (0,) + assert sharded_state_dict['keyB'].global_offset == (0, 0, 0) + assert sharded_state_dict['keyA'].axis_fragmentations == (1,) + assert sharded_state_dict['keyB'].axis_fragmentations == (1, 1, 1) + assert sharded_state_dict['keyA'].replica_id == 0 + assert sharded_state_dict['keyB'].replica_id == 0 + + # metadata dict can be loaded. We don't validate access because there are multiple replica_id=0 + state_dict = load(sharded_state_dict, ckpt_dir, validate_access_integrity=False) + assert torch.all(state_dict['keyA'] == torch.arange(10 * Utils.world_size)) + + Utils.destroy_model_parallel() + + def test_can_mix_sharded_tensors_and_factories(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(1, 1) + + def _build_fn(key, tensor, replica_id, flattened_range): + assert flattened_range is None + return [ + ShardedTensor.from_rank_offsets(key + 'part1', tensor, replica_id=replica_id), + ShardedTensor.from_rank_offsets(key + 'part2', tensor, replica_id=replica_id), + ShardedTensor.from_rank_offsets(key + 'part3', tensor, replica_id=replica_id), + ] + + # state dict can be modified by dist_checkpointing.save, so two copies + def get_sharded_state_dict(base=0): + return {'all': [ + ShardedTensor.from_rank_offsets('A', torch.arange(2) + base, replica_id=Utils.rank), + ShardedTensor.from_rank_offsets('B', torch.arange(3) + base, replica_id=Utils.rank), + ShardedTensor.from_rank_offsets('C', torch.arange(4) + base, replica_id=Utils.rank), + ShardedTensorFactory('D', torch.arange(5) + base, _build_fn, sum, replica_id=Utils.rank), + ]} + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_can_mix_sharded_tensors_and_factories', sync=True) as ckpt_dir: + save(get_sharded_state_dict(0), ckpt_dir) + loaded_state_dict = load(get_sharded_state_dict(10), ckpt_dir) + + expected_sd = { + 'all': [ + torch.arange(2), + torch.arange(3), + torch.arange(4), + torch.arange(5) * 3, # sum of three parts, as specified in merge_fn + ] + } + diffs = diff(loaded_state_dict, expected_sd) + assert not any(map(bool, diffs)), diffs + + Utils.destroy_model_parallel() + + def test_load_error_msg(self, tmp_path_dist_ckpt): + ckpt_dir_name = 'test_load_error_msg' + Utils.initialize_model_parallel(1, 1) + sh_ten = ShardedTensor.from_rank_offsets('keyA', torch.rand(10), replica_id=Utils.rank) + state_dict = {'some_key': sh_ten} + + # Non-existent directory + non_ex_path = f'/tmp/non-existent-path/{ckpt_dir_name}' + with pytest.raises(CheckpointingException) as exc_info: + load(state_dict, non_ex_path) + assert f'directory {non_ex_path} does not exist' in str(exc_info.value) + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / ckpt_dir_name, sync=True) as ckpt_dir: + # Empty directory - not a distributed checkpoint + with pytest.raises(CheckpointingException) as exc_info: + load(state_dict, ckpt_dir) + assert f'is not a distributed checkpoint' in str(exc_info.value) + + # Missing Zarr arrays + torch.distributed.barrier() + save(state_dict, ckpt_dir) + sh_ten.key = 'different_key' + with pytest.raises((CheckpointingException, PyTCheckpointingException)) as exc_info: + load(state_dict, ckpt_dir) + assert "different_key" in str(exc_info.value) + + def test_sharded_object_serialization(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(1, 1) + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_sh_obj', sync=True) as ckpt_dir: + state = {'some': 'dict'} + state_serialized = io.BytesIO() + torch.save(state, state_serialized) + state_dict = {'some_key': ShardedObject('sh_obj_A', state_serialized, (1,), (0,), + replica_id=Utils.rank)} + + save(state_dict, ckpt_dir) + del state, state_serialized, state_dict + other_state = {'other': 'dictionary'} + other_serialized = io.BytesIO() + torch.save(other_state, other_serialized) + state_dict = {'other_key': ShardedObject('sh_obj_A', other_serialized, (1,), (0,), + replica_id=Utils.rank)} + load_state_dict = load(state_dict, ckpt_dir) + assert 'other_key' in load_state_dict + load_state_dict['other_key'].seek(0) + loaded_state = torch.load(load_state_dict['other_key']) + + assert loaded_state == {'some': 'dict'} + + Utils.destroy_model_parallel() + + def test_tensor_shape_mismatch(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2,4) + + # Global tensor is just a range(32) repeated twice over the first dimension + local_tensor = torch.arange(4).unsqueeze(0).expand(2, 4) + Utils.rank * 4 + + state_dict = { + 'rigid': ShardedTensor.from_rank_offsets('keyA', local_tensor, (1, Utils.rank, Utils.world_size)), + 'flexible': ShardedTensor.from_rank_offsets('keyB', local_tensor, (1, Utils.rank, Utils.world_size), + allow_shape_mismatch=True), + } + assert state_dict['rigid'].global_shape == (2, 32) + assert state_dict['flexible'].global_shape == (2, 32) + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_tensor_shape_mismatch', sync=True) as ckpt_dir: + save(state_dict, ckpt_dir) + + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + tp_rank = parallel_state.get_tensor_model_parallel_rank() + + # Smaller coverage than expected (28 < 32) + state_dict = { + 'rigid': ShardedTensor.from_rank_offsets('keyA', torch.ones(2, 7), (1, pp_rank, pp_size), replica_id=tp_rank), + } + with pytest.raises((CheckpointingException, PyTCheckpointingException)): + load(state_dict, ckpt_dir) + + state_dict = { + 'flexible': ShardedTensor.from_rank_offsets('keyB', torch.ones(2, 7), (1, pp_rank, pp_size), replica_id=tp_rank, + allow_shape_mismatch=True), + } + loaded_state_dict = load(state_dict, ckpt_dir) + assert torch.all(loaded_state_dict['flexible'] == torch.arange(7).unsqueeze(0).expand(2, 7) + pp_rank * 7) + + # Larger coverage than expected (36 > 32) + state_dict = { + 'rigid': ShardedTensor.from_rank_offsets('keyA', torch.ones(2, 9), (1, pp_rank, pp_size), replica_id=tp_rank), + } + with pytest.raises((CheckpointingException, PyTCheckpointingException)): + load(state_dict, ckpt_dir) + + state_dict = { + 'flexible': ShardedTensor.from_rank_offsets('keyB', torch.ones(2, 9), (1, pp_rank, pp_size), replica_id=tp_rank, + allow_shape_mismatch=True), + } + loaded_state_dict = load(state_dict, ckpt_dir) + expected_tensor = torch.arange(9).unsqueeze(0).expand(2, 9) + pp_rank * 9 + + if pp_rank >= (32 // 9): + assert pp_rank == 3, pp_rank + expected_tensor[:, 5:] = 0 # padding with 0s + assert torch.all(loaded_state_dict['flexible'] == expected_tensor) + + Utils.destroy_model_parallel() + + +class TestNonStrictLoad: + def setup_method(self, method): + Utils.initialize_model_parallel(2, 4) # doesn't matter for this test + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def _get_base_state_dict(self): + return { + 'TenA': ShardedTensor.from_rank_offsets('TenA', torch.arange(2), replica_id=Utils.rank), + 'TenB': ShardedTensor.from_rank_offsets('TenB', torch.arange(3), (0, Utils.rank, Utils.world_size), replica_id=0), + 'TenC': ShardedTensor.from_rank_offsets('TenC', torch.arange(3), replica_id=Utils.world_size - Utils.rank - 1), + 'ObjA': ShardedObject('ObjA', list(range(10)), (1,), (0,), replica_id=Utils.rank), + 'ObjB': ShardedObject('ObjB', {Utils.rank + 7}, (1, Utils.world_size), (0, Utils.rank), replica_id=0), + } + + @pytest.mark.parametrize('save_format', ['zarr', 'torch_dist']) + @pytest.mark.parametrize('validate_integrity', [True, False]) + def test_unexpected_keys_handling_during_validation(self, caplog, tmp_path_dist_ckpt, validate_integrity, save_format): + sharded_state_dict = self._get_base_state_dict() + with TempNamedDir(tmp_path_dist_ckpt / 'test_unexpected_keys_raises_error_during_validation') as ckpt_dir: + save_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, save_format, 1) + save(sharded_state_dict, ckpt_dir, save_strategy) + + def load_with_flag(strict): + sharded_state_dict = self._get_base_state_dict() + sharded_state_dict['TenD'] = ShardedTensor.from_rank_offsets('UnexpectedTenD', torch.arange(3), replica_id=Utils.rank) + sharded_state_dict['ObjD'] = ShardedObject('UnexpectedObjD', None, (1,), (0,), replica_id=Utils.rank) + return load(sharded_state_dict, ckpt_dir, validate_access_integrity=validate_integrity, strict=strict) + + def test_error(error_msg): + assert 'Unexpected keys' in error_msg + assert 'UnexpectedTenD' in error_msg + assert 'UnexpectedObjD' in error_msg + assert 'Missing keys' not in error_msg + + # ASSUME_OK_UNEXPECTED results in an exception raised by the underlying strategy + with pytest.raises(PyTCheckpointingException if save_format == 'torch_dist' else CheckpointingException) as exc_info: + load_with_flag(StrictHandling.ASSUME_OK_UNEXPECTED) + # Informative exceptions with `RAISE_*` options: + with pytest.raises(CheckpointingException) as exc_info: + load_with_flag(StrictHandling.RAISE_UNEXPECTED) + test_error(str(exc_info.value)) + with pytest.raises(CheckpointingException) as exc_info: + load_with_flag(StrictHandling.RAISE_ALL) + test_error(str(exc_info.value)) + + # Logged mismatches: + with caplog.at_level(logging.WARNING): + loaded_state_dict = load_with_flag(StrictHandling.LOG_UNEXPECTED) + assert 'TenA' in loaded_state_dict + test_error(caplog.text) + with caplog.at_level(logging.WARNING): + loaded_state_dict = load_with_flag(StrictHandling.LOG_ALL) + assert 'TenA' in loaded_state_dict + test_error(caplog.text) + + # Returned mismatches + loaded_state_dict, missing_keys, unexpected_keys = load_with_flag(StrictHandling.RETURN_UNEXPECTED) + assert 'TenA' in loaded_state_dict + assert unexpected_keys == {'UnexpectedTenD', 'UnexpectedObjD'} + assert missing_keys == set() + loaded_state_dict, missing_keys, unexpected_keys = load_with_flag(StrictHandling.RETURN_ALL) + assert 'TenA' in loaded_state_dict + assert unexpected_keys == {'UnexpectedTenD', 'UnexpectedObjD'} + assert missing_keys == set() + + # Ignore mismatch + loaded_state_dict = load_with_flag(StrictHandling.IGNORE_ALL) + assert 'TenA' in loaded_state_dict + + @pytest.mark.parametrize('save_format', ['zarr', 'torch_dist']) + @pytest.mark.parametrize('validate_integrity', [True, False]) + def test_missing_keys_raises_error_during_validation(self, caplog, tmp_path_dist_ckpt, validate_integrity, save_format): + sharded_state_dict = self._get_base_state_dict() + with TempNamedDir(tmp_path_dist_ckpt / 'test_missing_keys_raises_error_during_validation') as ckpt_dir: + save_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, save_format, 1) + save(sharded_state_dict, ckpt_dir, save_strategy) + + def load_with_flag(strict): + sharded_state_dict = self._get_base_state_dict() + del sharded_state_dict['TenA'] + del sharded_state_dict['ObjB'] + return load(sharded_state_dict, ckpt_dir, validate_access_integrity=validate_integrity, strict=strict) + + def test_error(error_msg): + assert 'Unexpected keys' not in error_msg + assert 'TenA' in error_msg + assert 'ObjB' in error_msg + assert 'Missing keys' in error_msg + + # no mismatch for `*_UNEXPECTED` flag + loaded_state_dict = load_with_flag(StrictHandling.ASSUME_OK_UNEXPECTED) + assert 'TenB' in loaded_state_dict + + loaded_state_dict = load_with_flag(StrictHandling.RAISE_UNEXPECTED) + assert 'TenB' in loaded_state_dict + + with caplog.at_level(logging.WARNING): + loaded_state_dict = load_with_flag(StrictHandling.LOG_UNEXPECTED) + assert caplog.text == '' + assert 'TenB' in loaded_state_dict + + loaded_state_dict, missing_keys, unexpected_keys = load_with_flag(StrictHandling.RETURN_UNEXPECTED) + assert 'TenB' in loaded_state_dict + assert missing_keys == set() + assert unexpected_keys == set() + + loaded_state_dict = load_with_flag(StrictHandling.IGNORE_ALL) + assert 'TenB' in loaded_state_dict + + # Informative exceptions with `RAISE_ALL` option: + with pytest.raises(CheckpointingException) as exc_info: + load_with_flag(StrictHandling.RAISE_ALL) + test_error(str(exc_info.value)) + + # Logged mismatches: + with caplog.at_level(logging.WARNING): + loaded_state_dict = load_with_flag(StrictHandling.LOG_ALL) + assert 'TenB' in loaded_state_dict + test_error(caplog.text) + + # Returned mismatches + loaded_state_dict, missing_keys, unexpected_keys = load_with_flag(StrictHandling.RETURN_ALL) + assert 'TenB' in loaded_state_dict + assert unexpected_keys == set() + assert missing_keys == {'TenA', 'ObjB'} + + @pytest.mark.parametrize('save_format', ['zarr', 'torch_dist']) + @pytest.mark.parametrize('validate_integrity', [True, False]) + def test_exact_load_handling(self, caplog, tmp_path_dist_ckpt, validate_integrity, save_format): + sharded_state_dict = self._get_base_state_dict() + with TempNamedDir(tmp_path_dist_ckpt / 'test_exact_load_handling') as ckpt_dir: + save_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, save_format, 1) + save(sharded_state_dict, ckpt_dir, save_strategy) + + def load_with_flag(strict): + sharded_state_dict = self._get_base_state_dict() + return load(sharded_state_dict, ckpt_dir, validate_access_integrity=validate_integrity, strict=strict) + + for strict in ( + StrictHandling.ASSUME_OK_UNEXPECTED, + StrictHandling.LOG_UNEXPECTED, + StrictHandling.LOG_ALL, + StrictHandling.RAISE_UNEXPECTED, + StrictHandling.RAISE_ALL, + StrictHandling.IGNORE_ALL, + ): + with caplog.at_level(logging.WARNING): + loaded_state_dict = load_with_flag(strict) + assert caplog.text == '' + assert 'TenB' in loaded_state_dict + assert 'ObjB' in loaded_state_dict + + for strict in ( + StrictHandling.RETURN_UNEXPECTED, + StrictHandling.RETURN_ALL, + ): + with caplog.at_level(logging.WARNING): + loaded_state_dict, missing_keys, unexpected_keys = load_with_flag(strict) + assert caplog.text == '' + assert 'TenB' in loaded_state_dict + assert 'ObjB' in loaded_state_dict + assert missing_keys == set() + assert unexpected_keys == set() + + @pytest.mark.parametrize('save_format', ['zarr', 'torch_dist']) + def test_sharded_metadata(self, tmp_path_dist_ckpt, save_format): + + sharded_state_dict = self._get_base_state_dict() + with TempNamedDir(tmp_path_dist_ckpt / 'test_exact_load_handling') as ckpt_dir: + save_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, save_format, 1) + save(sharded_state_dict, ckpt_dir, save_strategy) + torch.distributed.barrier() + sharded_metadata = load_sharded_metadata(ckpt_dir) + assert set(sh_base.key for sh_base in sharded_metadata.values()) == {'TenA', 'TenB', 'TenC', 'ObjA', 'ObjB'} + assert set(sharded_metadata.keys()) == { + 'TenA', 'TenB', 'TenC', + 'ObjA/shard_0_1', + *(f'ObjB/shard_0.{i}_1.8' for i in range(8)), + } + + loaded_state_dict = load(sharded_metadata, ckpt_dir, validate_access_integrity=False) + + assert loaded_state_dict['ObjA/shard_0_1'] == list(range(10)) + for shard_idx in range(8): + assert loaded_state_dict[f'ObjB/shard_0.{shard_idx}_1.8'] == {shard_idx + 7} + assert torch.all(loaded_state_dict['TenA'] == torch.arange(2)) + assert torch.all(loaded_state_dict['TenB'] == torch.arange(3).repeat(8)) + assert torch.all(loaded_state_dict['TenC'] == torch.arange(3)) diff --git a/tests/unit_tests/distributed/test_param_and_grad_buffer.py b/tests/unit_tests/distributed/test_param_and_grad_buffer.py new file mode 100644 index 0000000..14d3be7 --- /dev/null +++ b/tests/unit_tests/distributed/test_param_and_grad_buffer.py @@ -0,0 +1,175 @@ +import contextlib +import math +import pytest +import torch + +from megatron.core import parallel_state +from megatron.core.distributed import DistributedDataParallelConfig, ParamAndGradBuffer +from tests.unit_tests.test_utilities import Utils, TestModel + + +def get_model_and_buffers( + input_dim: int, + output_dim: int, + num_layers: int, + bias: bool, + bucket_size: int, + use_distributed_optimizer: bool, + overlap_grad_reduce: bool, +): + ddp_config = DistributedDataParallelConfig( + grad_reduce_in_fp32=True, + use_distributed_optimizer=use_distributed_optimizer, + overlap_grad_reduce=overlap_grad_reduce, + ) + model = TestModel(input_dim=input_dim, output_dim=output_dim, num_layers=num_layers, bias=bias) + params = list(model.parameters()) + param_to_name = {} + for name, param in model.named_parameters(): + param_to_name[param] = name + + param_and_grad_buffer = ParamAndGradBuffer( + ddp_config, + param_dtype=torch.bfloat16, + grad_dtype=torch.float32, + params=params, + data_parallel_group=parallel_state.get_data_parallel_group(), + bucket_size=bucket_size, + param_to_name=param_to_name, + gradient_scaling_factor=1.0, + ) + + return model, param_and_grad_buffer + + +@pytest.mark.parametrize("bucket_size", [None, 9999, 10000, 10001, 19999, 20000]) +@pytest.mark.parametrize("use_distributed_optimizer", [False, True]) +@pytest.mark.parametrize("bias", [False, True]) +def test_bucket_sizes(bucket_size: int, use_distributed_optimizer: bool, bias: bool): + Utils.initialize_model_parallel() + + input_dim = 100 + output_dim = 100 + num_layers = 10 + _, param_and_grad_buffer = get_model_and_buffers( + input_dim=input_dim, + output_dim=output_dim, + num_layers=num_layers, + bias=bias, + bucket_size=bucket_size, + use_distributed_optimizer=use_distributed_optimizer, + overlap_grad_reduce=False, + ) + + actual_numel_in_each_bucket = [ + bucket.numel_unpadded for bucket in param_and_grad_buffer.buckets + ] + actual_numel_padded_in_each_bucket = [ + bucket.grad_data.numel() for bucket in param_and_grad_buffer.buckets + ] + + def _pad_if_needed(numel_unpadded, divisor): + if use_distributed_optimizer: + return math.ceil(numel_unpadded / divisor) * divisor + return numel_unpadded + + def _pad_bucket_if_needed(numel_unpadded): + # Want 128-byte alignment for distributed optimizer. + divisor = math.lcm(parallel_state.get_data_parallel_world_size(), 128) + return _pad_if_needed(numel_unpadded, divisor) + + def _pad_param_if_needed(numel_unpadded): + # Want 64-byte alignment for params. + return _pad_if_needed(numel_unpadded, 64) + + if bucket_size is None: + # If bucket_size is infinite (None), number of buckets should be 1. + assert len(param_and_grad_buffer.buckets) == 1 + else: + # Else, compute number of buckets. + numel_in_each_bucket = [] + numel_padded_in_each_bucket = [] + numel_in_last_bucket = 0 + param_sizes = [] + for _ in range(num_layers): + param_sizes.append(input_dim * output_dim) + if bias: # Include bias term. + param_sizes.append(output_dim) + # Iterate through params in backward direction. + for param_size in param_sizes[::-1]: + numel_in_last_bucket = _pad_param_if_needed(numel_in_last_bucket) + numel_in_last_bucket += param_size + if numel_in_last_bucket >= bucket_size: + numel_in_each_bucket.append(numel_in_last_bucket) + numel_padded_in_each_bucket.append(_pad_bucket_if_needed(numel_in_last_bucket)) + numel_in_last_bucket = 0 + if numel_in_last_bucket > 0: + numel_in_each_bucket.append(numel_in_last_bucket) + numel_padded_in_each_bucket.append(_pad_bucket_if_needed(numel_in_last_bucket)) + + assert len(param_and_grad_buffer.buckets) == len( + numel_in_each_bucket + ), f"Buckets don't match (got {actual_numel_in_each_bucket} but should be {numel_in_each_bucket})" + assert actual_numel_in_each_bucket == numel_in_each_bucket, ( + f"Number of parameters in each bucket should be {numel_in_each_bucket}, " + f"but is {actual_numel_in_each_bucket}" + ) + assert actual_numel_padded_in_each_bucket == numel_padded_in_each_bucket, ( + f"Number of parameters in each padded bucket should be {numel_padded_in_each_bucket}, " + f"but is {actual_numel_padded_in_each_bucket}" + ) + + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize("use_distributed_optimizer", [False, True]) +@pytest.mark.parametrize("overlap_grad_reduce", [False, True]) +def test_grad_sync(use_distributed_optimizer: bool, overlap_grad_reduce: bool): + Utils.initialize_model_parallel() + + input_dim = 100 + output_dim = 100 + num_layers = 10 + model, param_and_grad_buffer = get_model_and_buffers( + input_dim=input_dim, + output_dim=output_dim, + num_layers=num_layers, + bias=True, + bucket_size=None, # Group all params into single bucket. + use_distributed_optimizer=use_distributed_optimizer, + overlap_grad_reduce=overlap_grad_reduce, + ) + + param_and_grad_buffer.grad_data.data.fill_(1.0) + expected_grad_data_value_after_collective = 1 + if torch.distributed.get_rank() == 0 or not use_distributed_optimizer: + expected_grad_data_value_after_collective = parallel_state.get_data_parallel_world_size() + + params = list(model.parameters()) + for i, param in enumerate(params): + register_grad_sync_context = ( + contextlib.nullcontext() if overlap_grad_reduce else pytest.raises(AssertionError) + ) + finish_grad_sync_context = contextlib.nullcontext() + if i < (len(params) - 1) and overlap_grad_reduce: + # Can't finish grad sync until all params have been registered ready. + finish_grad_sync_context = pytest.raises(AssertionError) + + with register_grad_sync_context: + param_and_grad_buffer.register_grad_ready(param) + with finish_grad_sync_context: + # When overlap_grad_reduce is True, this should throw an assertion error until all + # params in the model have registered their grad above. + # When overlap_grad_reduce is False, the collective is forced through. + param_and_grad_buffer.finish_grad_sync() + + expected_grad_data_value = expected_grad_data_value_after_collective + if overlap_grad_reduce and i < (len(params) - 1): + expected_grad_data_value = 1 + assert int(param_and_grad_buffer.grad_data[0]) == expected_grad_data_value + + if not overlap_grad_reduce: + # Reset grad_data for subsequent collectives. + param_and_grad_buffer.grad_data.data.fill_(1.0) + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/fusions/test_torch_softmax.py b/tests/unit_tests/fusions/test_torch_softmax.py new file mode 100644 index 0000000..e09c089 --- /dev/null +++ b/tests/unit_tests/fusions/test_torch_softmax.py @@ -0,0 +1,44 @@ +import pytest +import torch + +from megatron.core.fusions.fused_softmax import FusedScaleMaskSoftmax +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.utils import attention_mask_func + + +class TestTorchSoftmax: + def setup_method(self, method): + # The important settings tested are forward_torch_softmax path + # with locally generated casual mask for attention_mask_func: + self.softmax = FusedScaleMaskSoftmax( + input_in_fp16=False, + input_in_bf16=False, + attn_mask_type=AttnMaskType.causal, + scaled_masked_softmax_fusion=False, + mask_func=attention_mask_func, + softmax_in_fp32=True, + scale=None, + ) + + def test_output_shape(self): + x = torch.randn(8, 2, 4, 4, device="cuda") + y = self.softmax(x, None) + assert x.shape == y.shape + + def test_causal_mask_input_shape_assert(self): + x = torch.randn(1, 1, 4, 16, device="cuda") + with pytest.raises(AssertionError): + self.softmax(x, None) + + def test_causal_mask_equal_scores(self): + # For equal input values (e.g. zero) correctly masked softmax should + # produce equal scores among non-masked elements. For example, in case + # sq == sk == 2 the expected output is (ignoring b and np dimensions): + # [[1.0, 0.0], + # [0.5, 0.5]] + b, np, sq, sk = 8, 2, 32, 32 + x = torch.zeros([b, np, sq, sk]).cuda() + y = self.softmax(x, None) + y_expected = torch.tril(torch.ones(b, np, sq, sk, device="cuda")) + y_expected /= torch.arange(1, sq + 1, device="cuda").reshape((-1, 1)) + assert torch.allclose(y, y_expected, rtol=1e-08, atol=1e-08) diff --git a/tests/unit_tests/inference/__init__.py b/tests/unit_tests/inference/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/inference/engines/__init__.py b/tests/unit_tests/inference/engines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/inference/engines/test_mcore_engine.py b/tests/unit_tests/inference/engines/test_mcore_engine.py new file mode 100644 index 0000000..dc6aba2 --- /dev/null +++ b/tests/unit_tests/inference/engines/test_mcore_engine.py @@ -0,0 +1,64 @@ +from typing import List +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig +import torch +import random +import string + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.engines.mcore_engine import MCoreEngine +from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import SimpleTextGenerationController +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils +from unittest import mock + +class TestMCoreEngine: + def setup_method(self, method): + Utils.initialize_model_parallel(tensor_model_parallel_size=1,pipeline_model_parallel_size=1) + model_parallel_cuda_manual_seed(123) + self.batch_size = 4 + self.hidden_size = 12 + self.vocab_size = 100 + self.sequence_length = 64 + transformer_config = TransformerConfig(num_layers=4, hidden_size=self.hidden_size, num_attention_heads=4, use_cpu_initialization=True) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=self.vocab_size, + max_sequence_length=self.sequence_length, + parallel_output = True).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=self.hidden_size, + inference_batch_times_seqlen_threshold=400, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size + ) + + inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_wrapper_config) + self.mock_tokenizer = mock.Mock() + text_generation_controller = SimpleTextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=self.mock_tokenizer) + + self.mcore_engine = MCoreEngine(text_generation_controller=text_generation_controller, max_batch_size=4) + + def test_generate(self): + self.mock_tokenizer.vocab_size = self.vocab_size + self.mock_tokenizer.eod = self.vocab_size - 1 + # Generating random length integer prompts + self.mock_tokenizer.tokenize.return_value = [random.randint(0, self.vocab_size -1) for _ in range(random.randint(5,10))] + # Generates some random string + self.mock_tokenizer.detokenize.return_value = ''.join(random.choices(string.ascii_letters, k=random.randint(4,10))) + + prompts = ["sample"*(i+1) for i in range(self.batch_size)] + results : List[InferenceRequest] = self.mcore_engine.generate(prompts, common_inference_params=CommonInferenceParams(num_tokens_to_generate=10)) + + for result in results: + assert result.status == Status.COMPLETED, f"Status should be completed but its {result.status}" + assert result.generated_length > 0 , f"Generated length should be greater than zero" + assert result.generated_text is not None , f'Generated text should not be None' diff --git a/tests/unit_tests/inference/model_inference_wrappers/__init__.py b/tests/unit_tests/inference/model_inference_wrappers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py b/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py new file mode 100644 index 0000000..c6c2152 --- /dev/null +++ b/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py @@ -0,0 +1,81 @@ +from argparse import Namespace +from megatron.core import parallel_state +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig +import torch +from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec, get_gpt_layer_with_transformer_engine_spec +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed + +class TestGPTInferenceWrapper: + + def setup_model(self, tensor_parallel_size, pipeline_parallel_size): + Utils.initialize_model_parallel(tensor_model_parallel_size=tensor_parallel_size,pipeline_model_parallel_size=pipeline_parallel_size) + model_parallel_cuda_manual_seed(123) + self.vocab_size = 100 + self.batch_size = 4 + self.sequence_length = 32 + hidden_size = 12 + + transformer_config = TransformerConfig(num_layers=4, hidden_size=hidden_size, num_attention_heads=4, use_cpu_initialization=True) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=self.vocab_size, + max_sequence_length=self.sequence_length, + parallel_output = True).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + inference_batch_times_seqlen_threshold=20, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size + ) + + self.inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_wrapper_config) + + # This will call the inference_wrapped_model.forward_pass_with_pipeline_parallel_small_input_batch() + def test_inference_pipeline_parallel_small_size(self): + self.setup_model(tensor_parallel_size=2, pipeline_parallel_size=2) + + batch_prompt_tokens = torch.randint(low = 0, high = self.vocab_size, size=(self.batch_size, self.sequence_length)).int().cuda() + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=batch_prompt_tokens) + + inference_input = self.inference_wrapped_model.get_batch_for_context_window(0, 5) + + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + # Logits are not returned in all ranks in PP + if parallel_state.is_pipeline_last_stage(): + assert logits.shape == (self.batch_size, 5, self.vocab_size), f"Shape mismatch . Expected {(self.batch_size, 5, self.vocab_size)}, but got {logits.shape}" + + + # This will call the inference_wrapped_model.forward_pass_with_pipeline_parallel_large_input_batch() + def test_inference_pipeline_parallel_large__size(self): + self.setup_model(tensor_parallel_size=2, pipeline_parallel_size=2) + + batch_prompt_tokens = torch.randint(low = 0, high = self.vocab_size, size=(self.batch_size, self.sequence_length)).int().cuda() + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=batch_prompt_tokens) + + inference_input = self.inference_wrapped_model.get_batch_for_context_window(0, 10) + + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + + if parallel_state.is_pipeline_last_stage(): + assert logits.shape == (self.batch_size, 10, self.vocab_size), f"Shape mismatch . Expected {(self.batch_size,10, self.vocab_size)}, but got {logits.shape}" + + + def test_inference_only_tensor_parallel(self): + self.setup_model(tensor_parallel_size=4, pipeline_parallel_size=1) + + batch_prompt_tokens = torch.randint(low = 0, high = self.vocab_size, size=(self.batch_size, self.sequence_length)).int().cuda() + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=batch_prompt_tokens) + + inference_input = self.inference_wrapped_model.get_batch_for_context_window(0, 5) + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + + assert logits.shape == (self.batch_size, 5, self.vocab_size), f"Shape mismatch . Expected {(self.batch_size, 5, self.vocab_size)}, but got {logits.shape}" + diff --git a/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py b/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py new file mode 100644 index 0000000..5c6f422 --- /dev/null +++ b/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py @@ -0,0 +1,15 @@ +import torch +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig + +class TestModelInferenceWrapperConfig: + + def test_inference_params(self): + inference_parameters = InferenceWrapperConfig( + hidden_size=10, + inference_batch_times_seqlen_threshold=10, + padded_vocab_size=10, + params_dtype=torch.float, + fp32_residual_connection=False + ) + inference_parameters.add_attributes({"abc": 45}) + assert inference_parameters.abc == 45, f"min tokens not set correctly. it is {inference_parameters.min_tokens}" \ No newline at end of file diff --git a/tests/unit_tests/inference/test_common_inference_params.py b/tests/unit_tests/inference/test_common_inference_params.py new file mode 100644 index 0000000..c22a72d --- /dev/null +++ b/tests/unit_tests/inference/test_common_inference_params.py @@ -0,0 +1,8 @@ +from megatron.core.inference.common_inference_params import CommonInferenceParams + +class TestCommonInferenceParams: + + def test_inference_params(self): + inference_parameters = CommonInferenceParams() + inference_parameters.add_attributes({"min_tokens": 45}) + assert inference_parameters.min_tokens == 45, f"min tokens not set correctly. it is {inference_parameters.min_tokens}" \ No newline at end of file diff --git a/tests/unit_tests/inference/test_inference_utils.py b/tests/unit_tests/inference/test_inference_utils.py new file mode 100644 index 0000000..7f00619 --- /dev/null +++ b/tests/unit_tests/inference/test_inference_utils.py @@ -0,0 +1,11 @@ +from megatron.core.inference.utils import Counter + +class TestInferenceUtils: + + def test_counter(self): + counter = Counter() + r = next(counter) + assert r == 0, f'Counter return value should be 0 but it is {r}' + assert counter.counter == 1, f'Counter should be 1 but it is {counter.counter}' + counter.reset() + assert counter.counter == 0, f'Counter should be 0 but it is {counter.counter}' diff --git a/tests/unit_tests/inference/test_modelopt_gpt_model.py b/tests/unit_tests/inference/test_modelopt_gpt_model.py new file mode 100644 index 0000000..4b2d7de --- /dev/null +++ b/tests/unit_tests/inference/test_modelopt_gpt_model.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.inference.ammo_support.gpt.model_specs import get_gpt_layer_modelopt_spec +from megatron.core.inference.ammo_support.gpt.state_dict_hooks import mcore_gpt_load_te_state_dict_pre_hook + + +class TestModelOptGPTModel: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + use_cpu_initialization=True, + ) + self.gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), + vocab_size=100, + max_sequence_length=4, + ) + # Ensure that a GPTModel can be built with the modelopt spec. + self.modelopt_gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_modelopt_spec(), + vocab_size=100, + max_sequence_length=4, + ) + + def test_load_te_state_dict_pre_hook(self): + handle = self.modelopt_gpt_model._register_load_state_dict_pre_hook( + mcore_gpt_load_te_state_dict_pre_hook + ) + self.modelopt_gpt_model.load_state_dict(self.gpt_model.state_dict()) + handle.remove() + + def teardown_method(self, method): + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/inference/test_scheduler.py b/tests/unit_tests/inference/test_scheduler.py new file mode 100644 index 0000000..57e0810 --- /dev/null +++ b/tests/unit_tests/inference/test_scheduler.py @@ -0,0 +1,63 @@ +from typing import Dict +import torch +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.scheduler import Scheduler + +class TestScheduler: + + def setup_method(self, method): + self.max_batch_size = 4 + self.scheduler = Scheduler(max_batch_size=self.max_batch_size) + assert len(self.scheduler.active_request_pool) == 0, "Active request pool should be empty on initalization" + assert len(self.scheduler.waiting_request_pool) == 0, "Waiting request pool should be empty on initalization" + assert len(self.scheduler.completed_request_pool) == 0, "Completed request pool should be empty on initalization" + + def test_scheduler(self): + prompt = "sample prompt" + prompt_tokens = torch.randn(5) + inference_parameters = CommonInferenceParams() + + for i in range(self.max_batch_size): + self.scheduler.add_request(prompt, prompt_tokens, inference_parameters) + assert len(self.scheduler.active_request_pool) == i + 1, f"Active request pool should have {i+1} requests, but it has only {len(self.scheduler.active_request_pool)}" + + self.scheduler.add_request(prompt, prompt_tokens, inference_parameters) + assert len(self.scheduler.waiting_request_pool) == 1, f"Waiting request pool should have 1 request but it has {len(self.scheduler.waiting_request_pool)} requests" + + waiting_request: InferenceRequest = list(self.scheduler.waiting_request_pool.values())[0] + assert waiting_request.status == Status.WAITING_IN_QUEUE, f"Status should be WAITING_IN_QUEUE, but its {waiting_request.status} for the waiting request" + + assert self.scheduler.have_requests_pending(), "Scheduler should have requests pending, but it seems to be having no requests" + + active_request_dict: Dict[int, InferenceRequest] = self.scheduler.active_request_pool + for request_id, request in active_request_dict.items(): + # Mark every even request compelted + if int(request_id) % 2 == 0: + request.status = Status.COMPLETED + + self.scheduler.update_requests_pools(active_request_dict) + assert len(self.scheduler.active_request_pool) == 3, f"Active request pool should have 3 requests, but it has {len(self.scheduler.active_request_pool)}" + + assert len(self.scheduler.waiting_request_pool) == 0, f"Waiting request pool should be empty but it has {len(self.scheduler.waiting_request_pool)} requests" + + assert len(self.scheduler.completed_request_pool) == 2, f"Completed request pool should have 2 requests but it has {len(self.scheduler.completed_request_pool)} requests " + + active_request_dict: Dict[int, InferenceRequest] = self.scheduler.active_request_pool + for request_id, request in active_request_dict.items(): + # Mark all requests compelted + request.status = Status.COMPLETED + + self.scheduler.update_requests_pools(active_request_dict) + assert len(self.scheduler.active_request_pool) == 0, f"Active request pool should be empty, but it has {len(self.scheduler.active_request_pool)}" + + assert len(self.scheduler.waiting_request_pool) == 0, f"Waiting request pool should be empty but it has {len(self.scheduler.waiting_request_pool)} requests" + + assert len(self.scheduler.completed_request_pool) == 5, f"Completed request pool should have 5 requests but it has {len(self.scheduler.completed_request_pool)} requests " + + assert self.scheduler.have_requests_pending() == False, "Scheduler should not have any requests pending" + + + + + \ No newline at end of file diff --git a/tests/unit_tests/inference/text_generation_controllers/__init__.py b/tests/unit_tests/inference/text_generation_controllers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py new file mode 100644 index 0000000..ede1ecb --- /dev/null +++ b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py @@ -0,0 +1,115 @@ + +from collections import OrderedDict +from typing import Dict +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig +import torch +import random +import string +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import SimpleTextGenerationController +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from unittest import mock +import pytest +import time + +from tests.unit_tests.test_utilities import Utils + +class TestTextGenerationController: + + def setup_method(self, method): + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=2) + model_parallel_cuda_manual_seed(123) + self.batch_size = 4 + self.hidden_size = 12 + self.vocab_size = 100 + self.sequence_length = 64 + transformer_config = TransformerConfig(num_layers=4, hidden_size=self.hidden_size, num_attention_heads=4, use_cpu_initialization=True) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=self.vocab_size, + max_sequence_length=self.sequence_length, + parallel_output = True).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=self.hidden_size, + inference_batch_times_seqlen_threshold=20, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size + ) + + inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_wrapper_config) + + self.mock_tokenizer = mock.Mock() + + self.text_generation_controller = SimpleTextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=self.mock_tokenizer) + + def test_sample_from_logits(self): + with pytest.raises(AssertionError) as aerror: + self.text_generation_controller.sample_from_logits(last_token_logits=None, common_inference_params=CommonInferenceParams(top_k=2, top_p=0.4), vocab_size=self.vocab_size ) + assert str(aerror.value) == 'Cannot have top-p and top-k both greater than zero' + + with pytest.raises(AssertionError) as aerror: + self.text_generation_controller.sample_from_logits(last_token_logits=None, common_inference_params=CommonInferenceParams(top_p=1.4, top_k=0), vocab_size=self.vocab_size ) + assert str(aerror.value) == 'top-p should be in (0,1]' + + with pytest.raises(AssertionError) as aerror: + self.text_generation_controller.sample_from_logits(last_token_logits=torch.randn(self.batch_size, 1), common_inference_params=CommonInferenceParams(top_k = self.vocab_size + 10), vocab_size=self.vocab_size) + assert str(aerror.value) == 'top-k is larger than logit size.' + + + last_token_logits = torch.arange(0, self.vocab_size).repeat(self.batch_size,1).float().cuda() + sampled_logits = self.text_generation_controller.sample_from_logits(last_token_logits, CommonInferenceParams(top_k=1), self.vocab_size) + assert torch.all(sampled_logits.cpu() == torch.ones(self.batch_size) * self.vocab_size - 1), f"The sampled logits should all be {self.vocab_size} but its {sampled_logits}" + + sampled_logits = self.text_generation_controller.sample_from_logits(last_token_logits, CommonInferenceParams(top_k=2), self.vocab_size) + assert torch.all(sampled_logits >= self.vocab_size - 2), f"The sampled logits should all be greater than {self.vocab_size-2} but its {sampled_logits}" + + l = last_token_logits[0] + top_p = 0.3 + expected_min_value = l[l.softmax(dim=-1).cumsum(dim=-1) > top_p][0].item() + sampled_logits = self.text_generation_controller.sample_from_logits(last_token_logits, CommonInferenceParams(top_p=top_p, top_k=0), self.vocab_size) + assert torch.all(sampled_logits >= expected_min_value), f"The sampled logits should all be greater than {expected_min_value} but its {sampled_logits}" + + top_p = 0.95 + temperature=2 + expected_min_value = l[l.div_(temperature).softmax(dim=-1).cumsum(dim=-1) > top_p][0].item() + sampled_logits = self.text_generation_controller.sample_from_logits(last_token_logits, CommonInferenceParams(top_p=top_p, temperature=temperature, top_k=0), self.vocab_size) + assert torch.all(sampled_logits >= expected_min_value), f"The sampled logits should all be greater than {expected_min_value} but its {sampled_logits}" + + def test_generate_all_output_tokens_static_batch(self): + self.mock_tokenizer.vocab_size = self.vocab_size + self.mock_tokenizer.eod = self.vocab_size - 1 + self.mock_tokenizer.detokenize.return_value = ''.join(random.choices(string.ascii_letters, k=random.randint(4,10))) + + active_requests: Dict[int, InferenceRequest] = OrderedDict() + for i in range(self.batch_size): + prompt = "sample" * (i+1) + self.mock_tokenizer.tokenize.return_value = torch.randn(self.batch_size, self.vocab_size).cuda() + inference_request = InferenceRequest( + request_id=i, + prompt=prompt, + inference_parameters=CommonInferenceParams(num_tokens_to_generate=10), + arrival_time=time.time(), + prompt_tokens=torch.randint(low=0, high=self.vocab_size - 1, size=(len(prompt),)).tolist(), + status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS + ) + active_requests[i] = inference_request + + requests = self.text_generation_controller.generate_all_output_tokens_static_batch(active_requests) + + for request_id, request in requests.items(): + assert request.status == Status.COMPLETED, f"Status should be completed but its {request.status}" + assert request.generated_length > 0 , f"Generated length should be greater than zero" + assert request.generated_text is not None, "Generated text should not be None" + + + + \ No newline at end of file diff --git a/tests/unit_tests/models/__init__.py b/tests/unit_tests/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/models/test_base_embedding.py b/tests/unit_tests/models/test_base_embedding.py new file mode 100644 index 0000000..511b026 --- /dev/null +++ b/tests/unit_tests/models/test_base_embedding.py @@ -0,0 +1,58 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding +from tests.unit_tests.test_utilities import Utils + + +class TestBaseEmbedding: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) + self.base_embedding = LanguageModelEmbedding( + config=transformer_config, vocab_size=100, max_sequence_length=4, position_embedding_type='learned_absolute') + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.base_embedding, LanguageModelEmbedding) + num_weights = sum([p.numel() + for p in self.base_embedding.parameters()]) + assert num_weights == 1248 + + def test_zero_parameters(self): + sum_weights = sum([p.sum() for p in self.base_embedding.parameters()]) + assert sum_weights != 0 + self.base_embedding.zero_parameters() + sum_weights = sum([p.sum() for p in self.base_embedding.parameters()]) + assert sum_weights == 0 + + def test_cpu_forward(self): + input_ids = torch.tensor( + [0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)) + position_ids = torch.tensor( + [0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)) + embeddings = self.base_embedding(input_ids, position_ids) + assert embeddings.device.type == 'cpu' + assert embeddings.shape[0] == self.base_embedding.max_sequence_length + assert embeddings.shape[1] == input_ids.shape[0] + assert embeddings.shape[2] == self.base_embedding.config.hidden_size + + def test_gpu_forward(self): + self.base_embedding.cuda() + input_ids = torch.tensor( + [0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda() + position_ids = torch.tensor( + [0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda() + embeddings = self.base_embedding(input_ids, position_ids) + assert embeddings.device.type == 'cuda' + assert embeddings.shape[0] == self.base_embedding.max_sequence_length + assert embeddings.shape[1] == input_ids.shape[0] + assert embeddings.shape[2] == self.base_embedding.config.hidden_size diff --git a/tests/unit_tests/models/test_bert_model.py b/tests/unit_tests/models/test_bert_model.py new file mode 100644 index 0000000..e1d0155 --- /dev/null +++ b/tests/unit_tests/models/test_bert_model.py @@ -0,0 +1,77 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch +import os + +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.bert.bert_model import BertModel +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec + +class TestBertModel: + + def setup_method(self, method): + os.environ['NVTE_ALLOW_NONDETERMINISTIC_ALGO'] = '0' #Bert does not support flash attention + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True, perform_initialization=True) + self.bert_model = BertModel(config=transformer_config, num_tokentypes=0, transformer_layer_spec=bert_layer_with_transformer_engine_spec, vocab_size=100, max_sequence_length=4) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.bert_model, BertModel) + + assert self.bert_model.max_sequence_length == 4 + + num_weights = sum([p.numel() for p in self.bert_model.parameters()]) + assert num_weights == 6702 + + def test_set_input_tensor(self): + config: TransformerConfig = self.bert_model.config + sequence_length = self.bert_model.max_sequence_length + micro_batch_size = 2 + + # [sequence length, batch size, hidden size] + input_tensor = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + + self.bert_model.set_input_tensor(input_tensor) + + assert self.bert_model.encoder.input_tensor.shape[0] == sequence_length + assert self.bert_model.encoder.input_tensor.shape[1] == micro_batch_size + assert self.bert_model.encoder.input_tensor.shape[2] == config.hidden_size + + def test_post_process_forward(self): + config: TransformerConfig = self.bert_model.config + sequence_length = self.bert_model.max_sequence_length + micro_batch_size = 2 + + self.bert_model.cuda() + + data = list(range(sequence_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + attention_mask = torch.ones((micro_batch_size, sequence_length), dtype=bool).cuda() + + logits = self.bert_model.forward(input_ids=input_ids, attention_mask=attention_mask) + + assert logits[0].shape[0] == micro_batch_size + assert logits[0].shape[1] == sequence_length + assert logits[0].shape[2] == self.bert_model.vocab_size + + def test_no_post_process_forward(self): + pass + + def test_no_preprocess_forward(self): + pass + + def test_state_dict_for_save_checkpoint(self): + pass + + def test_load_state_dict(self): + pass + diff --git a/tests/unit_tests/models/test_clip_vit_model.py b/tests/unit_tests/models/test_clip_vit_model.py new file mode 100644 index 0000000..b20ab2d --- /dev/null +++ b/tests/unit_tests/models/test_clip_vit_model.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import pytest +import torch + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.models.vision.clip_vit_model import CLIPViTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestCLIPViTModel: + """Test CLIP ViT model.""" + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True + ) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec() + self.model = CLIPViTModel(transformer_config, transformer_layer_spec) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.model, CLIPViTModel) + + num_weights = sum([p.numel() for p in self.model.parameters()]) + assert num_weights == 174720 + + def test_set_input_tensor(self): + # [s, b, h] expected to the transformer. + expected_shape = (577, 2, 64) + input_tensor = torch.zeros(expected_shape) + + self.model.set_input_tensor(input_tensor) + + assert self.model.decoder.input_tensor.shape == torch.Size(expected_shape) + + def test_forward(self): + self.model.cuda() + + img = torch.zeros((2, 3, 336, 336)).cuda() + + out = self.model.forward(img) + assert out.shape == torch.Size([2, 577, 64]) + + def test_save_load(self, tmp_path): + path = tmp_path / "model.pt" + torch.save(self.model.state_dict(), path) + + self.model.load_state_dict(torch.load(path)) diff --git a/tests/unit_tests/models/test_gpt_model.py b/tests/unit_tests/models/test_gpt_model.py new file mode 100644 index 0000000..08a7dd0 --- /dev/null +++ b/tests/unit_tests/models/test_gpt_model.py @@ -0,0 +1,75 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + +class TestGPTModel: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) + self.gpt_model = GPTModel(config=transformer_config, transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), vocab_size=100, max_sequence_length=4) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.gpt_model, GPTModel) + + assert self.gpt_model.max_sequence_length == 4 + + num_weights = sum([p.numel() for p in self.gpt_model.parameters()]) + assert num_weights == 6240 + + def test_set_input_tensor(self): + config: TransformerConfig = self.gpt_model.config + sequence_length = self.gpt_model.max_sequence_length + micro_batch_size = 2 + + # [sequence length, batch size, hidden size] + input_tensor = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + + self.gpt_model.set_input_tensor(input_tensor) + + assert self.gpt_model.decoder.input_tensor.shape[0] == sequence_length + assert self.gpt_model.decoder.input_tensor.shape[1] == micro_batch_size + assert self.gpt_model.decoder.input_tensor.shape[2] == config.hidden_size + + def test_post_process_forward(self): + config: TransformerConfig = self.gpt_model.config + sequence_length = self.gpt_model.max_sequence_length + micro_batch_size = 2 + + self.gpt_model.cuda() + + data = list(range(sequence_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + logits = self.gpt_model.forward(input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask) + + assert logits.shape[0] == micro_batch_size + assert logits.shape[1] == sequence_length + assert logits.shape[2] == self.gpt_model.vocab_size + + def test_no_post_process_forward(self): + pass + + def test_no_preprocess_forward(self): + pass + + def test_state_dict_for_save_checkpoint(self): + pass + + def test_load_state_dict(self): + pass + diff --git a/tests/unit_tests/models/test_llava_model.py b/tests/unit_tests/models/test_llava_model.py new file mode 100644 index 0000000..07609ca --- /dev/null +++ b/tests/unit_tests/models/test_llava_model.py @@ -0,0 +1,122 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from copy import deepcopy + +import pytest +import torch + +from megatron.core import InferenceParams +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.models.multimodal.llava_model import LLaVAModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestLLaVAModel: + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + + language_config = TransformerConfig( + num_layers=3, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True + ) + vision_config = TransformerConfig( + num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True + ) + vision_projection_config = TransformerConfig( + num_layers=2, + hidden_size=128, + ffn_hidden_size=72, + num_attention_heads=1, + use_cpu_initialization=True, + ) + + language_layer_spec = get_gpt_layer_with_transformer_engine_spec() + vision_layer_spec = deepcopy(language_layer_spec) + vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules) + + self.model = LLaVAModel( + language_transformer_config=language_config, + language_transformer_layer_spec=language_layer_spec, + language_vocab_size=2048, + language_max_sequence_length=1024, + vision_transformer_config=vision_config, + vision_transformer_layer_spec=vision_layer_spec, + drop_vision_class_token=False, + vision_projection_config=vision_projection_config, + vision_projection_layer_spec=vision_projection_spec, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.model, LLaVAModel) + + num_weights = sum([p.numel() for p in self.model.parameters()]) + assert num_weights == 1439304 + + def test_set_input_tensor(self): + expected_shape = (1, 2, 3, 4) + input_tensor = torch.zeros(expected_shape) + self.model.set_input_tensor(input_tensor) + assert self.model.vision_model.decoder.input_tensor.shape == expected_shape + + def test_forward(self): + self.model.cuda() + + img = torch.randn((2, 3, 336, 336)).cuda() + input_ids = torch.randint(0, 2048, (2, 1024)).cuda() + position_ids = torch.arange(0, 1024, dtype=torch.int).cuda() + position_ids = position_ids.expand(2, 1024) + # With default image and patch sizes of 336 and 14, respectively, and a class token, the combined sequence length is 1024 + (336/14) ** 2 + 1 = 1601. + attention_mask = torch.tril(torch.ones((2, 1, 1601, 1601))).cuda() + attention_mask = attention_mask < 0.5 + labels = torch.randint(0, 2048, (2, 1601)).cuda() + + # Try with labels. + loss = self.model.forward(img, input_ids, position_ids, attention_mask, labels) + assert loss.shape == torch.Size((2, 1601)) + + # Try without labels and without inference params. + logits = self.model.forward(img, input_ids, position_ids, attention_mask, labels=None) + assert logits.shape == torch.Size((2, 1601, 2048)) + + # Try without labels and with inference params. + inference_params = InferenceParams(2, 1601) + logits = self.model.forward( + img, + input_ids, + position_ids, + attention_mask, + labels=None, + inference_params=inference_params, + ) + assert logits.shape == torch.Size((2, 1601, 2048)) + + # Check KV cache got created correctly. + kv_dict = inference_params.key_value_memory_dict + + assert kv_dict["image_tokens_count"] == 577 + for layer_no in range(1, 4): # 3 layers in the model. + layer_kv = kv_dict[layer_no] + # Expected shape is [sequence_len, batch_size, num_heads, hidden_size_per_head] + assert layer_kv[0].shape == layer_kv[1].shape == torch.Size((1601, 2, 8, 16)) + + def test_save_load(self, tmp_path): + path = tmp_path / "model.pt" + torch.save(self.model.state_dict(), path) + + self.model.load_state_dict(torch.load(path)) + + def test_freeze(self): + self.model.freeze( + freeze_language_model=True, freeze_vision_model=True, freeze_vision_projection=False + ) + + for module in [self.model.language_model, self.model.vision_model]: + for param in module.parameters(): + assert not param.requires_grad + + for param in self.model.vision_projection.parameters(): + assert param.requires_grad diff --git a/tests/unit_tests/models/test_multimodal_projector.py b/tests/unit_tests/models/test_multimodal_projector.py new file mode 100644 index 0000000..f5ef29c --- /dev/null +++ b/tests/unit_tests/models/test_multimodal_projector.py @@ -0,0 +1,68 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.vision.multimodal_projector import MultimodalProjector +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec +from megatron.core.transformer.mlp import MLPSubmodules +from megatron.core.tensor_parallel.layers import ColumnParallelLinear + + +class TestMultimodalProjector: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig(num_layers=1, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True) + mlp_layer_spec = _get_mlp_module_spec().submodules + + affine_layer_spec = MLPSubmodules( + linear_fc1=ColumnParallelLinear, + linear_fc2=None, + ) + self.mlp = MultimodalProjector(config = transformer_config, submodules = mlp_layer_spec, projector_type = "mlp", input_size = 1024) + self.affine = MultimodalProjector(config = transformer_config, submodules = affine_layer_spec, projector_type = "affine", input_size = 1024) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.mlp, MultimodalProjector) + assert isinstance(self.affine, MultimodalProjector) + + num_weights = sum([p.numel() for p in self.mlp.parameters()]) + assert num_weights == 280896 + + num_weights = sum([p.numel() for p in self.affine.parameters()]) + assert num_weights == 65600 + + def test_forward(self): + self.mlp.cuda() + self.affine.cuda() + + image_projection = torch.zeros((2, 1024)).cuda() + + logits = self.mlp.forward(image_projection) + assert len(logits) == 2 + assert logits.shape == torch.Size([2, 64]) + + logits = self.affine.forward(image_projection) + assert len(logits) == 2 + assert logits.shape == torch.Size([2, 64]) + + def test_save_load(self, tmp_path): + path = tmp_path / "mlp.pt" + torch.save(self.mlp.state_dict(), path) + + self.mlp.load_state_dict(torch.load(path)) + + path = tmp_path / "affine.pt" + torch.save(self.affine.state_dict(), path) + + self.affine.load_state_dict(torch.load(path)) + diff --git a/tests/unit_tests/models/test_t5_model.py b/tests/unit_tests/models/test_t5_model.py new file mode 100644 index 0000000..c3d925f --- /dev/null +++ b/tests/unit_tests/models/test_t5_model.py @@ -0,0 +1,85 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.T5.t5_model import T5Model +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.models.T5.t5_spec import (get_t5_encoder_with_transformer_engine_block_spec, + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_local_block_spec, + get_t5_decoder_with_local_block_spec) + +class TestT5Model: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig(num_layers=12, hidden_size=768, num_attention_heads=12, kv_channels=64, ffn_hidden_size=3072, use_cpu_initialization=True) + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec(12) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec(12) + self.t5_model = T5Model(config=transformer_config, transformer_encoder_layer_spec=en_block_spec, transformer_decoder_layer_spec=de_block_spec, vocab_size=29184, max_sequence_length=4) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.t5_model, T5Model) + + assert self.t5_model.max_sequence_length == 4 + + def test_set_input_tensor(self): + config: TransformerConfig = self.t5_model.config + sequence_length = self.t5_model.max_sequence_length + micro_batch_size = 2 + + # [sequence length, batch size, hidden size] + input_tensor = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + + self.t5_model.set_input_tensor(input_tensor) + + assert self.t5_model.encoder.input_tensor.shape[0] == sequence_length + assert self.t5_model.encoder.input_tensor.shape[1] == micro_batch_size + assert self.t5_model.encoder.input_tensor.shape[2] == config.hidden_size + + def test_post_process_forward(self): + config: TransformerConfig = self.t5_model.config + sequence_length = self.t5_model.max_sequence_length + micro_batch_size = 2 + + self.t5_model.cuda() + + data = list(range(sequence_length)) + encoder_input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + decoder_input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + encoder_attn_mask = torch.ones((1, sequence_length, sequence_length), dtype=bool).cuda() + decoder_attn_mask = torch.ones((1, sequence_length, sequence_length), dtype=bool).cuda() + encoder_decoder_attn_mask = torch.ones((1, sequence_length, sequence_length), dtype=bool).cuda() + + logits = self.t5_model.forward( + encoder_input_ids=encoder_input_ids, + decoder_input_ids=decoder_input_ids, + encoder_attn_mask=encoder_attn_mask, + decoder_attn_mask=decoder_attn_mask, + encoder_decoder_attn_mask=encoder_decoder_attn_mask + ) + + assert logits.shape[0] == micro_batch_size + assert logits.shape[1] == sequence_length + assert logits.shape[2] == self.t5_model.vocab_size + + def test_no_post_process_forward(self): + pass + + def test_no_preprocess_forward(self): + pass + + def test_state_dict_for_save_checkpoint(self): + pass + + def test_load_state_dict(self): + pass + diff --git a/tests/unit_tests/pipeline_parallel/__init__.py b/tests/unit_tests/pipeline_parallel/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/pipeline_parallel/test_schedules.py b/tests/unit_tests/pipeline_parallel/test_schedules.py new file mode 100644 index 0000000..5dd6605 --- /dev/null +++ b/tests/unit_tests/pipeline_parallel/test_schedules.py @@ -0,0 +1,212 @@ +import torch +from tests.unit_tests.test_utilities import Utils +from megatron.core import ModelParallelConfig +import megatron.core.pipeline_parallel.schedules as schedule +from pytest_mock import mocker +import pytest + +rank = Utils.rank + +def test_get_forward_backward_func(): + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) + assert(schedule.get_forward_backward_func() == schedule.forward_backward_no_pipelining) + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) + assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_without_interleaving) + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4, virtual_pipeline_model_parallel_size=2) + assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_with_interleaving) + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=4) + assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_with_interleaving) + Utils.destroy_model_parallel() + +def test_deallocate_output_tensor(): + out = torch.tensor([[1, 2, 3], [4, 5, 6]]) + schedule.deallocate_output_tensor(out) + assert(out.nelement() == 6) + +def test_forward_backward_func_without_pipeline_parallel(mocker): + from megatron.core.pipeline_parallel import get_forward_backward_func + + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) + + def forward_step_func(data_iterator, model): + import os + rank = int(os.environ['LOCAL_RANK']) + dummy_data = torch.ones(1,4) + def loss_func(output_tensor): + return rank, {'loss_reduced':rank} + return model(dummy_data), loss_func + + model = torch.nn.Linear(4,1) + model.model_type = 'unit-test' + def set_input_tensor(input_tensor): + return None + model.set_input_tensor = set_input_tensor + + forward_backward_func = get_forward_backward_func() + assert(schedule.get_forward_backward_func() == schedule.forward_backward_no_pipelining) + + mocker.patch("megatron.core.pipeline_parallel.schedules.custom_backward", return_value=2) + config = ModelParallelConfig( + pipeline_model_parallel_size = 1 + ) + model.config = config + + losses_reduced = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=range(0,100), + model=[model], + num_microbatches=4, + seq_length=None, + micro_batch_size=None, + forward_only=True) + + + loss_reduced_expected = [{'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}] + + for i,j in zip(losses_reduced, loss_reduced_expected): + print(losses_reduced) + assert(i['loss_reduced'] == j['loss_reduced']) + Utils.destroy_model_parallel() + + +def test_forward_backward_func_with_pipeline_parallel(mocker): + from megatron.core.pipeline_parallel import get_forward_backward_func + + Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=4) + + def forward_step_func(data_iterator, model): + import os + rank = int(os.environ['LOCAL_RANK']) + def loss_func(output_tensor): + return rank, {'loss_reduced':rank} + return torch.rand(512,8,256).cuda(), loss_func + + model = torch.nn.Linear(4,1) + model.model_type = 'unit-test' + def set_input_tensor(input_tensor): + return None + model.set_input_tensor = set_input_tensor + + forward_backward_func = get_forward_backward_func() + assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_without_interleaving) + + sequence_length = 512 + micro_batch_size = 8 + hidden_size = 256 + + config = ModelParallelConfig( + pipeline_model_parallel_size = 4, + sequence_parallel = False, + pipeline_dtype=torch.float, + ) + config.hidden_size = hidden_size + model.config = config + + losses_reduced = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=None, + model=[model], + num_microbatches= micro_batch_size, + seq_length=sequence_length, + micro_batch_size=micro_batch_size, + forward_only=True) + + loss_reduced_expected = [{'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}] + for i,j in zip(losses_reduced, loss_reduced_expected): + print(losses_reduced) + assert(i['loss_reduced'] == j['loss_reduced']) + Utils.destroy_model_parallel() + + +def test_forward_backward_func_with_interleaving(mocker): + from megatron.core.pipeline_parallel import get_forward_backward_func + from megatron.core.enums import ModelType + + Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=4, virtual_pipeline_model_parallel_size=2) + + def forward_step_func(data_iterator, model): + import os + rank = int(os.environ['LOCAL_RANK']) + def loss_func(output_tensor): + return rank, {'loss_reduced':rank} + return torch.rand(512,8,256).cuda(), loss_func + + model = torch.nn.Linear(4,1) + def set_input_tensor(input_tensor): + return None + model.set_input_tensor = set_input_tensor + + forward_backward_func = get_forward_backward_func() + assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_with_interleaving) + + sequence_length = 512 + micro_batch_size = 8 + hidden_size = 256 + + config = ModelParallelConfig( + pipeline_model_parallel_size = 4, + sequence_parallel = False, + pipeline_dtype=torch.float, + ) + config.hidden_size = hidden_size + model.config = config + + mocker.patch("megatron.core.pipeline_parallel.schedules.custom_backward", return_value=2) + + with pytest.raises(RuntimeError): + model.model_type = ModelType.encoder_and_decoder + forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=[range(0,100)], + model=[model, model], + num_microbatches= micro_batch_size, + seq_length=sequence_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=sequence_length, + forward_only=True) + + with pytest.raises(RuntimeError): + model.model_type = ModelType.encoder_or_decoder + forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=[range(0,100)], + model=[model, model], + num_microbatches= micro_batch_size, + seq_length=sequence_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=256, + forward_only=True) + + with pytest.raises(RuntimeError): + model.model_type = ModelType.encoder_or_decoder + forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=[range(0,100)], + model=[model, model], + num_microbatches= 7, + seq_length=sequence_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=512, + forward_only=True) + + + model.model_type = ModelType.encoder_or_decoder + losses_reduced = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=[range(0,100), range(0,100)], + model=[model, model], + num_microbatches= micro_batch_size, + seq_length=sequence_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=sequence_length, + forward_only=True) + + loss_reduced_expected = [{'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}] + for i,j in zip(losses_reduced, loss_reduced_expected): + print(losses_reduced) + assert(i['loss_reduced'] == j['loss_reduced']) + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/tensor_parallel/__init__.py b/tests/unit_tests/tensor_parallel/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/tensor_parallel/test_cross_entropy.py b/tests/unit_tests/tensor_parallel/test_cross_entropy.py new file mode 100644 index 0000000..a29365e --- /dev/null +++ b/tests/unit_tests/tensor_parallel/test_cross_entropy.py @@ -0,0 +1,14 @@ +from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy +import torch +from tests.unit_tests.test_utilities import Utils +import numpy as np + +def test_vocab_parallel_cross_entropy(): + Utils.initialize_model_parallel(4,2) + vocab_parallel_logits = torch.range(0,7).repeat(16,4).cuda() + target = torch.arange(0,32,2).cuda() + output = vocab_parallel_cross_entropy(vocab_parallel_logits, target) + expected_output = torch.tensor([10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309, + 10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309]).cuda() + assert(torch.equal(torch.round(expected_output), torch.round(output))) + Utils.destroy_model_parallel() \ No newline at end of file diff --git a/tests/unit_tests/tensor_parallel/test_data.py b/tests/unit_tests/tensor_parallel/test_data.py new file mode 100644 index 0000000..38a39ce --- /dev/null +++ b/tests/unit_tests/tensor_parallel/test_data.py @@ -0,0 +1,21 @@ +from megatron.core.tensor_parallel.data import broadcast_data +import torch +from tests.unit_tests.test_utilities import Utils + +def test_broadcast_data(): + Utils.initialize_model_parallel(2,4) + input_data = { + 0 : torch.ones((8,8)).cuda() * 0.0, + 1 : torch.ones((8,8)).cuda() * 1.0, + 2 : torch.ones((8,8)).cuda() * 2.0, + 3 : torch.ones((8,8)).cuda() * 3.0, + 4 : torch.ones((8,8)).cuda() * 4.0, + 5 : torch.ones((8,8)).cuda() * 5.0, + 6 : torch.ones((8,8)).cuda() * 6.0, + 7 : torch.ones((8,8)).cuda() * 7.0 + } + dtype = torch.float32 + actual_output = broadcast_data([0,1],input_data, dtype) + assert(torch.equal(actual_output[0], input_data[0])) + assert(torch.equal(actual_output[1], input_data[1])) + Utils.destroy_model_parallel() \ No newline at end of file diff --git a/tests/unit_tests/tensor_parallel/test_initialization.py b/tests/unit_tests/tensor_parallel/test_initialization.py new file mode 100644 index 0000000..c0b11be --- /dev/null +++ b/tests/unit_tests/tensor_parallel/test_initialization.py @@ -0,0 +1,97 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.tensor_parallel.layers import VocabParallelEmbedding, RowParallelLinear, ColumnParallelLinear +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec + +class Test: + + transformer_config = TransformerConfig(num_layers=1, hidden_size=12, + num_attention_heads=4, use_cpu_initialization=True) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(100) + def test_embedding_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + + tp1 = VocabParallelEmbedding(num_embeddings=16, embedding_dim=4, + init_method=self.transformer_config.init_method, + config=self.transformer_config).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. + tp4 = VocabParallelEmbedding(num_embeddings=16, embedding_dim=4, + init_method=self.transformer_config.init_method, + config=self.transformer_config).weight + + if torch.distributed.get_rank() == 0: + assert tp4.shape[0] * 4 == tp1.shape[0] + assert torch.allclose(tp1[:4], tp4) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(100) + def test_row_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = RowParallelLinear(input_size=16, output_size=16, + init_method=self.transformer_config.init_method, + bias=True, input_is_parallel=False, + config=self.transformer_config, + skip_bias_add=False).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. + tp4 = RowParallelLinear(input_size=16, output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + input_is_parallel=False, + config=self.transformer_config, + skip_bias_add=False).weight + + if torch.distributed.get_rank() == 0: + assert tp4.shape[1] * 4 == tp1.shape[1] + assert torch.allclose(tp1[:, :4], tp4) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(100) + def test_col_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = ColumnParallelLinear(input_size=16, output_size=16, + init_method=self.transformer_config.init_method, + bias=True, config=self.transformer_config, + skip_bias_add=False).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. + tp4 = ColumnParallelLinear(input_size=16, output_size=16, + init_method=self.transformer_config.init_method, + bias=True, config=self.transformer_config, + skip_bias_add=False).weight + + if torch.distributed.get_rank() == 0: + assert tp4.shape[0] * 4 == tp1.shape[0] + assert torch.allclose(tp1[:4], tp4) + \ No newline at end of file diff --git a/tests/unit_tests/tensor_parallel/test_layers.py b/tests/unit_tests/tensor_parallel/test_layers.py new file mode 100644 index 0000000..709fc59 --- /dev/null +++ b/tests/unit_tests/tensor_parallel/test_layers.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import pytest +import torch + +from megatron.core.tensor_parallel.layers import linear_with_frozen_weight +from megatron.core.tensor_parallel.mappings import gather_from_tensor_model_parallel_region +from tests.unit_tests.test_utilities import Utils + + +@pytest.mark.parametrize("tensor_parallel,allreduce_dgrad", [(1, False), (8, True)]) +def test_LinearWithFrozenWeight(tensor_parallel, allreduce_dgrad): + Utils.initialize_model_parallel(tensor_parallel, 1) + + size_per_partition = int(8 / tensor_parallel) + + # Input is an 8x8 identity matrix. + input_data = torch.eye(8).cuda() + input_data.requires_grad = True + + # Weight is an 8x8 matrix of all ones. If tensor parallelism > 1, the weight is partitioned evenly across GPUs. + weight = torch.ones((size_per_partition, 8)).cuda() + + # Bias is a vector of length 8 of all zeros. If tensor parallelism > 1, the bias is partitioned evenly across GPUs + bias = torch.zeros((size_per_partition)).cuda() + + gradient_accumulation_fusion = False + async_grad_allreduce = allreduce_dgrad + sequence_parallel = False + grad_output_buffer = None + wgrad_deferral_limit = None + + output_parallel = linear_with_frozen_weight( + input_data, + weight, + bias, + gradient_accumulation_fusion, + async_grad_allreduce, + sequence_parallel, + grad_output_buffer, + wgrad_deferral_limit, + allreduce_dgrad, + ) + output = gather_from_tensor_model_parallel_region( + output_parallel + ) # no-op if tensor_parallel == 1. + output.sum().backward() + + expected_output = torch.ones(8).cuda() + expected_grad = 8 * torch.ones(8).cuda() + + assert torch.allclose(output, expected_output) + assert torch.allclose(input_data.grad, expected_grad) + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/tensor_parallel/test_mappings.py b/tests/unit_tests/tensor_parallel/test_mappings.py new file mode 100644 index 0000000..6be486e --- /dev/null +++ b/tests/unit_tests/tensor_parallel/test_mappings.py @@ -0,0 +1,135 @@ +from megatron.core.tensor_parallel import mappings +from tests.unit_tests.test_utilities import Utils +import torch + +def test_CopyToModelParallelRegion(): + Utils.initialize_model_parallel(4,2) + input_data = torch.ones((1)).cuda()*Utils.rank + output_data = mappings._CopyToModelParallelRegion.backward(None, input_data) + result = torch.ones(1).cuda() + result = result * 22 if Utils.rank >= 4 else result * 6 + assert(torch.equal(output_data, result)) + assert(torch.equal(input_data, mappings.copy_to_tensor_model_parallel_region(input_data))) + assert(torch.equal(input_data, mappings._CopyToModelParallelRegion.symbolic(None, input_data))) + Utils.destroy_model_parallel() + +def test_ReduceFromModelParallelRegion(): + Utils.initialize_model_parallel(4,2) + input_data = torch.ones((1)).cuda()*Utils.rank + output_data = mappings._ReduceFromModelParallelRegion.symbolic(None, input_data) + result = torch.ones(1).cuda() + result = result * 22 if Utils.rank >= 4 else result * 6 + assert(torch.equal(output_data, result)) + input_data = torch.ones((1)).cuda()*Utils.rank + assert(torch.equal(mappings.reduce_from_tensor_model_parallel_region(input_data), result)) + assert(torch.equal(input_data, mappings._ReduceFromModelParallelRegion.backward(None, input_data))) + Utils.destroy_model_parallel() + +def test_ScatterToModelParallelRegion(): + Utils.initialize_model_parallel(4,2) + input_data = torch.rand((8,4)).cuda() + output_data = mappings.scatter_to_tensor_model_parallel_region(input_data) + req_dim = int(Utils.rank%(Utils.world_size/2)) + assert(torch.equal(output_data, input_data[:,req_dim].reshape((8,1)))) + output_data = mappings._ScatterToModelParallelRegion.symbolic(None, input_data) + assert(torch.equal(output_data, input_data[:, req_dim].reshape((8,1)))) + + input_data = torch.ones(8).cuda() * Utils.rank + actual_output_data = mappings._ScatterToModelParallelRegion.backward(None, input_data) + expected_output = torch.cat(( + torch.ones(8)*0, + torch.ones(8)*1, + torch.ones(8)*2, + torch.ones(8)*3)).cuda() + if (Utils.rank >= 4): + expected_output = expected_output + 4 + assert(torch.equal(actual_output_data, expected_output)) + Utils.destroy_model_parallel() + +def test_GatherFromModelParallelRegion(): + Utils.initialize_model_parallel(4,2) + input_data = torch.rand((8,4)).cuda() + req_dim = int(Utils.rank%(Utils.world_size/2)) + output_data = mappings._GatherFromModelParallelRegion.backward(None, input_data) + assert(torch.equal(output_data, input_data[:, req_dim].reshape((8,1)))) + input_data = torch.ones(8).cuda() * Utils.rank + actual_output_data = mappings.gather_from_tensor_model_parallel_region(input_data) + expected_output = torch.cat(( + torch.ones(8)*0, + torch.ones(8)*1, + torch.ones(8)*2, + torch.ones(8)*3)).cuda() + if (Utils.rank >= 4): + expected_output = expected_output + 4 + assert(torch.equal(actual_output_data, expected_output)) + assert(torch.equal(mappings._GatherFromModelParallelRegion.symbolic(None, input_data), expected_output)) + Utils.destroy_model_parallel() + +def test_ScatterToSequenceParallelRegion(): + Utils.initialize_model_parallel(4,2) + input_data = torch.rand((8,4)).cuda() + req_dim = int(Utils.rank%(Utils.world_size/2))*2 + output_data = mappings._ScatterToSequenceParallelRegion.symbolic(None, input_data) + assert(torch.equal(output_data, input_data[req_dim:req_dim+2, :])) + output_data = mappings.scatter_to_sequence_parallel_region(input_data) + assert(torch.equal(output_data, input_data[req_dim:req_dim+2, :])) + input_data = torch.ones(4).cuda() * Utils.rank + output_data = mappings._ScatterToModelParallelRegion.backward(None, input_data) + expected_output = torch.concat(( + torch.ones(4)*0, + torch.ones(4)*1, + torch.ones(4)*2, + torch.ones(4)*3)).cuda() + if (Utils.rank >= 4): + expected_output = expected_output + 4 + assert(torch.equal(output_data, expected_output)) + Utils.destroy_model_parallel() + +def test_GatherFromSequenceParallelRegion(): + Utils.initialize_model_parallel(4,2) + input_data = torch.ones(4).cuda() * Utils.rank + output_data = mappings.gather_from_sequence_parallel_region(input_data) + expected_output = torch.concat(( + torch.ones(4)*0, + torch.ones(4)*1, + torch.ones(4)*2, + torch.ones(4)*3)).cuda() + if (Utils.rank >= 4): + expected_output = expected_output + 4 + assert(torch.equal(output_data, expected_output)) + assert(torch.equal(mappings._GatherFromSequenceParallelRegion.symbolic(None, input_data), expected_output)) + input_data = torch.vstack(( + torch.ones(4)*0, + torch.ones(4)*1, + torch.ones(4)*2, + torch.ones(4)*3)).cuda() + class Ctx: + tensor_parallel_output_grad = True + output_data = mappings._GatherFromSequenceParallelRegion.backward(Ctx(), input_data) + expected_output = torch.ones((1,4)).cuda() * 4 * int(Utils.rank % 4) + assert(torch.equal(output_data[0], expected_output)) + Utils.destroy_model_parallel() + +def test_ReduceScatterToSequenceParallelRegion(): + Utils.initialize_model_parallel(4,2) + input_data = torch.vstack(( + torch.ones(4)*0, + torch.ones(4)*1, + torch.ones(4)*2, + torch.ones(4)*3)).cuda() + output_data = mappings.reduce_scatter_to_sequence_parallel_region(input_data) + expected_output = torch.ones(4).cuda() * 4 * int(Utils.rank % 4) + assert(torch.equal(output_data[0], expected_output)) + assert(torch.equal(mappings._ReduceScatterToSequenceParallelRegion.symbolic(None, input_data) , expected_output.reshape((1,4)))) + input_data = torch.ones(4).cuda() * Utils.rank + output_data = mappings._ReduceScatterToSequenceParallelRegion.backward(None,input_data) + expected_output = torch.concat(( + torch.ones(4)*0, + torch.ones(4)*1, + torch.ones(4)*2, + torch.ones(4)*3)).cuda() + if (Utils.rank >= 4): + expected_output = expected_output + 4 + assert(torch.equal(output_data, expected_output)) + Utils.destroy_model_parallel() + diff --git a/tests/unit_tests/tensor_parallel/test_random.py b/tests/unit_tests/tensor_parallel/test_random.py new file mode 100644 index 0000000..e2f35cf --- /dev/null +++ b/tests/unit_tests/tensor_parallel/test_random.py @@ -0,0 +1,44 @@ +from megatron.core.tensor_parallel.random import CudaRNGStatesTracker +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed,get_cuda_rng_tracker +from megatron.core.tensor_parallel.random import checkpoint +from tests.unit_tests.test_utilities import Utils +import pytest +import torch + +def test_cuda_rng_states_tracker(): + rng_tracker = CudaRNGStatesTracker() + rng_tracker.set_states({"state1":1234}) + assert(rng_tracker.get_states()["state1"] == 1234) + rng_tracker.reset() + assert(rng_tracker.get_states() == {}) + seed = 1111 + rng_tracker.add("state2",seed) + with pytest.raises(Exception): + assert(rng_tracker.add("state3",seed)) + with pytest.raises(Exception): + assert(rng_tracker.add("state2",111)) + assert(rng_tracker.get_states()['state2'] is not None) + with pytest.raises(Exception): + assert() + + rng_tracker.fork("state2") + torch.cuda.manual_seed(seed) + rng_state = torch.cuda.get_rng_state() + assert torch.equal(rng_tracker.get_states()['state2'], rng_state) + +def test_model_parallel_cuda_manual_seed(): + Utils.initialize_model_parallel(4,2) + model_parallel_cuda_manual_seed(0) + rng_tracker = get_cuda_rng_tracker() + assert(rng_tracker.get_states()['model-parallel-rng'] is not None) + Utils.destroy_model_parallel() + +def test_checkpoint(): + def test_forward(*input): + return input[0]+input[1] + assert(torch.equal(torch.ones(16)*3,checkpoint(test_forward, None, torch.ones(16), torch.ones(16)*2))) + Utils.initialize_model_parallel() + input1 = torch.ones((4,4)) + checkpoint(test_forward, True, input1, torch.ones((4,4))*2) + assert(torch.equal(torch.ones(input1.numel()).cuda(), input1)) + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py b/tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py new file mode 100644 index 0000000..f82e5fa --- /dev/null +++ b/tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py @@ -0,0 +1,43 @@ +import torch +import megatron.core.tensor_parallel.utils as util +import megatron.core.parallel_state as ps +from tests.unit_tests.test_utilities import Utils + +rank = Utils.rank + +def test_split_tensor_along_last_dim(): + input_tensor = torch.rand((3,4)) + torch.equal(input_tensor[0:2,0:2], util.split_tensor_along_last_dim(input_tensor,2)[0]) + torch.equal(input_tensor[2:,2:], util.split_tensor_along_last_dim(input_tensor,2)[1]) + +def test_split_tensor_into_1d_equal_chunks(): + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) + input_tensor = torch.rand((3,4)) + output_tensor = util.split_tensor_into_1d_equal_chunks(input_tensor) + if rank % 2 == 0 : + start = 0 + end = int(input_tensor.numel()/2) + else : + start = int(input_tensor.numel()/2) + end = input_tensor.numel() + + assert torch.equal(output_tensor, input_tensor.flatten()[start:end]) + Utils.destroy_model_parallel() + +def test_gather_split_1d_tensor(): + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) + input_tensor = torch.ones((2,4)).cuda() * rank + actual_output_tensor = util.gather_split_1d_tensor(input_tensor) + if rank %2 == 0: + expected_output_tensor = torch.concat((input_tensor.flatten(), input_tensor.flatten() + 1)) + else : + expected_output_tensor = torch.concat((input_tensor.flatten() - 1, input_tensor.flatten())) + assert(torch.equal(actual_output_tensor, expected_output_tensor)) + Utils.destroy_model_parallel() + +def test_vocab(): + global_vocab_size = 1600 + per_partition_vocab_size = 1600 / Utils.world_size + assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_per_partition_vocab_size(global_vocab_size // Utils.world_size, rank, Utils.world_size))) + assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_global_vocab_size(global_vocab_size, rank, Utils.world_size))) + \ No newline at end of file diff --git a/tests/unit_tests/test_basic.py b/tests/unit_tests/test_basic.py new file mode 100644 index 0000000..915d2c1 --- /dev/null +++ b/tests/unit_tests/test_basic.py @@ -0,0 +1,3 @@ +def test_import(): + import megatron + diff --git a/tests/unit_tests/test_imports.py b/tests/unit_tests/test_imports.py new file mode 100644 index 0000000..49e7c77 --- /dev/null +++ b/tests/unit_tests/test_imports.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import importlib +import inspect +import os +import traceback + +import torch +import wrapt + +from megatron.core.transformer.module import MegatronModule + + +def import_class_by_path(path: str): + paths = path.split('.') + path = ".".join(paths[:-1]) + class_name = paths[-1] + mod = __import__(path, fromlist=[class_name]) + mod = getattr(mod, class_name) + return mod + + +def _build_import_path(subdomains: list, imp): + import_path = ["megatron", "core"] + import_path.extend(subdomains) + import_path.append(imp) + path = ".".join(import_path) + return path + + +def _get_class_from_path(subdomains, imp): + path = _build_import_path(subdomains, imp) + print(path) + class_ = None + result = None + try: + class_ = import_class_by_path(path) + if inspect.isclass(class_): + if isinstance(class_, wrapt.FunctionWrapper): + class_ = class_.__wrapped__ + if issubclass(class_, (MegatronModule, torch.nn.Module)): + result = class_ + else: + class_ = None + error = None + except Exception: + error = traceback.format_exc() + return class_, result, error + + +def _test_domain_module_imports(module, subdomains: list): + module_list = [] + failed_list = [] + error_list = [] + + error = None + if len(subdomains) > 0: + basepath = module.__path__[0] + megatron_index = basepath.rfind("megatron") + basepath = basepath[megatron_index:].replace(os.path.sep, ".") + new_path = '.'.join([basepath, *subdomains]) + + try: + module = importlib.import_module(new_path) + except Exception: + print(f"Could not import `{new_path}` ; Traceback below :") + error = traceback.format_exc() + error_list.append(error) + + if error is None: + for imp in dir(module): + class_, result, error = _get_class_from_path( + subdomains, imp) + + if result is not None: + module_list.append(class_) + + elif class_ is not None: + failed_list.append(class_) + + if error is not None: + error_list.append(error) + + for module in module_list: + print("Module successfully imported :", module) + + print() + for module in failed_list: + print( + "Module did not match a valid signature of Megatron core Model (hence ignored):", module) + + print() + if len(error_list) > 0: + print("Imports crashed with following traceback !") + + for error in error_list: + print("*" * 100) + print() + print(error) + print() + print("*" * 100) + print() + + if len(error_list) > 0: + return False + else: + return True + + +############################### + + +def test_domain_mcore(): + import megatron.core as mcore + + all_passed = _test_domain_module_imports( + mcore, subdomains=['models']) + + all_passed = _test_domain_module_imports( + mcore, subdomains=['pipeline_parallel']) + + all_passed = _test_domain_module_imports( + mcore, subdomains=['tensor_parallel']) + + all_passed = _test_domain_module_imports( + mcore, subdomains=['transformer']) + + all_passed = _test_domain_module_imports( + mcore, subdomains=['fusions']) + + all_passed = _test_domain_module_imports( + mcore, subdomains=['distributed']) + + all_passed = _test_domain_module_imports( + mcore, subdomains=['datasets']) + + all_passed = _test_domain_module_imports( + mcore, subdomains=['dist_checkpointing']) + + if not all_passed: + exit(1) + + +if __name__ == '__main__': + test_domain_mcore() diff --git a/tests/unit_tests/test_local_multi_tensor_fns.py b/tests/unit_tests/test_local_multi_tensor_fns.py new file mode 100644 index 0000000..f47d549 --- /dev/null +++ b/tests/unit_tests/test_local_multi_tensor_fns.py @@ -0,0 +1,36 @@ +import copy +from megatron.core.utils import ( + local_multi_tensor_applier, + local_multi_tensor_l2_norm, + local_multi_tensor_scale +) +import pytest +import torch + +def test_local_multi_tensor_l2_norm_and_scale(): + amp_C = pytest.importorskip("amp_C") + multi_tensor_apply = pytest.importorskip("apex.multi_tensor_apply") + + torch.manual_seed(42) + + tensor_list = [torch.rand(5,5).cuda() for _ in range(10)] + tensor_list_copy = copy.deepcopy(tensor_list) + + norm_apex, _ = multi_tensor_apply.multi_tensor_applier(amp_C.multi_tensor_l2norm, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list], False) + norm_local, _ = multi_tensor_apply.multi_tensor_applier(local_multi_tensor_l2_norm, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list_copy], False) + torch.testing.assert_close(norm_apex, norm_local) + + clip_coeff = 0.05 + multi_tensor_apply.multi_tensor_applier(amp_C.multi_tensor_scale, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list, tensor_list], clip_coeff) + multi_tensor_apply.multi_tensor_applier(local_multi_tensor_scale, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list_copy, tensor_list_copy], clip_coeff) + torch.testing.assert_close(tensor_list, tensor_list_copy) + +def test_local_multi_tensor_apply(): + amp_C = pytest.importorskip("amp_C") + multi_tensor_apply = pytest.importorskip("apex.multi_tensor_apply") + + tensor_list = [torch.rand(5,5).cuda() for _ in range(10)] + + norm_apex, _ = multi_tensor_apply.multi_tensor_applier(amp_C.multi_tensor_l2norm, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list], False) + norm_local, _ = local_multi_tensor_applier(amp_C.multi_tensor_l2norm, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list], False) + torch.testing.assert_close(norm_apex, norm_local) diff --git a/tests/unit_tests/test_num_microbatches_calculator.py b/tests/unit_tests/test_num_microbatches_calculator.py new file mode 100644 index 0000000..8a0673f --- /dev/null +++ b/tests/unit_tests/test_num_microbatches_calculator.py @@ -0,0 +1,128 @@ +from typing import List, Optional + +import pytest + +import megatron.core.num_microbatches_calculator as mb_calculator + + +def reconfigure_num_microbatches_calculator( + rank: int, + rampup_batch_size: Optional[List[int]], + global_batch_size: int, + micro_batch_size: int, + data_parallel_size: int, +): + """Reconfigure number of micro-batches calculator. + + Args: + rank (int): Rank of the GPU, only rank 0 will log the information. + rampup_batch_size (Optional[List[int]]): Rampup batch size, should be in format of [start_global_batch_size, batch_size_increment, ramup_samples]. + global_batch_size (int): Global batch size for the model. + micro_batch_size (int): Micro batch size at initialization. + data_parallel_size (int): Data parallel size. + """ + + mb_calculator._GLOBAL_NUM_MICROBATCHES_CALCULATOR = mb_calculator.build_num_microbatches_calculator( + rank, rampup_batch_size, global_batch_size, micro_batch_size, data_parallel_size + ) + + +def test_init_num_microbatches_calculator(): + mb_calculator._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None + mb_calculator.init_num_microbatches_calculator(0, None, 32, 8, 2) + assert mb_calculator.get_num_microbatches() == 2 + assert mb_calculator.get_current_global_batch_size() == 32 + + with pytest.raises(AssertionError): + mb_calculator.init_num_microbatches_calculator(0, None, 32, 8, 2) + + +def test_get_num_microbatches(): + reconfigure_num_microbatches_calculator(0, None, 16, 8, 2) + assert mb_calculator.get_num_microbatches() == 1 + + +def test_get_current_global_batch_size(): + reconfigure_num_microbatches_calculator(0, None, 16, 8, 2) + assert mb_calculator.get_current_global_batch_size() == 16 + + +def test_update_num_microbatches(): + reconfigure_num_microbatches_calculator(0, [16, 8, 96], 32, 4, 2) + assert mb_calculator.get_num_microbatches() == 2 + mb_calculator.update_num_microbatches(48, False) + assert mb_calculator.get_num_microbatches() == 3 + + reconfigure_num_microbatches_calculator(0, [16, 8, 96], 32, 8, 2) + with pytest.raises(AssertionError): + mb_calculator.update_num_microbatches(49, True) + + reconfigure_num_microbatches_calculator(0, None, 32, 8, 2) + mb_calculator.update_num_microbatches(16) + assert mb_calculator.get_num_microbatches() == 2 + + +def test_build_num_microbatches_calculator(): + temp_calculator = mb_calculator.build_num_microbatches_calculator(0, None, 32, 8, 2) + assert temp_calculator.get() == 2 + assert temp_calculator.get_current_global_batch_size() == 32 + assert type(temp_calculator) is mb_calculator.ConstantNumMicroBatchesCalculator + + temp_calculator = mb_calculator.build_num_microbatches_calculator(0, [16, 16, 48], 32, 8, 2) + assert temp_calculator.get() == 1 + assert temp_calculator.get_current_global_batch_size() == 16 + assert type(temp_calculator) is mb_calculator.RampupBatchsizeNumMicroBatchesCalculator + + +class TestConstantNumMicroBatchesCalculator: + def setup_method(self, method): + self.mb_calculator = mb_calculator.ConstantNumMicroBatchesCalculator(32, 8, 2) + + def test_constructor(self): + assert type(self.mb_calculator) is mb_calculator.ConstantNumMicroBatchesCalculator + assert self.mb_calculator.num_micro_batches == 2 + assert self.mb_calculator.current_global_batch_size == 32 + assert self.mb_calculator.micro_batch_size == 8 + + def test_get(self): + assert self.mb_calculator.get() == 2 + + def test_get_current_global_batch_size(self): + assert self.mb_calculator.get_current_global_batch_size() == 32 + + +class TestRampupBatchsizeNumMicroBatchesCalculator: + def setup_method(self, method): + self.mb_calculator = mb_calculator.RampupBatchsizeNumMicroBatchesCalculator( + 32, 8, 2, 16, 16, 48 + ) + + def test_constructor(self): + assert type(self.mb_calculator) is mb_calculator.RampupBatchsizeNumMicroBatchesCalculator + assert self.mb_calculator.global_batch_size == 32 + assert self.mb_calculator.micro_batch_size == 8 + assert self.mb_calculator.data_parallel_size == 2 + assert self.mb_calculator.start_global_batch_size == 16 + assert self.mb_calculator.batch_size_increment == 16 + assert self.mb_calculator.ramup_samples == 48 + assert self.mb_calculator.micro_batch_times_data_parallel_size == 16 + assert self.mb_calculator.num_micro_batches == 1 + + def test_get(self): + assert self.mb_calculator.get() == 1 + + def test_get_current_global_batch_size(self): + assert self.mb_calculator.get_current_global_batch_size() == 16 + + +def test_ramp_up(): + reconfigure_num_microbatches_calculator(0, [16, 16, 96], 32, 8, 2) + consumed_samples = 0 + count = 0 + expected_consumed_samples = [0, 16, 32, 48, 64, 80, 96, 128, 160, 192, 224, 256] + + while consumed_samples < 256: + consumed_samples += mb_calculator.get_current_global_batch_size() + count += 1 + assert consumed_samples == expected_consumed_samples[count] + mb_calculator.update_num_microbatches(consumed_samples, True) diff --git a/tests/unit_tests/test_optimizer.py b/tests/unit_tests/test_optimizer.py new file mode 100644 index 0000000..247da4a --- /dev/null +++ b/tests/unit_tests/test_optimizer.py @@ -0,0 +1,66 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.optim import SGD, Adam + +from megatron.core.optimizer import ChainedOptimizer + + +class Net(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = torch.flatten(x, 1) # flatten all dimensions except batch + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +def test_chained_optimizer(): + net = Net() + optimizer_1 = Adam(list(net.parameters())[:2], lr=0.01,) + optimizer_2 = SGD(list(net.parameters())[2:], lr=0.1, momentum=0.9,) + chained_optimizer = ChainedOptimizer([optimizer_1, optimizer_2]) + + # Test the chained optimizer's param groups is a reference of the underlying optimizers' param groups + assert optimizer_1.param_groups[0]["lr"] == 0.01 + chained_optimizer.param_groups[0]["lr"] = 0.02 + assert optimizer_1.param_groups[0]["lr"] == 0.02 + + # Test the chained optimizer's state is a reference of the underlying optimizers' state + # 1. run step on optimizers, make sure there is state + assert len(chained_optimizer.state) == 0 + input = torch.randn(1, 3, 32, 32) + output = net(input) + output.sum().backward() + optimizer_1.step() + optimizer_2.step() + assert len(chained_optimizer.state) != 0 + + # 2. check the state is a reference + assert not list(optimizer_1.state.values())[0]["exp_avg"].is_cuda + assert not list(optimizer_2.state.values())[0]["momentum_buffer"].is_cuda + + def to_cuda(d): + for k, v in d.items(): + if isinstance(v, torch.Tensor): + d[k] = v.to("cuda") + elif isinstance(v, dict): + to_cuda(v) + return d + + for k, v in chained_optimizer.state.items(): + chained_optimizer.state[k] = to_cuda(v) + + assert list(optimizer_1.state.values())[0]["exp_avg"].is_cuda + assert list(optimizer_2.state.values())[0]["momentum_buffer"].is_cuda diff --git a/tests/unit_tests/test_parallel_state.py b/tests/unit_tests/test_parallel_state.py new file mode 100644 index 0000000..4d8050c --- /dev/null +++ b/tests/unit_tests/test_parallel_state.py @@ -0,0 +1,468 @@ +import torch +import megatron.core.parallel_state as ps +import pytest +from tests.unit_tests.test_utilities import Utils +import os + +rank = Utils.rank +world_size = Utils.world_size +test_parallel_order = ['tp-cp-ep-dp-pp', 'tp-cp-pp-ep-dp'] + +@pytest.mark.parametrize('order', test_parallel_order) +def test_initialize_and_destroy_model_parallel(order): + with pytest.raises(AssertionError): + assert(ps.initialize_model_parallel(order=order)) + Utils.initialize_distributed() + with pytest.raises(RuntimeError): + assert(ps.initialize_model_parallel(tensor_model_parallel_size=2*world_size, order=order)) + with pytest.raises(RuntimeError): + assert(ps.initialize_model_parallel(pipeline_model_parallel_size=2*world_size, order=order)) + with pytest.raises(RuntimeError): + assert(ps.initialize_model_parallel(pipeline_model_parallel_size=world_size, tensor_model_parallel_size=world_size, order=order)) + with pytest.raises(RuntimeError): + assert(ps.initialize_model_parallel(virtual_pipeline_model_parallel_size=2, order=order)) + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4, order=order) + + assert(ps.model_parallel_is_initialized()) + assert(ps.get_model_parallel_group() is not None) + assert(ps.get_tensor_model_parallel_group() is not None) + assert(ps.get_pipeline_model_parallel_group() is not None) + assert(ps.get_data_parallel_group() is not None) + Utils.destroy_model_parallel() + assert(ps._MODEL_PARALLEL_GROUP is None) + +@pytest.mark.parametrize('order', test_parallel_order) +def test_pipeline_parallel_initializations(order): + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4, order=order) + assert(ps.get_pipeline_model_parallel_first_rank() == rank % 2 ) + assert(ps.get_data_parallel_src_rank() == rank) + assert(ps.get_pipeline_model_parallel_next_rank() == ((rank + 2) % world_size)) + assert(ps.get_pipeline_model_parallel_prev_rank() == ((rank - 2) % world_size)) + Utils.destroy_model_parallel() + +@pytest.mark.parametrize('order', test_parallel_order) +def test_data_parallel_initializations(order): + Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size, order=order) + assert(ps.get_data_parallel_src_rank() == rank) + assert(ps.get_data_parallel_world_size() == 1) + assert(ps.get_data_parallel_rank() == 0) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize('order', test_parallel_order) +def test_tensor_model_parellel_world_size(order): + Utils.initialize_model_parallel(tensor_model_parallel_size=world_size, order=order) + assert(ps.get_tensor_model_parallel_world_size() == world_size) + ps.set_tensor_model_parallel_world_size(None) + assert(ps.get_tensor_model_parallel_world_size() == world_size) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize('order', test_parallel_order) +def test_pipeline_model_parallel_world_size(order): + Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size, order=order) + assert(ps.get_pipeline_model_parallel_world_size() == world_size) + ps.set_pipeline_model_parallel_world_size(None) + assert(ps.get_pipeline_model_parallel_world_size() == world_size) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize('order', test_parallel_order) +def test_tensor_model_parallel_rank(order): + Utils.initialize_model_parallel(tensor_model_parallel_size=world_size, order=order) + assert(ps.get_tensor_model_parallel_rank() == rank) + ps.set_tensor_model_parallel_rank(None) + assert(ps.get_tensor_model_parallel_rank() == rank) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize('order', test_parallel_order) +def test_pipeline_model_parallel_rank(order): + Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size, order=order) + assert(ps.get_pipeline_model_parallel_rank() == rank) + ps.set_pipeline_model_parallel_rank(None) + assert(ps.get_pipeline_model_parallel_rank() == rank) + Utils.destroy_model_parallel() + +def test_context_parallel_rank(): + Utils.initialize_model_parallel(context_parallel_size=world_size) + assert(ps.get_context_parallel_rank() == rank) + Utils.destroy_model_parallel() + +def test_expert_model_parallel_rank(): + Utils.initialize_model_parallel(expert_model_parallel_size=world_size) + assert(ps.get_expert_model_parallel_rank() == rank) + ps.set_expert_model_parallel_rank(None) + assert(ps.get_expert_model_parallel_rank() == rank) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize('order', test_parallel_order) +def test_is_pipeline_first_stage(order): + Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size, order=order) + assert(ps.is_pipeline_first_stage(ignore_virtual=True) == (rank == 0)) + assert(ps.is_pipeline_first_stage() == (rank == 0)) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize('order', test_parallel_order) +def test_is_pipeline_last_stage(order): + Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size, order=order) + assert(ps.is_pipeline_last_stage(ignore_virtual=True) == (rank == world_size-1)) + assert(ps.is_pipeline_last_stage() == (rank == world_size-1)) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize('order', test_parallel_order) +def test_virtual_pipeline_model_parallel_rank(order): + Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size, order=order) + ps.set_virtual_pipeline_model_parallel_rank(rank) + assert(ps.get_virtual_pipeline_model_parallel_rank() == rank) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize('order', test_parallel_order) +def test_get_tensor_model_parallel_src_rank(order): + Utils.initialize_model_parallel(tensor_model_parallel_size=world_size, order=order) + assert(ps.get_tensor_model_parallel_src_rank() == ((rank // world_size) * world_size)) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize( + 'src_tp_pp, ep_size', + [ + ((1, 8), 1), + ((2, 4), 1), + ((4, 2), 1), + ((8, 1), 1), + ((4, 1), 2), + ((1, 1), 8), + ((1, 1), 2), + ((2, 1), 4), + ], +) +def test_different_initialize_order_consistency(src_tp_pp, ep_size): + Utils.initialize_model_parallel( + *src_tp_pp, expert_model_parallel_size=ep_size, order='tp-ep-dp-pp' + ) + tp_rank = ps.get_tensor_model_parallel_rank() + dp_rank = ps.get_data_parallel_rank() + pp_rank = ps.get_pipeline_model_parallel_rank() + ep_rank = ps.get_expert_model_parallel_rank() + + tp_g = torch.distributed.get_process_group_ranks(ps.get_tensor_model_parallel_group()) + dp_g = torch.distributed.get_process_group_ranks(ps.get_data_parallel_group(False)) + pp_g = torch.distributed.get_process_group_ranks(ps.get_pipeline_model_parallel_group()) + dp_no_ep_g = torch.distributed.get_process_group_ranks( + ps.get_data_modulo_expert_parallel_group() + ) + cp_g = torch.distributed.get_process_group_ranks(ps.get_context_parallel_group()) + amax_g = torch.distributed.get_process_group_ranks(ps.get_amax_reduction_group(False)) + mp_g = torch.distributed.get_process_group_ranks(ps.get_model_parallel_group()) + tp_ep_g = torch.distributed.get_process_group_ranks(ps.get_tensor_and_expert_parallel_group()) + tp_dp_g = torch.distributed.get_process_group_ranks( + ps.get_tensor_and_data_parallel_group(False) + ) + + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel( + *src_tp_pp, expert_model_parallel_size=ep_size, order='tp-pp-ep-dp' + ) + assert tp_rank == ps.get_tensor_model_parallel_rank() + assert dp_rank == ps.get_data_parallel_rank() + assert pp_rank == ps.get_pipeline_model_parallel_rank() + assert ep_rank == ps.get_expert_model_parallel_rank() + + assert tp_g == torch.distributed.get_process_group_ranks(ps.get_tensor_model_parallel_group()) + assert dp_g == torch.distributed.get_process_group_ranks(ps.get_data_parallel_group(False)) + assert pp_g == torch.distributed.get_process_group_ranks(ps.get_pipeline_model_parallel_group()) + assert dp_no_ep_g == torch.distributed.get_process_group_ranks( + ps.get_data_modulo_expert_parallel_group() + ) + assert cp_g == torch.distributed.get_process_group_ranks(ps.get_context_parallel_group()) + assert amax_g == torch.distributed.get_process_group_ranks(ps.get_amax_reduction_group(False)) + assert mp_g == torch.distributed.get_process_group_ranks(ps.get_model_parallel_group()) + assert tp_ep_g == torch.distributed.get_process_group_ranks( + ps.get_tensor_and_expert_parallel_group() + ) + assert tp_dp_g == torch.distributed.get_process_group_ranks( + ps.get_tensor_and_data_parallel_group(False) + ) + + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize( + 'src_tp_pp, ep_size', + [((1, 2), 1), ((1, 4), 1), ((2, 2), 1), ((1, 2), 2), ((1, 4), 2), ((2, 2), 2),], +) +def test_different_initialize_order_unconsistency(src_tp_pp, ep_size): + Utils.initialize_model_parallel( + *src_tp_pp, expert_model_parallel_size=ep_size, order='tp-ep-dp-pp' + ) + + tp_g = torch.distributed.get_process_group_ranks(ps.get_tensor_model_parallel_group()) + dp_g = torch.distributed.get_process_group_ranks(ps.get_data_parallel_group(False)) + pp_g = torch.distributed.get_process_group_ranks(ps.get_pipeline_model_parallel_group()) + cp_g = torch.distributed.get_process_group_ranks(ps.get_context_parallel_group()) + amax_g = torch.distributed.get_process_group_ranks(ps.get_amax_reduction_group(False)) + mp_g = torch.distributed.get_process_group_ranks(ps.get_model_parallel_group()) + + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel( + *src_tp_pp, expert_model_parallel_size=ep_size, order='tp-pp-ep-dp' + ) + assert tp_g == torch.distributed.get_process_group_ranks(ps.get_tensor_model_parallel_group()) + assert dp_g != torch.distributed.get_process_group_ranks(ps.get_data_parallel_group(False)) + assert pp_g != torch.distributed.get_process_group_ranks(ps.get_pipeline_model_parallel_group()) + assert cp_g == torch.distributed.get_process_group_ranks(ps.get_context_parallel_group()) + assert amax_g == torch.distributed.get_process_group_ranks(ps.get_amax_reduction_group(False)) + assert mp_g != torch.distributed.get_process_group_ranks(ps.get_model_parallel_group()) + + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize( + 'nodes, num_gpu, tp, pp, cp, ep', + [ + (1, 1, 1, 1, 1, 1), + (1, 8, 8, 1, 1, 1), + (1, 8, 2, 2, 1, 1), + (1, 8, 2, 4, 1, 1), + (3, 8, 8, 3, 1, 1), + (4, 8, 2, 4, 1, 1), + (8, 8, 8, 8, 1, 1), + (8, 8, 2, 1, 1, 4), + (8, 8, 2, 2, 2, 4), + (8, 8, 2, 1, 4, 8), + (8, 8, 2, 2, 2, 8), + (16, 8, 4, 8, 1, 1), + (16, 8, 4, 8, 1, 4), + (16, 8, 4, 8, 4, 1), + (16, 8, 8, 8, 1, 1), + (16, 8, 4, 8, 1, 1), + (16, 8, 8, 8, 1, 1), + (32, 8, 4, 8, 1, 1), + (32, 8, 8, 8, 1, 1), + (32, 8, 4, 8, 1, 4), + (32, 8, 8, 8, 4, 1), + (64, 8, 4, 2, 8, 8), + (64, 8, 4, 8, 1, 1), + (64, 8, 8, 8, 1, 1), + (96, 8, 4, 8, 1, 1), + (128, 8, 4, 2, 8, 8), + (128, 8, 4, 8, 1, 1), + (256, 8, 4, 8, 1, 1), + (316, 8, 4, 8, 1, 1), + (384, 8, 4, 8, 1, 1), + (512, 8, 4, 8, 1, 1), + (768, 8, 4, 8, 1, 1), + (1024, 8, 4, 8, 1, 1), + (1280, 8, 4, 8, 1, 1), + (1344, 8, 4, 8, 1, 1), + ], +) +def test_rank_generator_for_tp_dp_pp(nodes, num_gpu, tp, pp, cp, ep): + def golden_rank_result_from_past_code( + world_size: int, + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + context_parallel_size: int = 1, + expert_model_parallel_size: int = 1, + ): + data_parallel_size: int = world_size // ( + tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size + ) + num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size + num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size + + dp_groups = [] + dp_groups_with_cp = [] + + all_data_parallel_group_ranks_with_cp = [] + for i in range(pipeline_model_parallel_size): + start_rank = i * num_pipeline_model_parallel_groups + end_rank = (i + 1) * num_pipeline_model_parallel_groups + for j in range(context_parallel_size * tensor_model_parallel_size): + ranks = range( + start_rank + j, end_rank, context_parallel_size * tensor_model_parallel_size + ) + dp_groups.append(list(ranks)) + for j in range(tensor_model_parallel_size): + ranks_with_cp = range(start_rank + j, end_rank, tensor_model_parallel_size) + all_data_parallel_group_ranks_with_cp.append(list(ranks_with_cp)) + dp_groups_with_cp.append(list(ranks_with_cp)) + + cp_group = [] + for i in range(pipeline_model_parallel_size): + for j in range(data_parallel_size): + start_rank = ( + i * num_pipeline_model_parallel_groups + + j * tensor_model_parallel_size * context_parallel_size + ) + end_rank = ( + i * num_pipeline_model_parallel_groups + + (j + 1) * tensor_model_parallel_size * context_parallel_size + ) + for k in range(tensor_model_parallel_size): + ranks = range(start_rank + k, end_rank, tensor_model_parallel_size) + cp_group.append(list(ranks)) + + mp_group = [] + for i in range(data_parallel_size * context_parallel_size): + ranks = [ + data_parallel_group_ranks_with_cp[i] + for data_parallel_group_ranks_with_cp in all_data_parallel_group_ranks_with_cp + ] + mp_group.append(list(ranks)) + + tp_group = [] + for i in range(num_tensor_model_parallel_groups): + ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) + tp_group.append(list(ranks)) + + pp_group = [] + for i in range(num_pipeline_model_parallel_groups): + ranks = range(i, world_size, num_pipeline_model_parallel_groups) + pp_group.append(list(ranks)) + + tp_dp_group = [] + tp_dp_cp_group = [] + tensor_and_data_group_size_with_cp: int = tensor_model_parallel_size * data_parallel_size * context_parallel_size + num_tensor_and_data_groups_with_cp: int = world_size // tensor_and_data_group_size_with_cp + for i in range(num_tensor_and_data_groups_with_cp): + start_rank = i * tensor_and_data_group_size_with_cp + end_rank = start_rank + tensor_and_data_group_size_with_cp + ranks = range(start_rank, end_rank) + tp_dp_cp_group.append(list(ranks)) + + for j in range(context_parallel_size): + ranks = [] + for k in range(data_parallel_size): + start_rank = ( + i * tensor_and_data_group_size_with_cp + + j * tensor_model_parallel_size + + k * tensor_model_parallel_size * context_parallel_size + ) + end_rank = start_rank + tensor_model_parallel_size + ranks = ranks + list(range(start_rank, end_rank)) + tp_dp_group.append(list(ranks)) + + tp_ep_group = [] + dp_no_ep_group = [] + dp_no_ep_group_with_cp = [] + + all_ranks = torch.arange(world_size).reshape(( + pipeline_model_parallel_size, + data_parallel_size // expert_model_parallel_size, + expert_model_parallel_size, + context_parallel_size, + tensor_model_parallel_size + )) + # 'pp edp ep cp tp -> (pp edp cp) (ep tp)' + tp_ep_rearrange = torch.transpose(all_ranks, 2, 3) + tp_ep_rearrange = torch.reshape(tp_ep_rearrange, (-1, expert_model_parallel_size * tensor_model_parallel_size)) + tp_ep_rearrange = tp_ep_rearrange.tolist() + tp_ep_rearrange.sort() + for tensor_and_expert_parallel_ranks in tp_ep_rearrange: + tensor_and_expert_parallel_ranks = list(tensor_and_expert_parallel_ranks) + tensor_and_expert_parallel_ranks.sort() + tp_ep_group.append(tensor_and_expert_parallel_ranks) + # 'pp edp ep cp tp -> (pp ep cp tp) edp' + edp_rearrange = torch.transpose(all_ranks, 1, 4) + edp_rearrange = torch.reshape(edp_rearrange, (-1, data_parallel_size // expert_model_parallel_size)) + edp_rearrange = edp_rearrange.tolist() + edp_rearrange.sort() + for expert_data_parallel_ranks in edp_rearrange: + expert_data_parallel_ranks = list(expert_data_parallel_ranks) + expert_data_parallel_ranks.sort() + dp_no_ep_group.append(expert_data_parallel_ranks) + # 'pp edp ep cp tp -> (pp ep tp) (cp edp)' + edp_cp_rearrange = torch.transpose(all_ranks, 1, 2) + edp_cp_rearrange = torch.transpose(edp_cp_rearrange, 2, 4) + edp_cp_rearrange = torch.reshape( + edp_cp_rearrange, + (-1, context_parallel_size * data_parallel_size // expert_model_parallel_size) + ) + edp_cp_rearrange = edp_cp_rearrange.tolist() + edp_cp_rearrange.sort() + for expert_data_parallel_ranksj_with_cp in edp_cp_rearrange: + expert_data_parallel_ranksj_with_cp = list(expert_data_parallel_ranksj_with_cp) + expert_data_parallel_ranksj_with_cp.sort() + dp_no_ep_group_with_cp.append(expert_data_parallel_ranksj_with_cp) + + return ( + dp_groups, + dp_groups_with_cp, + cp_group, + mp_group, + tp_group, + pp_group, + tp_dp_group, + tp_dp_cp_group, + tp_ep_group, + dp_no_ep_group, + dp_no_ep_group_with_cp, + ) + + world_size = nodes * num_gpu + dp = world_size // (tp * pp * cp) + assert dp % ep == 0, f"dp size ({dp}) is not divisible by ep {ep} ." + assert ( + world_size % (tp * pp * cp) == 0 + ), f"world_size ({world_size}) is not divisible by tp {tp} x pp {pp} x cp {cp}." + ( + dp_groups, + dp_groups_with_cp, + cp_group, + mp_group, + tp_group, + pp_group, + tp_dp_group, + tp_dp_cp_group, + tp_ep_group, + dp_no_ep_group, + dp_no_ep_group_with_cp, + ) = golden_rank_result_from_past_code( + world_size=world_size, + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + context_parallel_size=cp, + expert_model_parallel_size=ep, + ) + rank_generator = ps.RankGenerator(tp=tp, ep=ep, dp=dp, pp=pp, cp=cp, order="tp-cp-ep-dp-pp",) + assert dp_groups == rank_generator.get_ranks( + "dp" + ), f"{dp_groups} != {rank_generator.get_ranks('dp')}" + assert dp_groups_with_cp == rank_generator.get_ranks( + 'dp-cp' + ), f"{dp_groups_with_cp} != {rank_generator.get_ranks('dp-cp')}" + assert cp_group == rank_generator.get_ranks( + "cp" + ), f"{cp_group} != {rank_generator.get_ranks('cp')}." + assert mp_group == rank_generator.get_ranks( + "tp-pp" + ), f"{mp_group} != {rank_generator.get_ranks('tp-pp')}" + assert tp_group == rank_generator.get_ranks( + "tp" + ), f"{tp_group} != {rank_generator.get_ranks('tp')}" + assert pp_group == rank_generator.get_ranks( + "pp" + ), f"{pp_group} != {rank_generator.get_ranks('pp')}" + assert tp_dp_group == rank_generator.get_ranks( + "tp-dp" + ), f"{tp_dp_group} != {rank_generator.get_ranks('tp-dp')}" + assert tp_dp_cp_group == rank_generator.get_ranks( + "tp-dp-cp" + ), f"{tp_dp_cp_group} != {rank_generator.get_ranks('tp-dp-cp')}" + assert tp_ep_group == rank_generator.get_ranks( + "tp-ep", independent_ep=True + ), f"{tp_ep_group} != {rank_generator.get_ranks('tp-ep', independent_ep=True)}." + assert dp_no_ep_group == rank_generator.get_ranks( + "dp", independent_ep=True + ), f"{dp_no_ep_group} != {rank_generator.get_ranks('dp', independent_ep=True)}." + assert dp_no_ep_group_with_cp == rank_generator.get_ranks( + "dp-cp", independent_ep=True + ), f"{dp_no_ep_group_with_cp} != {rank_generator.get_ranks('dp-cp', independent_ep=True)}." diff --git a/tests/unit_tests/test_training.py b/tests/unit_tests/test_training.py new file mode 100644 index 0000000..bc2f9ef --- /dev/null +++ b/tests/unit_tests/test_training.py @@ -0,0 +1,43 @@ +from types import SimpleNamespace + +from megatron.training.global_vars import set_args +from megatron.training.training import build_train_valid_test_data_iterators +from tests.unit_tests.test_utilities import Utils + + +def mock_train_valid_test_datasets_provider(train_val_test_num_samples): + return 1, 2, 3 + + +def create_test_args(): + # Set dummy values for the args. + args = SimpleNamespace() + args.iteration = 0 + args.train_samples = 1 + args.train_iters = 1 + args.eval_interval = 1 + args.eval_iters = 1 + args.global_batch_size = 1 + args.consumed_train_samples = 1 + args.consumed_valid_samples = 1 + args.dataloader_type = "external" + args.skip_train = False + + return args + + +class TestTraining: + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + args = create_test_args() + set_args(args) + + def test_build_train_valid_test_data_iterators(self): + train_iter, valid_iter, test_iter = build_train_valid_test_data_iterators( + mock_train_valid_test_datasets_provider + ) + + assert (train_iter, valid_iter, test_iter) == (1, 2, 3) + + def teardown_method(self, method): + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/test_utilities.py b/tests/unit_tests/test_utilities.py new file mode 100644 index 0000000..d59a92e --- /dev/null +++ b/tests/unit_tests/test_utilities.py @@ -0,0 +1,82 @@ +import os +import torch +import megatron.core.parallel_state as ps + + +class TestModel(torch.nn.Module): + def __init__(self, input_dim: int, output_dim: int, num_layers: int, bias: bool): + super().__init__() + self.layers = torch.nn.ModuleList( + [torch.nn.Linear(input_dim, output_dim, bias) for _ in range(num_layers)] + ) + + +class Utils: + + world_size = torch.cuda.device_count() + rank = int(os.environ['LOCAL_RANK']) + inited = False + + @staticmethod + def initialize_distributed(): + if not torch.distributed.is_initialized() and Utils.rank >= 0: + print( + f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}' + ) + torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + torch.distributed.init_process_group( + backend='nccl', + world_size=Utils.world_size, + rank=Utils.rank, + init_method=init_method, + ) + + torch.distributed.barrier() + Utils.inited = True + + @staticmethod + def set_world_size(world_size=None, rank=None): + Utils.world_size = torch.cuda.device_count() if world_size is None else world_size + if ( + torch.distributed.is_initialized() + and Utils.world_size != torch.distributed.get_world_size() + ): + torch.distributed.destroy_process_group() + + if rank is None: + Utils.rank = int(os.environ['LOCAL_RANK']) + if Utils.rank >= Utils.world_size: + Utils.rank = -1 + else: + Utils.rank = rank + + @staticmethod + def destroy_model_parallel(): + if not Utils.inited: + return + ps.destroy_model_parallel() + torch.distributed.barrier() + Utils.inited = False + + @staticmethod + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + **kwargs, + ): + ps.destroy_model_parallel() + Utils.initialize_distributed() + ps.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + **kwargs, + ) + Utils.inited = True diff --git a/tests/unit_tests/test_utils.py b/tests/unit_tests/test_utils.py new file mode 100644 index 0000000..509b33b --- /dev/null +++ b/tests/unit_tests/test_utils.py @@ -0,0 +1,183 @@ +import os +import time +import urllib.request as req + +import numpy as np +import pytest +import torch + +import megatron.core.utils as util +from tests.unit_tests.test_utilities import Utils + + +def test_divide_properly(): + assert util.divide(4,2) == 2 + +def test_divide_improperly(): + with pytest.raises(AssertionError): + util.divide(4,5) + +def test_global_memory_buffer(): + global_memory_buffer = util.GlobalMemoryBuffer() + obtained_tensor = global_memory_buffer.get_tensor((3,2), torch.float32, "test_tensor") + expected_tensor = torch.empty((3,2), dtype=torch.float32, device=torch.cuda.current_device()) + assert obtained_tensor.shape == expected_tensor.shape + +def test_make_viewless_tensor(): + inp = torch.rand((3,4)) + assert(torch.equal(inp, util.make_viewless_tensor(inp, True, True))) + assert(torch.equal(inp, util.make_viewless_tensor(inp, True, False))) + +def test_safely_set_viewless_tensor_data(): + tensor = torch.zeros((3,4)) + new_data_tensor = torch.tensor(np.random.rand(3,4)) + util.safely_set_viewless_tensor_data(tensor, new_data_tensor) + assert(torch.equal(tensor, new_data_tensor)) + +def test_assert_viewless_tensor(): + tensor = torch.rand((3,4)) + assert(torch.equal(util.assert_viewless_tensor(tensor), tensor)) + input_tensor_list=[tensor,tensor,tensor] + output_tensor_list = util.assert_viewless_tensor(input_tensor_list) + for inp,out in zip(input_tensor_list, output_tensor_list): + assert(torch.equal(inp,out)) + +# Initialize torch.distributed; do not call init_process_group here, call +# Utils.initialize_distributed() instead. +def _init_distributed(world, rank): + Utils.initialize_distributed() + assert torch.distributed.is_initialized() == True + assert torch.distributed.get_rank() == rank + assert torch.cuda.device_count() == world + torch.distributed.barrier() + +# Deinitialization and cleanup. +# Do not call torch.distributed.destroy_process_group, may be needed by other tests. +def _deinit_distributed(): + assert torch.distributed.is_initialized() == True + torch.distributed.barrier() + +def test_check_param_hashes_across_dp_replicas(): + world = int(os.getenv('WORLD_SIZE', '1')) + rank = int(os.getenv('RANK', '0')) + + # Setup. + _init_distributed(world, rank) + Utils.initialize_model_parallel() + model = torch.nn.Linear(100, 100, bias=False) + + # First check case where all replicas agree. + model.weight.data.fill_(1.0) + assert util.check_param_hashes_across_dp_replicas([model]) + + # Now check case where replica 0 disagrees with all other replicas. + if rank == 0: + model.weight.data.fill_(0.0) + param_hashes_match = util.check_param_hashes_across_dp_replicas([model]) + expected_param_hashes_match = (rank == 0) + assert param_hashes_match == expected_param_hashes_match + + # Teardown. + _deinit_distributed() + + +def test_straggler_detector(): + world = int(os.getenv('WORLD_SIZE', '1')) + rank = int(os.getenv('RANK', '0')) + master = os.getenv('MASTER_ADDR', 'localhost') + port = 65535 + + # Checks if the instance is disabled. + def straggler_detector_disabled(): + assert stimer.enabled == False + + # Checks if the instance is enabled. + def straggler_detector_enabled(): + assert stimer.enabled == True + + # Enable. + def straggler_detector_enable(): + if rank == 0: + resp = req.urlopen(f"http://{master}:{port}").read().decode().split() + assert resp[3] == "ON" + # Call the report function, this will propagate the change. + stimer.report() + + # Time an operation. + def straggler_detector_timeit(): + s = 2 # Sleep for 2 seconds. + M = 20 + K = 30 + N = 40 + mat1 = torch.randn(M, K, device='cuda') + mat2 = torch.randn(K, N, device='cuda') + # batch_data. + with stimer(bdata=True): + time.sleep(s) + # GEMM. + with stimer: + res = torch.matmul(mat1, mat2) + delta, batch_delta, _, _, _, _, = stimer.elapsed() + assert delta > 0.0 + assert batch_delta >= s + + # Test function to raise ValueError + def straggler_value_error(): + raise ValueError("Exception value raised") + + # Check that exception is not suppressed. + def straggler_detector_exception_propagate(): + # batch_data + with pytest.raises(ZeroDivisionError): + with stimer(bdata=True): + x = 1 / 0 + # non-batch-data + with pytest.raises(ValueError, match=r".* value .*"): + with stimer(): + straggler_value_error() + + # Reporting. + def straggler_detector_report(): + s = 2 # Sleep for 2 seconds. + N = 20 + P = 30 + M = 40 + mat1 = torch.randn(N, P, device='cuda') + mat2 = torch.randn(P, M, device='cuda') + tfp = (N * M) * (2 * P - 1) # Theoretical. + iter = 10 # Mock. + # batch_data. + with stimer(bdata=True): + time.sleep(s) + # GEMM. + with stimer: + res = torch.matmul(mat1, mat2) + r = stimer.report(total_flops=tfp, log_interval=iter) + rb = True if rank == 0 else False + assert r == rb + + # Start test. + # Setup. + _init_distributed(world, rank) + + # Create a straggler_detector with enabled set to false. + stimer = util.StragglerDetector() + stimer.configure(world, rank, enabled=False, port=port) + # Check if configuration was success. + assert stimer.configured == True + + # Check if the instance is in disabled state. + straggler_detector_disabled() + # Enable it now, must call report. + straggler_detector_enable() + # Check if all ranks have straggler detector enabled. + straggler_detector_enabled() + # Time some operation. + straggler_detector_timeit() + # Report only from rank 0. + straggler_detector_report() + # Check that exception is not suppressed. + straggler_detector_exception_propagate() + + # Teardown. + _deinit_distributed() diff --git a/tests/unit_tests/transformer/__init__.py b/tests/unit_tests/transformer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/transformer/moe/__init__.py b/tests/unit_tests/transformer/moe/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py new file mode 100644 index 0000000..38eb9aa --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py @@ -0,0 +1,84 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core.transformer.moe.moe_utils import permute, unpermute +from tests.unit_tests.test_utilities import Utils +from tests.unit_tests.transformer.moe.test_token_dispatcher import MoEModelTestContainer + +class TestAlltoAllDispatcher: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(120) + @pytest.mark.parametrize("tp_size,ep_size", [ + (1, 8), + (8, 1), + (4, 2), + (1, 1), + ]) + def test_forward_backward(self, tp_size, ep_size): + container = MoEModelTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall", + ) + container.dispatcher_dropless_test() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(120) + @pytest.mark.parametrize("tp_size,ep_size", [ + (1, 8), + (8, 1), + (4, 2), + (1, 1), + ]) + def test_capacity_forward_backward(self, tp_size, ep_size): + container = MoEModelTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall", + moe_token_drop_policy="probs", + moe_expert_capacity_factor=0.5, + moe_pad_expert_input_to_capacity=False, + ) + container.dispacher_capacity_test() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(120) + @pytest.mark.parametrize("tp_size,ep_size", [ + (1, 8), + (8, 1), + (4, 2), + (1, 1) + ]) + def test_capacity_padding_forward_backward(self, tp_size, ep_size): + import time + time.sleep(5) + container = MoEModelTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall", + moe_token_drop_policy="probs", + moe_expert_capacity_factor=0.5, + moe_pad_expert_input_to_capacity=True, + ) + container.dispatcher_drop_and_pad_test() + diff --git a/tests/unit_tests/transformer/moe/test_aux_loss.py b/tests/unit_tests/transformer/moe/test_aux_loss.py new file mode 100644 index 0000000..217a0a2 --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_aux_loss.py @@ -0,0 +1,97 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch +from megatron.core.transformer.moe.moe_utils import clear_aux_losses_tracker + +from tests.unit_tests.test_utilities import Utils +from tests.unit_tests.transformer.moe.test_token_dispatcher import MoEModelTestContainer +from megatron.core import parallel_state + +class AuxlossTestContainer(MoEModelTestContainer): + def partition_input(self, input): + partitioned_input = input.chunk(parallel_state.get_tensor_and_context_parallel_world_size(), dim=1)[parallel_state.get_tensor_and_context_parallel_rank()] + output = partitioned_input.clone().detach() + output.requires_grad = True + return output + + def aux_loss_test(self, input, baseline_grad): + partitioned_input = self.partition_input(input) + moe_layer = self.moe_layer + probs, indices = moe_layer.router(partitioned_input) + probs.sum().mul_(0).backward() + aux_loss_grad = partitioned_input.grad + torch.distributed.barrier() + ans = self.partition_input(baseline_grad) + assert torch.allclose(aux_loss_grad, ans), f"Diff: {(aux_loss_grad/ans).mean()}" + loss = parallel_state.get_moe_layer_wise_logging_tracker()['load_balancing_loss'] + clear_aux_losses_tracker() + +class TestAuxLoss: + def setup_method(self, method): + baseline_container = AuxlossTestContainer( + tp_size=1, + ep_size=1, + pp_size=1, + cp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall", + moe_aux_loss_coeff=0.1, + ) + moe_layer = baseline_container.moe_layer + self.input = torch.randn((32, 8, moe_layer.config.hidden_size)).cuda() + self.input.requires_grad = True + probs, indices = moe_layer.router(self.input) + probs.sum().mul_(0).backward() # zero out the main gradients + self.baseline_grad = self.input.grad + self.input.grad = None + clear_aux_losses_tracker() + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.parametrize("tp_size,ep_size,cp_size", [ + (8, 1, 1), + (4, 2, 1), + (1, 1, 8), + (2, 1, 4), + (2, 2, 2), + ]) + def test_allgather_dispatcher(self, tp_size, ep_size, cp_size): + container = AuxlossTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + cp_size=cp_size, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="allgather", + moe_aux_loss_coeff=0.1, + ) + container.aux_loss_test(self.input, self.baseline_grad) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.parametrize("tp_size,ep_size,cp_size", [ + (8, 1, 1), + (4, 2, 1), + (1, 1, 8), + (2, 1, 4), + (2, 2, 2), + ]) + def test_a2a_dispatcher(self, tp_size, ep_size, cp_size): + container = AuxlossTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + cp_size=cp_size, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall", + moe_aux_loss_coeff=0.1, + ) + container.aux_loss_test(self.input, self.baseline_grad) diff --git a/tests/unit_tests/transformer/moe/test_grouped_mlp.py b/tests/unit_tests/transformer/moe/test_grouped_mlp.py new file mode 100644 index 0000000..b86edde --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_grouped_mlp.py @@ -0,0 +1,358 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +from pkg_resources import packaging +from importlib.metadata import version + +import torch +import torch.nn.functional as F + +from megatron.training.arguments import parse_args +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.transformer.moe import grouped_gemm_util as gg +from megatron.core.transformer.moe.moe_layer import MoELayer +from megatron.core.transformer.moe.experts import TEGroupedMLP +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.training.initialize import _set_random_seed +from megatron.legacy.model import Float16Module +from tests.unit_tests.test_utilities import Utils + +DEVICE_CAPABILITY = None +if torch.cuda.is_available(): + DEVICE_CAPABILITY = torch.cuda.get_device_capability() + +_te_version = packaging.version.Version(version("transformer-engine")) + + +class TestParallelGroupedMLP: + + def setup_method(self, method, use_cpu_initialization=False, swiglu=True): + print("============") + print("Test for use_cpu_initilization={} and swiglu={}.".format(use_cpu_initialization, swiglu)) + print("============") + Utils.initialize_model_parallel(1,1) + num_layers = 1 # 2 + self.hidden_size = 16 # must be an multiple of 16, otherwise trigger CUTLASS misaligned issue + self.num_experts = 2 + self.gated_linear_unit = swiglu + self.activation_func = F.silu if swiglu else F.gelu + self.use_cpu_initialization = use_cpu_initialization + + tf_config = TransformerConfig( + num_layers=num_layers, hidden_size=self.hidden_size, num_attention_heads=4, + num_moe_experts=self.num_experts, use_cpu_initialization=self.use_cpu_initialization, + add_bias_linear=False, gated_linear_unit=self.gated_linear_unit, + activation_func=self.activation_func, + bias_activation_fusion=False, + bf16=True, params_dtype=torch.bfloat16, moe_router_load_balancing_type="sinkhorn", moe_router_topk=1) + + self.fc1_ffn_hidden_size = tf_config.ffn_hidden_size + self.fc2_ffn_hidden_size = tf_config.ffn_hidden_size + # If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + if self.gated_linear_unit: + self.fc1_ffn_hidden_size *= 2 + + ## Vanilla sequential GEMM + # Set random seed for reproducability + _set_random_seed(seed_=123, data_parallel_random_init=False) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + self.num_experts, moe_grouped_gemm=False) + self.sequential_mlp = MoELayer(tf_config, + transformer_layer_spec.submodules.mlp.submodules) + + self.args = parse_args(ignore_unknown_args=True) + self.args.bf16=True + # Bias is not supported in grouped gemm currently, thus we disable the + # bias in the linear layer. + self.args.add_bias_linear=False + self.sequential_mlp = Float16Module(self.sequential_mlp, self.args).module + print("done intializing for sequential gemm") + + ## Grouped GEMM + _set_random_seed(seed_=123, data_parallel_random_init=False) + tf_config.moe_grouped_gemm = True + self.grouped_mlp = MoELayer(tf_config) + self.grouped_mlp = Float16Module(self.grouped_mlp, self.args).module + print("done intializing for grouped gemm") + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.sequential_mlp, MoELayer) + assert isinstance(self.grouped_mlp, MoELayer) + + num_weights_smm = sum([p.numel() for p in self.sequential_mlp.parameters()]) + num_weights_gmm = sum([p.numel() for p in self.grouped_mlp.parameters()]) + + # For the same hyper-parm model configs except the `moe_grouped_gemm`, + # GroupedGEMM and sequential GEMMs should hold the same number of parms. + assert num_weights_smm == num_weights_gmm + # expected num weights: router linear weights+bias + MLP weights(no bias) of all experts + expected_num_weights = \ + self.hidden_size * self.num_experts + \ + self.hidden_size * (self.fc1_ffn_hidden_size + self.fc2_ffn_hidden_size) * self.num_experts + assert num_weights_smm == expected_num_weights + + assert torch.equal(self.sequential_mlp.router.weight, self.grouped_mlp.router.weight) + + # weight1: [h, num_experts*4h] + # weight2: [num_experts*4h, h] + assert self.grouped_mlp.experts.weight1.shape[0] == self.hidden_size + assert self.grouped_mlp.experts.weight1.shape[1] == self.num_experts * self.fc1_ffn_hidden_size + if self.gated_linear_unit: + assert self.grouped_mlp.experts.weight2.shape[0] == self.num_experts * self.fc2_ffn_hidden_size + assert self.grouped_mlp.experts.weight2.shape[1] == self.hidden_size + else: + assert self.grouped_mlp.experts.weight1.shape == self.grouped_mlp.experts.weight2.t().shape + + def test_weight_init_value_the_same(self): + gmm_w1 = self.grouped_mlp.experts.weight1.view(self.num_experts, -1, self.hidden_size) + gmm_w2 = self.grouped_mlp.experts.weight2.view(self.num_experts, self.hidden_size, -1) + gmm_expert1_fc1 = gmm_w1[0] + gmm_expert1_fc2 = gmm_w2[0] + gmm_expert2_fc1 = gmm_w1[1] + gmm_expert2_fc2 = gmm_w2[1] + + smm_expert1_fc1 = self.sequential_mlp.experts.local_experts[0].linear_fc1.weight + smm_expert1_fc2 = self.sequential_mlp.experts.local_experts[0].linear_fc2.weight + smm_expert2_fc1 = self.sequential_mlp.experts.local_experts[1].linear_fc1.weight + smm_expert2_fc2 = self.sequential_mlp.experts.local_experts[1].linear_fc2.weight + + assert torch.equal(gmm_expert1_fc1, smm_expert1_fc1) + if not self.use_cpu_initialization: + assert torch.equal(gmm_expert1_fc2, smm_expert1_fc2) + # the param init value is not exactly the same between gmm and smm (refer to test_weight_init_value_the_same.) + # TODO: is it necessary to keep smm and gmm share exactly the same init params? + # assert torch.equal(gmm_expert2_fc1, smm_expert2_fc1) + if self.use_cpu_initialization: + assert torch.equal(gmm_expert2_fc2, smm_expert2_fc2) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.skipif( + not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8, reason='GroupedGEMM kernels are not supported on this device.' + ) + def test_gpu_forward(self): + self.sequential_mlp.cuda() + self.grouped_mlp.cuda() + # [sequence length, batch size, hidden size] + seq_len = 3 #32 + batch_size = 2 + hidden_states = torch.rand( + (seq_len, batch_size, self.sequential_mlp.config.hidden_size), + dtype=torch.bfloat16) + hidden_states = hidden_states.cuda() + output_smm, _ = self.sequential_mlp(hidden_states) + output_gmm, _ = self.grouped_mlp(hidden_states) + + # The following assert fails due to the param init value is not exactly + # the same between gmm and smm (refer to test_weight_init_value_the_same.) + # assert torch.equal(output_smm, output_gmm) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.skipif( + not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8, reason='GroupedGEMM kernels are not supported on this device.' + ) + def test_gpu_forward_with_no_tokens_allocated(self): + """Test the case when no token is allocated for groupedGEMM kernels.""" + w1 = self.grouped_mlp.experts.weight1.view(self.num_experts, -1, self.hidden_size) + num_allocated_tokens = 0 + tokens_per_expert = torch.zeros(self.num_experts) + hidden_states = torch.rand((num_allocated_tokens, self.hidden_size), dtype=torch.bfloat16) + hidden_states = hidden_states.cuda() + try: + gg.ops.gmm(hidden_states, w1, tokens_per_expert, trans_b=False) + except Exception as e: + print("Expected error message from groupedGEMM:", e) + assert str(e) == "Input batch_sizes should not be all zeros!" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.skipif( + not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8, reason='GroupedGEMM kernels are not supported on this device.' + ) + def test_gradient_with_no_tokens_allocated(self): + """Test that when no token is passed in, the parameters of the grouped MLP will also have gradients.""" + self.grouped_mlp.cuda() + num_allocated_tokens = 0 + tokens_per_expert = torch.zeros(self.num_experts) + hidden_states = torch.rand((num_allocated_tokens, self.hidden_size), dtype=torch.bfloat16) + hidden_states = hidden_states.cuda() + output_gmm, _ = self.grouped_mlp.experts( + hidden_states, + tokens_per_expert=tokens_per_expert, + ) + output_gmm.mean().backward() + assert self.grouped_mlp.experts.weight1.grad is not None + + +@pytest.mark.skipif( + _te_version < packaging.version.Version("1.9.0.dev0"), + reason="TE Grouped MLP is only supported in TE 1.9.0.dev0 and later.", +) +class TestTEGroupedMLP: + + def setup_method(self, method, use_cpu_initialization=False, swiglu=True): + Utils.initialize_model_parallel(1, 1) + num_layers = 1 + self.hidden_size = 16 + self.num_experts = 2 + self.gated_linear_unit = swiglu + self.activation_func = F.silu if swiglu else F.gelu + self.use_cpu_initialization = use_cpu_initialization + + tf_config = TransformerConfig( + num_layers=num_layers, + hidden_size=self.hidden_size, + num_attention_heads=4, + num_moe_experts=self.num_experts, + use_cpu_initialization=self.use_cpu_initialization, + add_bias_linear=False, + gated_linear_unit=self.gated_linear_unit, + activation_func=self.activation_func, + bias_activation_fusion=False, + bf16=True, + params_dtype=torch.bfloat16, + moe_router_load_balancing_type="sinkhorn", + moe_router_topk=1, + ) + + self.fc1_ffn_hidden_size = tf_config.ffn_hidden_size + self.fc2_ffn_hidden_size = tf_config.ffn_hidden_size + # If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + if self.gated_linear_unit: + self.fc1_ffn_hidden_size *= 2 + + ## Vanilla sequential GEMM + # Set random seed for reproducability + _set_random_seed(seed_=123, data_parallel_random_init=False) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + self.num_experts, moe_grouped_gemm=False + ) + self.sequential_mlp = MoELayer(tf_config, transformer_layer_spec.submodules.mlp.submodules) + + self.args = parse_args(ignore_unknown_args=True) + self.args.bf16 = True + # Bias is not supported in grouped gemm currently, thus we disable the + # bias in the linear layer. + self.args.add_bias_linear = False + self.sequential_mlp = Float16Module(self.sequential_mlp, self.args).module + + ## Grouped GEMM + _set_random_seed(seed_=123, data_parallel_random_init=False) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + self.num_experts, moe_grouped_gemm=True + ) + tf_config.moe_grouped_gemm = True + self.grouped_mlp = MoELayer(tf_config, transformer_layer_spec.submodules.mlp.submodules) + assert isinstance(self.grouped_mlp.experts, TEGroupedMLP) + self.grouped_mlp = Float16Module(self.grouped_mlp, self.args).module + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.sequential_mlp, MoELayer) + assert isinstance(self.grouped_mlp, MoELayer) + + num_weights_smm = sum([p.numel() for p in self.sequential_mlp.parameters()]) + num_weights_gmm = sum([p.numel() for p in self.grouped_mlp.parameters()]) + + # For the same hyper-parm model configs except the `moe_grouped_gemm`, + # GroupedGEMM and sequential GEMMs should hold the same number of parms. + assert num_weights_smm == num_weights_gmm + # expected num weights: router linear weights+bias + MLP weights(no bias) of all experts + expected_num_weights = ( + self.hidden_size * self.num_experts + + self.hidden_size + * (self.fc1_ffn_hidden_size + self.fc2_ffn_hidden_size) + * self.num_experts + ) + assert num_weights_smm == expected_num_weights + + assert torch.equal(self.sequential_mlp.router.weight, self.grouped_mlp.router.weight) + + # weights of linear_fc1: [fc1_ffn_hidden_size, hidden_size] + # weights of linear_fc2: [hidden_size, fc2_ffn_hidden_size] + for i in range(self.num_experts): + assert getattr(self.grouped_mlp.experts.linear_fc1, f"weight{i}").shape == ( + self.fc1_ffn_hidden_size, + self.hidden_size, + ) + assert getattr(self.grouped_mlp.experts.linear_fc2, f"weight{i}").shape == ( + self.hidden_size, + self.fc2_ffn_hidden_size, + ) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_gpu_forward_backward(self): + self.sequential_mlp.cuda() + self.grouped_mlp.cuda() + # Copy the weights to ensure the same init value + with torch.no_grad(): + for i in range(self.num_experts): + self.sequential_mlp.experts.local_experts[i].linear_fc1.weight.copy_( + getattr(self.grouped_mlp.experts.linear_fc1, f"weight{i}") + ) + self.sequential_mlp.experts.local_experts[i].linear_fc2.weight.copy_( + getattr(self.grouped_mlp.experts.linear_fc2, f"weight{i}") + ) + # [sequence length, batch size, hidden size] + seq_len = 32 + batch_size = 2 + hidden_states = torch.rand( + (seq_len, batch_size, self.hidden_size), + dtype=torch.bfloat16, + device="cuda", + requires_grad=True, + ) + hidden_states.retain_grad() + + output_smm, _ = self.sequential_mlp(hidden_states) + output_smm.mean().backward() + smm_results = [output_smm, hidden_states.grad] + for i in range(self.num_experts): + smm_results.append(self.sequential_mlp.experts.local_experts[i].linear_fc1.weight.grad) + smm_results.append(self.sequential_mlp.experts.local_experts[i].linear_fc2.weight.grad) + + hidden_states.grad = None + output_gmm, _ = self.grouped_mlp(hidden_states) + output_gmm.mean().backward() + gmm_results = [output_gmm, hidden_states.grad] + for i in range(self.num_experts): + gmm_results.append(getattr(self.grouped_mlp.experts.linear_fc1, f"weight{i}").grad) + gmm_results.append(getattr(self.grouped_mlp.experts.linear_fc2, f"weight{i}").grad) + + for smm_result, gmm_result in zip(smm_results, gmm_results): + torch.testing.assert_close(smm_result, gmm_result) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_gpu_forward_backward_with_no_tokens_allocated(self): + """Test the case when no token is allocated for groupedGEMM kernels.""" + self.grouped_mlp.cuda() + num_allocated_tokens = 0 + tokens_per_expert = torch.zeros(self.num_experts, dtype=torch.int32) + hidden_states = torch.rand((num_allocated_tokens, self.hidden_size), dtype=torch.bfloat16) + hidden_states = hidden_states.cuda() + output, _ = self.grouped_mlp.experts(hidden_states, tokens_per_expert=tokens_per_expert) + assert torch.equal(output, torch.zeros_like(output)) + assert output.shape == (num_allocated_tokens, self.hidden_size) + + output.mean().backward() + for i in range(self.num_experts): + assert getattr(self.grouped_mlp.experts.linear_fc1, f"weight{i}").grad is not None + assert getattr(self.grouped_mlp.experts.linear_fc2, f"weight{i}").grad is not None + + +if __name__ == "__main__": + for use_cpu_unitilization in [True, False]: + for swiglu in [True, False]: + GMLP_test = TestParallelGroupedMLP() + GMLP_test.setup_method( + method=None, + use_cpu_initialization=use_cpu_unitilization, + swiglu=swiglu) + GMLP_test.test_constructor() + GMLP_test.test_weight_init_value_the_same() + GMLP_test.test_gpu_forward() + GMLP_test.test_gpu_forward_with_no_tokens_allocated() + GMLP_test.teardown_method(method=None) diff --git a/tests/unit_tests/transformer/moe/test_routers.py b/tests/unit_tests/transformer/moe/test_routers.py new file mode 100644 index 0000000..fbeb744 --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_routers.py @@ -0,0 +1,91 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.transformer.moe.router import Router +from megatron.training.initialize import _set_random_seed +from tests.unit_tests.test_utilities import Utils +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.moe.moe_layer import MoELayer +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + + +class TestTop2Router: + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + _set_random_seed(seed_=123, data_parallel_random_init=False) + print("done intializing") + num_moe_experts = 4 + self.transformer_config = TransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + use_cpu_initialization=True, + moe_router_load_balancing_type="aux_loss", + moe_router_topk=2, + moe_aux_loss_coeff=0, + ) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, moe_grouped_gemm=False + ) + self.sequential_mlp = MoELayer( + self.transformer_config, transformer_layer_spec.submodules.mlp.submodules + ) + self.router = self.sequential_mlp.router + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.router, Router) + + num_weights = sum([p.numel() for p in self.router.parameters()]) + assert num_weights == 12 * 4, num_weights + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.parametrize("moe_router_pre_softmax", [ + (True), + (False), + ]) + def test_router_forward(self, moe_router_pre_softmax): + with torch.no_grad(): + self.router = self.router.cuda() + self.router.config.moe_router_pre_softmax = moe_router_pre_softmax + # [num tokens, hidden size] + hidden_states = torch.randn((32, 2, self.router.config.hidden_size)) + hidden_states = hidden_states.cuda() + scores, indices = self.router(hidden_states) + print(scores.shape, indices.shape) + assert scores.shape == (64, 2) + assert indices.shape == (64, 2) + print( + (indices == 0).sum(), (indices == 1).sum(), (indices == 2).sum(), (indices == 3).sum() + ) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_aux_loss(self): + self.sequential_mlp = self.sequential_mlp.cuda() + + # Without aux loss + hidden_states = torch.randn((32, 2, self.router.config.hidden_size)) + hidden_states = hidden_states.cuda() + out = self.sequential_mlp(hidden_states)[0] + out.sum().mul_(0).backward() + assert self.sequential_mlp.router.weight.grad.abs().sum() == 0 + + # With aux loss + self.transformer_config.moe_aux_loss_coeff = 1 + out = self.sequential_mlp(hidden_states)[0] + out.sum().mul_(0).backward() + assert self.sequential_mlp.router.weight.grad.abs().sum() > 0 + + # With Z loss + self.transformer_config.moe_aux_loss_coeff = 0 + self.transformer_config.moe_z_loss_coeff = 1 + self.sequential_mlp.router.weight.grad.fill_(0) + out = self.sequential_mlp(hidden_states)[0] + out.sum().mul_(0).backward() + assert self.sequential_mlp.router.weight.grad.abs().sum() > 0 \ No newline at end of file diff --git a/tests/unit_tests/transformer/moe/test_sequential_mlp.py b/tests/unit_tests/transformer/moe/test_sequential_mlp.py new file mode 100644 index 0000000..0ebb853 --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_sequential_mlp.py @@ -0,0 +1,61 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.transformer.moe.moe_layer import MoELayer +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + +class TestParallelSequentialMLP: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + print("done intializing") + num_moe_experts = 2 + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + use_cpu_initialization=True, + activation_func=torch.nn.functional.silu, + gated_linear_unit=True, + bias_activation_fusion=True, + moe_router_load_balancing_type="sinkhorn", + moe_router_topk=1 + ) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, moe_grouped_gemm=False) + self.sequential_mlp = MoELayer(transformer_config, transformer_layer_spec.submodules.mlp.submodules) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.sequential_mlp, MoELayer) + + num_weights = sum([p.numel() for p in self.sequential_mlp.parameters()]) + assert num_weights == 3696 + + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_gpu_forward(self): + sequential_mlp = self.sequential_mlp + sequential_mlp.cuda() + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((32, 2, sequential_mlp.config.hidden_size)) + hidden_states = hidden_states.cuda() + output, output_bias = sequential_mlp(hidden_states) + assert output.shape[0] == 32 + assert output.shape[1] == 2 + assert output.shape[2] == sequential_mlp.config.hidden_size + assert output_bias.shape[2] == sequential_mlp.config.hidden_size + assert output.dtype == torch.float32 + assert output.device.type == 'cuda' + assert output_bias.device.type == 'cuda' + diff --git a/tests/unit_tests/transformer/moe/test_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_token_dispatcher.py new file mode 100644 index 0000000..f538414 --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_token_dispatcher.py @@ -0,0 +1,290 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch +from megatron.core import parallel_state + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.transformer.moe.moe_layer import MoELayer +from megatron.core.transformer.moe.moe_utils import permute, unpermute +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.training.initialize import _set_random_seed +from tests.unit_tests.test_utilities import Utils + + +class MoEModelTestContainer: + def __init__( + self, + tp_size, + ep_size, + pp_size, + cp_size=1, + data_parallel_random_init=False, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall", + moe_expert_capacity_factor=None, + moe_pad_expert_input_to_capacity=False, + moe_aux_loss_coeff=0.1, + **kwargs, + ): + self.num_local_experts = num_moe_experts // ep_size + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp_size, + pipeline_model_parallel_size=pp_size, + expert_model_parallel_size=ep_size, + context_parallel_size=cp_size + ) + _set_random_seed(seed_=123, data_parallel_random_init=data_parallel_random_init) + local_expert_indices_offset = ( + parallel_state.get_expert_model_parallel_rank() * self.num_local_experts + ) + self.local_expert_indices = [ + local_expert_indices_offset + i for i in range(self.num_local_experts) + ] + + self.config = TransformerConfig( + tensor_model_parallel_size=tp_size, + expert_model_parallel_size=ep_size, + pipeline_model_parallel_size=pp_size, + context_parallel_size=cp_size, + moe_router_topk=moe_router_topk, + num_moe_experts=num_moe_experts, + moe_router_load_balancing_type=moe_router_load_balancing_type, + moe_token_dispatcher_type=moe_token_dispatcher_type, + moe_expert_capacity_factor=moe_expert_capacity_factor, + moe_pad_expert_input_to_capacity=moe_pad_expert_input_to_capacity, + moe_aux_loss_coeff=moe_aux_loss_coeff, + num_layers=1, + moe_extended_tp=kwargs.get("moe_extended_tp", False), + moe_grouped_gemm=kwargs.get("moe_grouped_gemm", False), + hidden_size=kwargs.get("hidden_size", 1024), + num_attention_heads=kwargs.get("num_attention_heads", 8), + use_cpu_initialization=kwargs.get("use_cpu_initialization", True), + sequence_parallel=tp_size > 1, + add_bias_linear=kwargs.get("add_bias_linear", False), + ) + + # init moe layer + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, moe_grouped_gemm=kwargs.get("moe_grouped_gemm", False) + ) + self.moe_layer = MoELayer( + self.config, transformer_layer_spec.submodules.mlp.submodules + ).cuda() + self.moe_layer.set_layer_number(0) + + def __del__(self): + torch.distributed.barrier() + torch.cuda.synchronize() + Utils.destroy_model_parallel() + + def dispatcher_dropless_test(self): + moe_layer = self.moe_layer + bs = 32 + seql = 8 + hidden_states = torch.randn((bs, seql, moe_layer.config.hidden_size)) + hidden_states = hidden_states.cuda() + hidden_states.requires_grad = True + probs, indices = moe_layer.router(hidden_states) + probs = torch.ones_like(probs) / moe_layer.router.topk + + ## Uncomment these lines to assist in bug location. + # hidden_states = torch.ones_like(hidden_states) * torch.distributed.get_rank() + # hidden_states.requires_grad = True + # indices = torch.ones_like(indices) * torch.distributed.get_rank() + # print(permuted_local_hidden_states) + + ( + permuted_local_hidden_states, + tokens_per_expert, + ) = moe_layer.token_dispatcher.token_permutation( + hidden_states, probs, indices + ) + + permuted_local_hidden_states /= moe_layer.config.tensor_model_parallel_size + + restored_hidden_states, restored_bias = moe_layer.token_dispatcher.token_unpermutation( + permuted_local_hidden_states + ) + + assert torch.allclose( + restored_hidden_states, hidden_states + ), "Restored hidden states do not match original hidden states" + + # check if the grad of the hidden states is same as the hidden states + torch.autograd.backward(restored_hidden_states, restored_hidden_states) + assert torch.allclose( + hidden_states.grad, hidden_states + ), "Gradient of hidden states should be same as hidden states" + + def dispacher_capacity_test(self): + moe_layer = self.moe_layer + hidden_states = torch.randn((256, moe_layer.config.hidden_size)) + hidden_states = hidden_states.cuda() + hidden_states.requires_grad = True + probs, indices = moe_layer.router(hidden_states) + tp_size = moe_layer.config.tensor_model_parallel_size + tp_rank = parallel_state.get_tensor_model_parallel_rank() + + # Create the answer. + prob_mask = probs != 0 + probs = torch.ones_like(probs) * prob_mask / moe_layer.router.topk + local_probss = probs[ + probs.size(0) // tp_size * (tp_rank) : probs.size(0) // tp_size * (tp_rank + 1) + ] + restored_hidden_states_answer = hidden_states * local_probss.sum(dim=1).unsqueeze(1) + + ( + permuted_local_hidden_states, + tokens_per_expert, + ) = moe_layer.token_dispatcher.token_permutation( + hidden_states, probs, indices + ) + + print(f"Dispatched tokens per expert: {tokens_per_expert}") + + permuted_local_hidden_states /= moe_layer.config.tensor_model_parallel_size + + restored_hidden_states, restored_bias = moe_layer.token_dispatcher.token_unpermutation( + permuted_local_hidden_states + ) + assert torch.allclose( + restored_hidden_states, restored_hidden_states_answer + ), "Restored hidden states does not match" + + # check if the grad of the hidden states is same as the hidden states + torch.autograd.backward(restored_hidden_states, hidden_states) + assert torch.allclose( + hidden_states.grad, restored_hidden_states_answer + ), "Gradient of hidden states should be same as hidden states" + + def dispatcher_drop_and_pad_test(self): + "Test if the tokens are dropped and padded correctly" + moe_layer = self.moe_layer + hidden_states = torch.randn((256, moe_layer.config.hidden_size)).cuda() + hidden_states.requires_grad = True + + # Create the answer. + moe_layer.config.moe_pad_expert_input_to_capacity = False + moe_layer.token_dispatcher.drop_and_pad = False + + # Uncomment these lines to help bug location. + # hidden_states = torch.ones((8, moe_layer.config.hidden_size)).cuda() + # hidden_states = hidden_states * torch.range(1, 8).unsqueeze(1).cuda() + # hidden_states.requires_grad = True + # indices_1 = torch.tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]).cuda() + # probs_1 = torch.ones_like(indices_1) + # indices_2 = torch.tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]).cuda() + # probs_2 = torch.ones_like(indices_2) + # num_local_tokens_per_expert = torch.tensor([2, 2, 2, 2, 2, 2, 2, 2]).cuda() + + probs_1, indices_1 = moe_layer.router(hidden_states) + (permuted_input_1, tokens_per_expert,) = moe_layer.token_dispatcher.token_permutation( + hidden_states, probs_1, indices_1 + ) + torch.distributed.barrier() + forward_answer, restored_bias = moe_layer.token_dispatcher.token_unpermutation( + permuted_input_1 + ) + torch.autograd.backward(forward_answer, forward_answer) + backward_answer = hidden_states.grad.clone() + hidden_states.grad = None + torch.cuda.synchronize() + moe_layer.token_dispatcher.drop_and_pad = True + moe_layer.config.moe_pad_expert_input_to_capacity = True + # End + + probs_2, indices_2 = moe_layer.router(hidden_states) + (permuted_input_2, tokens_per_expert,) = moe_layer.token_dispatcher.token_permutation( + hidden_states, probs_2, indices_2 + ) + restored_hidden_states, restored_bias = moe_layer.token_dispatcher.token_unpermutation( + permuted_input_2 + ) + torch.distributed.barrier() + assert torch.allclose( + restored_hidden_states, forward_answer + ), "Restored hidden states does not match" + + # check if the grad of the hidden states is same as the hidden states + torch.autograd.backward(restored_hidden_states, restored_hidden_states) + assert torch.allclose( + hidden_states.grad, backward_answer + ), "Gradient of hidden states should be same as hidden states" + + def set_params(self): + # TODO: Set consistent parameters for various parallelisms. + raise NotImplementedError + + def destroy(self): + Utils.destroy_model_parallel() + + +class TestAllgatherDispatcher: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.parametrize("tp_size,ep_size", [ + (8, 1), + ]) + def test_forward_backward(self, tp_size, ep_size): + container = MoEModelTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="allgather", + ) + container.dispatcher_dropless_test() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_extended_tp_forward_backward(self): + container = MoEModelTestContainer( + tp_size=2, + ep_size=4, + pp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="allgather", + sequence_parallel=True, + moe_extended_tp=True, + moe_grouped_gemm=True, + use_cpu_initialization=False, + ) + moe_layer = container.moe_layer + # [bs, seql, hidden size] + hidden_states = torch.randn((32, 8, moe_layer.router.config.hidden_size)) + hidden_states = hidden_states.cuda() + hidden_states.requires_grad = True + scores, indices = moe_layer.router(hidden_states) + assert scores.shape == (256, moe_layer.router.topk), "Scores shape is not correct" + assert indices.shape == (256, moe_layer.router.topk), "Indices shape is not correct" + scores = torch.ones_like(scores) / 2 + ( + permuted_local_hidden_states, + tokens_per_expert, + ) = moe_layer.token_dispatcher.token_permutation(hidden_states, scores, indices) + permuted_local_hidden_states /= moe_layer.config.tensor_model_parallel_size * moe_layer.config.expert_model_parallel_size + restored_hidden_states, restored_bias = moe_layer.token_dispatcher.token_unpermutation( + permuted_local_hidden_states, bias=torch.zeros_like(permuted_local_hidden_states), + ) + + assert torch.allclose( + restored_hidden_states, hidden_states + ), "Restored hidden states do not match original hidden states" + + # check if the grad of the hidden states is same as the hidden states + torch.autograd.backward(restored_hidden_states, restored_hidden_states) + assert torch.allclose( + hidden_states.grad, hidden_states + ), "Gradient of hidden states should be same as hidden states" + container.destroy() diff --git a/tests/unit_tests/transformer/test_attention.py b/tests/unit_tests/transformer/test_attention.py new file mode 100644 index 0000000..4a5680e --- /dev/null +++ b/tests/unit_tests/transformer/test_attention.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.transformer.attention import SelfAttention +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + +class TestParallelAttention: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + self.transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) + self.parallel_attention = SelfAttention(self.transformer_config, + get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules, + layer_number=1) + + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.parallel_attention, SelfAttention) + assert self.parallel_attention.layer_number == 1 + + num_weights = sum([p.numel() for p in self.parallel_attention.parameters()]) + assert num_weights == 648 + + def test_cpu_forward(self): + # we can't currently do this because the global memory buffer is on GPU + pass + + def test_gpu_forward(self): + + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 2 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + output, bias = self.parallel_attention(hidden_states, attention_mask) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + + def test_fused_rope_gpu_forward(self): + self.parallel_attention.config.apply_rope_fusion = True + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 2 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + rotary_pos_emb = torch.ones(sequence_length, 1, 1, self.parallel_attention.config.kv_channels).cuda() + output, bias = self.parallel_attention(hidden_states, attention_mask, rotary_pos_emb=rotary_pos_emb) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + self.parallel_attention.config.apply_rope_fusion = False + + + def test_checkpointed_gpu_forward(self): + transformer_config = self.transformer_config + transformer_config.recompute_granularity='selective' + checkpointed_parallel_attention = SelfAttention(transformer_config, + get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules, + layer_number=1) + config = checkpointed_parallel_attention.config + + sequence_length = 32 + micro_batch_size = 2 + + checkpointed_parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, checkpointed_parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + output, bias = checkpointed_parallel_attention(hidden_states, attention_mask) + + assert config.recompute_granularity == 'selective' + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size diff --git a/tests/unit_tests/transformer/test_attention_packed_seq.py b/tests/unit_tests/transformer/test_attention_packed_seq.py new file mode 100644 index 0000000..c8be7db --- /dev/null +++ b/tests/unit_tests/transformer/test_attention_packed_seq.py @@ -0,0 +1,131 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.transformer.attention import SelfAttention +from megatron.core.transformer.enums import AttnMaskType +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + +# Note: this test requires TE >= 0.13 as well as Flash Attention to run +# FIXME this unit test doesn't work in the current test container. to be fixed soon +""" +def make_test_packed_seq_params(sequence_length): + cu_seqlens = torch.IntTensor([0, 6, 19, 22, sequence_length]).cuda() + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] + max_seqlen, _ = seqlens.max(dim=0, keepdim=True) + packed_seq_params = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_kv=max_seqlen, + qkv_format='thd', + ) + return packed_seq_params + + +class TestParallelAttentionWithPackedSequence: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + # use BF16 and a large enough hidden size to enable FlashAttention for thd format. + self.transformer_config = TransformerConfig(num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True, + bf16=True, params_dtype=torch.bfloat16, + pipeline_dtype=torch.bfloat16, autocast_dtype=torch.bfloat16) + self.parallel_attention = SelfAttention(self.transformer_config, + get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules, + layer_number=1, + attn_mask_type=AttnMaskType.causal) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_cpu_forward(self): + # we can't currently do this because the global memory buffer is on GPU + pass + + def test_gpu_forward(self): + + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 1 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size)) + hidden_states = hidden_states.cuda().to(torch.bfloat16) + + attention_mask = None + + packed_seq_params = make_test_packed_seq_params(sequence_length) + output, bias = self.parallel_attention(hidden_states, attention_mask, packed_seq_params=packed_seq_params) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + + def test_fused_rope_gpu_forward(self): + self.parallel_attention.config.apply_rope_fusion = True + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 1 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size)) + hidden_states = hidden_states.cuda().to(torch.bfloat16) + + attention_mask = None + rotary_pos_emb = torch.ones(sequence_length, 1, 1, self.parallel_attention.config.kv_channels).cuda() + + packed_seq_params = make_test_packed_seq_params(sequence_length) + output, bias = self.parallel_attention(hidden_states, attention_mask, packed_seq_params=packed_seq_params) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + self.parallel_attention.config.apply_rope_fusion = False + + def test_checkpointed_gpu_forward(self): + transformer_config = self.transformer_config + transformer_config.recompute_granularity='selective' + checkpointed_parallel_attention = SelfAttention(transformer_config, + get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules, + layer_number=1, + attn_mask_type=AttnMaskType.causal) + config = checkpointed_parallel_attention.config + + sequence_length = 32 + micro_batch_size = 1 + + checkpointed_parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, checkpointed_parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda().to(torch.bfloat16) + + attention_mask = None + + packed_seq_params = make_test_packed_seq_params(sequence_length) + output, bias = checkpointed_parallel_attention(hidden_states, attention_mask, packed_seq_params=packed_seq_params) + + assert config.recompute_granularity == 'selective' + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size +""" \ No newline at end of file diff --git a/tests/unit_tests/transformer/test_core_attention.py b/tests/unit_tests/transformer/test_core_attention.py new file mode 100644 index 0000000..2966b98 --- /dev/null +++ b/tests/unit_tests/transformer/test_core_attention.py @@ -0,0 +1,64 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + + +import pytest + +import torch + +from megatron.core.transformer.attention import CrossAttention +""" + +@pytest.fixture +def core_attention(transformer_config): + return CrossAttention(transformer_config) + + +class TestCoreAttention: + def test_constructor(self, core_attention): + assert isinstance(core_attention, CrossAttention) + assert core_attention.layer_number == 1 + + num_weights = sum([p.numel() for p in core_attention.parameters()]) + assert num_weights == 0 + + def test_cpu_forward(self, core_attention): + # we can't currently do this because the global memory buffer is on GPU + pass + + def test_gpu_forward(self, core_attention): + + # destroy_global_memory_buffer() + # _set_global_memory_buffer() + # model_parallel_cuda_manual_seed(123) + + core_attention.cuda() + config = core_attention.config + sequence_length = 32 + micro_batch_size = 2 + # query_layer (float): [sequence_length, micro_batch_size, num_attention_heads, hidden_size / num_attention_heads] + query_layer = torch.ones( + ( + sequence_length, + micro_batch_size, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ) + ).cuda() + + key_layer = torch.ones_like(query_layer).cuda() + + value_layer = torch.ones_like(query_layer).cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + context_layer = core_attention( + query_layer=query_layer, key_layer=key_layer, value_layer=value_layer, attention_mask=attention_mask + ) + + assert context_layer.shape[0] == sequence_length + assert context_layer.shape[1] == micro_batch_size + assert context_layer.shape[2] == config.hidden_size + assert context_layer.device.type == 'cuda' + assert context_layer.dtype == torch.float32 + +""" \ No newline at end of file diff --git a/tests/unit_tests/transformer/test_mlp.py b/tests/unit_tests/transformer/test_mlp.py new file mode 100644 index 0000000..8e3f146 --- /dev/null +++ b/tests/unit_tests/transformer/test_mlp.py @@ -0,0 +1,58 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.transformer.mlp import MLP +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec + +class TestParallelMLP: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) + self.mlp = MLP(transformer_config, + get_gpt_layer_local_spec().submodules.mlp.submodules) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.mlp, MLP) + + num_weights = sum([p.numel() for p in self.mlp.parameters()]) + assert num_weights == 1212 + + """ + def test_cpu_forward(self, mlp): + # [sequence length, micro batch size, hidden size] + hidden_states = torch.ones((32, 2, mlp.config.hidden_size)) + output, output_bias = mlp(hidden_states) + assert output.shape[0] == 32 + assert output.shape[1] == 2 + assert output.shape[2] == mlp.config.hidden_size + assert output_bias.shape[0] == mlp.config.hidden_size + assert output.dtype == torch.float32 + """ + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_gpu_forward(self): + mlp = self.mlp + mlp.cuda() + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((32, 2, mlp.config.hidden_size)) + hidden_states = hidden_states.cuda() + output, output_bias = mlp(hidden_states) + assert output.shape[0] == 32 + assert output.shape[1] == 2 + assert output.shape[2] == mlp.config.hidden_size + assert output_bias.shape[0] == mlp.config.hidden_size + assert output.dtype == torch.float32 + assert output.device.type == 'cuda' + assert output_bias.device.type == 'cuda' + diff --git a/tests/unit_tests/transformer/test_module.py b/tests/unit_tests/transformer/test_module.py new file mode 100644 index 0000000..b530709 --- /dev/null +++ b/tests/unit_tests/transformer/test_module.py @@ -0,0 +1,98 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest + +import torch + +from megatron.core.transformer.module import Float16Module, MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed + +DEVICE_CAPABILITY = None +if torch.cuda.is_available(): + DEVICE_CAPABILITY = torch.cuda.get_device_capability() + + +class DummyModule(MegatronModule): + # def __init__(self, config: TransformerConfig, share_embeddings_and_output_weights=True): + def __init__(self, config: TransformerConfig): + super().__init__(config) + + self.linear = torch.nn.modules.Linear(in_features=2, out_features=1) + + def forward(self, x): + return self.linear(x) + +class TestMegatronModule: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) + self.megatron_module = DummyModule(config=transformer_config).cuda() + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_megatron_module(self): + megatron_module = self.megatron_module + assert megatron_module + assert megatron_module.config.hidden_size == 12 + assert megatron_module.config.ffn_hidden_size == 48 + assert megatron_module.linear.weight.dtype == torch.float32 + + x = torch.ones((2, 2)).cuda() + assert megatron_module(x).dtype == torch.float32 + + # TODO: test bad configs actually fail + # failed_module = megatron_module + # failed_module.fp16 = True + # failed_module.bf16 = True + + +class TestFloat16Module: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + self.transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) + self.megatron_module = DummyModule(config=self.transformer_config).cuda() + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_fp16_module(self): + transformer_config = self.transformer_config + megatron_module = self.megatron_module + transformer_config.fp16 = True + fp16_module = Float16Module(config=transformer_config, module=megatron_module) + + assert fp16_module + assert fp16_module.config.hidden_size == 12 + assert fp16_module.config.ffn_hidden_size == 48 + assert fp16_module.module.linear.weight.dtype == torch.float16 + + x = torch.ones((2, 2)).cuda() + # inputs are converted to fp16 then outputs are converted to fp32 + assert fp16_module(x).dtype == torch.float32 + + pytest.mark.skipif( + not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8, reason='bfloat16 is not supported on this device' + ) + + def test_bf16_module(self): + transformer_config = self.transformer_config + megatron_module = self.megatron_module + transformer_config.bf16 = True + bf16_module = Float16Module(config=transformer_config, module=megatron_module) + + assert bf16_module + assert bf16_module.config.hidden_size == 12 + assert bf16_module.config.ffn_hidden_size == 48 + assert bf16_module.module.linear.weight.dtype == torch.bfloat16 + + x = torch.ones((2, 2)).cuda() + # inputs are converted to bf16 then outputs are converted to fp32 + assert bf16_module(x).dtype == torch.float32 + diff --git a/tests/unit_tests/transformer/test_retro_attention.py b/tests/unit_tests/transformer/test_retro_attention.py new file mode 100644 index 0000000..11ec7d5 --- /dev/null +++ b/tests/unit_tests/transformer/test_retro_attention.py @@ -0,0 +1,207 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import torch +import types + +from megatron.core.models.retro import RetroConfig, get_retro_decoder_block_spec +from megatron.core.models.retro.decoder_attention import ( + RetroDecoderCrossAttention, + RetroDecoderBiasDropoutAdd, +) +from megatron.core.models.retro.encoder_attention import ( + RetroEncoderCrossAttention, + RetroEncoderBiasDropoutAdd, + RetroEncoderLayerNorm, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_block import TransformerBlock +from tests.unit_tests.test_utilities import Utils + + +class TestRetroAttention: + + @classmethod + def get_config(cls): + return RetroConfig( + num_layers=12, + hidden_size=16, + num_attention_heads=4, + use_cpu_initialization=True, + retro_num_neighbors=2, + retro_chunk_length=4, + retro_retrieved_length=8, + retro_split_preprocessing="98,2,0", + ) + + @classmethod + def get_modules(cls, config, use_transformer_engine, use_gpu): + + # Retro decoder layer. + decoder_block_spec = get_retro_decoder_block_spec( + config, use_transformer_engine=use_transformer_engine) + decoder_block = TransformerBlock(config=config, spec=decoder_block_spec) + decoder_layers = [ layer for layer in decoder_block.layers if isinstance(layer.cross_attention, RetroDecoderCrossAttention) ] + decoder_layer = decoder_layers[0] + + # Retro encoder layer. + encoder_block = decoder_layer.cross_attention.encoder + encoder_layers = [ layer for layer in encoder_block.layers if isinstance(layer.cross_attention, RetroEncoderCrossAttention) ] + encoder_layer = encoder_layers[0] + + # Modules. + modules = types.SimpleNamespace( + decoder_attn = decoder_layer.cross_attention, + decoder_bda = decoder_layer.cross_attn_bda, + encoder_attn = encoder_layer.cross_attention, + encoder_bda = encoder_layer.cross_attn_bda, + encoder_norm = encoder_layer.pre_mlp_layernorm, + ) + + # GPU. + if use_gpu: + [ m.cuda() for m in vars(modules).values() ] + + return modules + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + + config = self.get_config() + modules = self.get_modules( + config, + use_transformer_engine=True, + use_gpu=False, + ) + + assert isinstance(modules.decoder_attn, RetroDecoderCrossAttention) + assert isinstance(modules.decoder_bda, RetroDecoderBiasDropoutAdd) + assert isinstance(modules.encoder_attn, RetroEncoderCrossAttention) + assert isinstance(modules.encoder_bda, RetroEncoderBiasDropoutAdd) + assert isinstance(modules.encoder_norm, RetroEncoderLayerNorm) + + assert modules.decoder_attn.attn.layer_number == 6 + assert modules.encoder_attn.attn.layer_number == 1 + + get_nparams = lambda m : sum(p.numel() for p in m.parameters()) + assert get_nparams(modules.decoder_attn) == 8768 + assert get_nparams(modules.decoder_bda) == 0 + assert get_nparams(modules.encoder_attn) == 1088 + assert get_nparams(modules.encoder_bda) == 0 + assert get_nparams(modules.encoder_norm) == 32 + + def test_cpu_forward(self): + # we can't currently do this because the global memory buffer is on GPU + pass + + def run_gpu_forward(self, recompute_granularity, use_transformer_engine): + + config = self.get_config() + config.recompute_granularity = recompute_granularity + modules = self.get_modules(config, use_transformer_engine, use_gpu=True) + + seq_length = 32 + micro_batch_size = 2 + n_chunks_per_sample = seq_length // config.retro_chunk_length + + # Init tensors. + hidden_states = torch.ones(( + seq_length, + micro_batch_size, + config.hidden_size, + )).cuda() + attention_mask = None + decoder_context = torch.ones(( + config.retro_retrieved_length, + config.retro_num_neighbors * micro_batch_size * n_chunks_per_sample, + config.hidden_size, + )).cuda() + encoder_context = torch.ones(( + config.retro_chunk_length, + micro_batch_size * n_chunks_per_sample, + config.hidden_size, + )).cuda() + + # Forward decoder. + decoder_attn_output = modules.decoder_attn( + hidden_states, + attention_mask, + decoder_context, + ) + with torch.enable_grad(): + decoder_bda_output = modules.decoder_bda(True, True)( + decoder_attn_output, + hidden_states, + config.hidden_dropout, + ) + + # Forward encoder. + encoder_attn_output_tuples = modules.encoder_attn( + decoder_context, + None, + encoder_context, + ) + with torch.enable_grad(): + encoder_bda_output = modules.encoder_bda(True, True)( + encoder_attn_output_tuples, + decoder_context, + config.retro_encoder_hidden_dropout, + ) + encoder_norm_output = modules.encoder_norm(encoder_bda_output) + + # Verify decoder. + assert set(decoder_attn_output.keys()) == set([ "ns", "bs", "d", "l", "pad", "attention_output", "attention_bias", "context"]) + assert decoder_attn_output["ns"] == seq_length + assert decoder_attn_output["bs"] == micro_batch_size + assert decoder_attn_output["d"] == config.hidden_size + assert decoder_attn_output["l"] == n_chunks_per_sample + assert decoder_attn_output["pad"] == 3 + assert tuple(decoder_attn_output["attention_output"].shape) == ( + config.retro_chunk_length, + micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + assert tuple(decoder_attn_output["attention_bias"].shape) == ( + config.hidden_size, + ) + assert decoder_attn_output["context"].shape == ( + config.retro_retrieved_length * config.retro_num_neighbors, + micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + assert decoder_bda_output.shape == hidden_states.shape + + # Verify encoder. + assert len(encoder_attn_output_tuples) == config.retro_num_neighbors + for output, bias, residual in encoder_attn_output_tuples: + assert tuple(output.shape) == ( + config.retro_retrieved_length, + micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + assert tuple(bias.shape) == (config.hidden_size,) + assert tuple(residual.shape) == ( + config.retro_retrieved_length, + micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + assert encoder_bda_output.shape == ( + config.retro_retrieved_length, + config.retro_num_neighbors * micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + assert encoder_norm_output.shape == ( + config.retro_retrieved_length, + config.retro_num_neighbors * micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + + def test_gpu_forward(self): + for recompute_granularity in (None, 'selective'): + for use_transformer_engine in (True, False): + self.run_gpu_forward(recompute_granularity, use_transformer_engine) diff --git a/tests/unit_tests/transformer/test_rope.py b/tests/unit_tests/transformer/test_rope.py new file mode 100644 index 0000000..f166180 --- /dev/null +++ b/tests/unit_tests/transformer/test_rope.py @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from tests.unit_tests.test_utilities import Utils + + +class TestRotaryEmbedding: + def setup_method(self): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + self.kv_channels = 8 + self.rotary_percent = 1.0 + self.rope_cpu_init = RotaryEmbedding( + self.kv_channels, self.rotary_percent, use_cpu_initialization=True + ) + self.rope_gpu_init = RotaryEmbedding( + self.kv_channels, self.rotary_percent, use_cpu_initialization=False + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_constructor(self): + assert isinstance(self.rope_cpu_init, RotaryEmbedding) + assert self.rope_cpu_init.inv_freq.device.type == 'cpu' + assert isinstance(self.rope_gpu_init, RotaryEmbedding) + assert self.rope_gpu_init.inv_freq.device.type == 'cuda' + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_gpu_forward(self): + output = self.rope_gpu_init(64) + assert output.shape[0] == 64 + assert output.shape[1] == 1 + assert output.shape[2] == 1 + assert output.shape[3] == self.kv_channels + assert output.dtype == torch.float32 + assert output.device.type == 'cuda' + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_cpu_forward(self): + output = self.rope_cpu_init(64) + assert output.shape[0] == 64 + assert output.shape[1] == 1 + assert output.shape[2] == 1 + assert output.shape[3] == self.kv_channels + assert output.dtype == torch.float32 + assert output.device.type == 'cuda' diff --git a/tests/unit_tests/transformer/test_spec_customization.py b/tests/unit_tests/transformer/test_spec_customization.py new file mode 100644 index 0000000..f0ee9e7 --- /dev/null +++ b/tests/unit_tests/transformer/test_spec_customization.py @@ -0,0 +1,243 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import sys +from dataclasses import dataclass, fields +from importlib.metadata import version + +import pytest +import torch +import transformer_engine as te +from pkg_resources import packaging + +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TENorm, + TERowParallelLinear, +) +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp +from megatron.core.transformer.spec_utils import ModuleSpec, build_module, import_module +from megatron.core.transformer.transformer_block import TransformerBlock, TransformerBlockSubmodules +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules +from tests.unit_tests.test_utilities import Utils + + +class TestSpecCustomization: + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + self.config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + + # specify Transformer Layer spec with all identity ops + self.transformer_layer_spec = TransformerLayerSubmodules() + + # specify attention spec using already imported class + self.attention_spec = ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ) + + # specify layernorm spec with module path to test dynamic importing + self.layernorm_spec = ModuleSpec( + module=("megatron.core.transformer.custom_layers.transformer_engine", "TENorm"), + ) + + # specify bias dropout add with module path + self.bda_spec = ModuleSpec( + module=("megatron.core.fusions.fused_bias_dropout", "get_bias_dropout_add") + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_import_module(self): + self_attention_cls = import_module( + module_path=('megatron.core.transformer.attention', 'SelfAttention') + ) + assert id(self_attention_cls) == id(SelfAttention) + + layernorm_cls = import_module(module_path=self.layernorm_spec.module) + assert id(layernorm_cls) == id(TENorm) + + def test_build_module(self): + # Check NoOp TransformerLayer + random_input = 12 + noop_transformer_layer = [ + build_module(getattr(self.transformer_layer_spec, field.name)) + for field in fields(self.transformer_layer_spec) + if field.name != 'sharded_state_dict_keys_map' + ] + + x = random_input + for mod in noop_transformer_layer: + # checking for `IdentityFuncOp` before `IdentityOp` because former + # is derived from the latter and so the second if statement will + # always be `True`. + if isinstance(mod, IdentityFuncOp): + x = mod()(x) + elif isinstance(mod, IdentityOp): + x = mod(x) + + assert x == random_input + + # Check SelfAttention + self_attention = build_module(self.attention_spec, config=self.config, layer_number=1,) + assert isinstance(self_attention, SelfAttention) + assert self_attention.layer_number == 1 + assert self_attention.attn_mask_type == self.attention_spec.params['attn_mask_type'] + + num_weights = sum([p.numel() for p in self_attention.parameters()]) + assert num_weights == 648 + + # Check SelfAttention but with already initialized module + # `self_attention`. In this test, `build_module` acts as a no op as it + # simply returns the initialized module. + # NOTE: (sudhakars) Uncomment this test once this feature gets added + # back. + # self_attention2 = build_module( + # self_attention, config=self.config, spec=self.attention_spec, + # ) + # assert isinstance(self_attention2, SelfAttention) + # assert self_attention2.layer_number == 1 + # assert self_attention2.attn_mask_type == self.attention_spec.params['attn_mask_type'] + + # num_weights = sum([p.numel() for p in self_attention2.parameters()]) + # assert num_weights == 648 + + # Check LayerNorm + layernorm = build_module( + self.layernorm_spec, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + assert isinstance(layernorm, te.pytorch.LayerNorm) + + # Check BiasDropoutAdd + bda_op = build_module(self.bda_spec) + assert id(bda_op) == id(get_bias_dropout_add) + + def test_sliding_window_attention(self): + te_version = packaging.version.Version(version("transformer-engine")) + if te_version < packaging.version.Version("1.2.0"): + print("SWA not tested because TE version is not >= 1.2.0", file=sys.stderr) + return + + config = TransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + use_cpu_initialization=True, + window_size=[10, 0], + ) + # Make sure DotProductAttention throws (swa unsupported). + threw = False + try: + attn = DotProductAttention( + config, layer_number=1, attn_mask_type=AttnMaskType.causal, attention_type='self' + ) + except: + threw = True + finally: + assert threw, 'Expected DotProductAttention to throw exception for SWA' + + # Test TEDotProductAttention + attn = TEDotProductAttention( + config, layer_number=1, attn_mask_type=AttnMaskType.causal, attention_type='self' + ) + # Make sure window-size is what we expect. + assert attn.window_size == config.window_size + + # Single integer window-size unsupported, make sure it throws + threw = False + try: + config.window_size = 11 + attn = TEDotProductAttention( + config, layer_number=1, attn_mask_type=AttnMaskType.causal, attention_type='self' + ) + except: + threw = True + finally: + assert threw, "Expected TEDotProductAttention to throw for integer window-size" + + # `None` makes this causal. + config.window_size = None + attn = TEDotProductAttention( + config, layer_number=1, attn_mask_type=AttnMaskType.causal, attention_type='self' + ) + # Make sure it's causal. + assert attn.window_size == (-1, 0) + + def test_transformer_block_custom(self): + """ + This test checks that the two ways of passing `layer_spec` to a + `TransformerBlock` result in an identical model: + 1. ModuleSpec(module=..., submodules=...) + 2. TransformerBlockSubmodules(layer_specs=...) + """ + + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + layer_local_spec = get_gpt_layer_local_spec() + + # The following way can be used to pass a different `TransformerLayer` + # and internally the `TransformerBlock` would fan out the single + # `ModuleSpec` layer spec provided to all the layers of the block. + layer_spec1 = ModuleSpec(module=TransformerLayer, submodules=layer_local_spec.submodules) + model_parallel_cuda_manual_seed(123) + torch.manual_seed(0) + parallel_transformer_block1 = TransformerBlock(transformer_config, layer_spec1) + + layer_spec2 = TransformerBlockSubmodules( + layer_specs=[ + ModuleSpec(module=TransformerLayer, submodules=layer_local_spec.submodules) + ] + * transformer_config.num_layers, + layer_norm=TENorm, + ) + # make sure the model init conditions are identical + model_parallel_cuda_manual_seed(123) + torch.manual_seed(0) + parallel_transformer_block2 = TransformerBlock(transformer_config, layer_spec2) + + sequence_length = 32 + micro_batch_size = 2 + parallel_transformer_block1.cuda() + parallel_transformer_block2.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, transformer_config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + out1 = parallel_transformer_block1( + hidden_states=hidden_states, attention_mask=attention_mask + ) + out2 = parallel_transformer_block2( + hidden_states=hidden_states, attention_mask=attention_mask + ) + + assert torch.all(torch.eq(out1, out2)) + assert out1.shape[0] == sequence_length == out2.shape[0] + assert out1.shape[1] == micro_batch_size == out2.shape[1] + assert out1.shape[2] == transformer_config.hidden_size == out2.shape[2] diff --git a/tests/unit_tests/transformer/test_transformer_block.py b/tests/unit_tests/transformer/test_transformer_block.py new file mode 100644 index 0000000..ad681ac --- /dev/null +++ b/tests/unit_tests/transformer/test_transformer_block.py @@ -0,0 +1,107 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import os +import pytest + +import torch +from megatron.core import dist_checkpointing + +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayer +from megatron.core.transformer.transformer_block import TransformerBlock +from tests.unit_tests.test_utilities import Utils +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + +class TestParallelTransformerBlock: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + self.transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) + self.parallel_transformer_block = TransformerBlock(self.transformer_config, + get_gpt_layer_with_transformer_engine_spec()) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + parallel_transformer_block = self.parallel_transformer_block + assert isinstance(parallel_transformer_block, TransformerBlock) + num_weights = sum([p.numel() for p in parallel_transformer_block.parameters()]) + assert num_weights == 3792 + assert parallel_transformer_block.num_layers_per_pipeline_rank == 2 + assert len(parallel_transformer_block.layers) == 2 + layer_0: TransformerLayer = parallel_transformer_block._get_layer(0) + assert layer_0.layer_number == 1 + layer_1: TransformerLayer = parallel_transformer_block._get_layer(1) + assert layer_1.layer_number == 2 + + def test_gpu_forward(self): + parallel_transformer_block = self.parallel_transformer_block + config: TransformerConfig = parallel_transformer_block.config + + sequence_length = 32 + micro_batch_size = 2 + parallel_transformer_block.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + hidden_states = parallel_transformer_block(hidden_states=hidden_states, attention_mask=attention_mask) + assert hidden_states.shape[0] == sequence_length + assert hidden_states.shape[1] == micro_batch_size + assert hidden_states.shape[2] == config.hidden_size + + def test_gpu_forward_full_checkpoint(self): + transformer_config = self.transformer_config + config = transformer_config + config.recompute_granularity = 'full' + config.recompute_method = 'block' + config.recompute_num_layers = config.num_layers + full_transformer_block = TransformerBlock(config, + get_gpt_layer_with_transformer_engine_spec()) + assert full_transformer_block.config.recompute_granularity == 'full' + assert full_transformer_block.config.recompute_method == 'block' + + sequence_length = 32 + micro_batch_size = 2 + full_transformer_block.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + hidden_states = full_transformer_block(hidden_states=hidden_states, attention_mask=attention_mask) + assert hidden_states.shape[0] == sequence_length + assert hidden_states.shape[1] == micro_batch_size + assert hidden_states.shape[2] == config.hidden_size + + def test_gpu_forward_selective_checkpoint(self): + transformer_config = self.transformer_config + config = transformer_config + config.recompute_granularity = 'selective' + selective_transformer_block = TransformerBlock(config, + get_gpt_layer_with_transformer_engine_spec()) + assert selective_transformer_block.config.recompute_granularity == 'selective' + assert selective_transformer_block.checkpoint_core_attention + + sequence_length = 32 + micro_batch_size = 2 + selective_transformer_block.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + hidden_states = selective_transformer_block(hidden_states=hidden_states, attention_mask=attention_mask) + assert hidden_states.shape[0] == sequence_length + assert hidden_states.shape[1] == micro_batch_size + assert hidden_states.shape[2] == config.hidden_size diff --git a/tests/unit_tests/transformer/test_transformer_layer.py b/tests/unit_tests/transformer/test_transformer_layer.py new file mode 100644 index 0000000..31792db --- /dev/null +++ b/tests/unit_tests/transformer/test_transformer_layer.py @@ -0,0 +1,106 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + + +import pytest + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor +from megatron.core.transformer.transformer_layer import TransformerLayer +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from tests.unit_tests.test_utilities import Utils + + +class TestParallelTransformerLayer: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) + self.parallel_transformer_layer = TransformerLayer(transformer_config, + get_gpt_layer_with_transformer_engine_spec().submodules) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + parallel_transformer_layer = self.parallel_transformer_layer + assert isinstance(parallel_transformer_layer, TransformerLayer) + assert parallel_transformer_layer.layer_number == 1 + + num_weights = sum([p.numel() for p in parallel_transformer_layer.parameters()]) + assert num_weights == 1884 + + def test_gpu_forward(self): + parallel_transformer_layer = self.parallel_transformer_layer + config: TransformerConfig = parallel_transformer_layer.config + sequence_length = 32 + micro_batch_size = 2 + parallel_transformer_layer.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + hidden_states, context = parallel_transformer_layer(hidden_states=hidden_states, attention_mask=attention_mask) + assert hidden_states.shape[0] == sequence_length + assert hidden_states.shape[1] == micro_batch_size + assert hidden_states.shape[2] == config.hidden_size + + @pytest.mark.parametrize('order', ['tp-pp-dp', 'tp-dp-pp']) + @pytest.mark.parametrize('tp_pp', [(4, 2), (1, 1), (8, 1), (2, 2)]) + def test_sharded_state_dict(self, tp_pp, order): + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(*tp_pp, order=order) + + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig(num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True) + parallel_transformer_layer = TransformerLayer(transformer_config, + get_gpt_layer_with_transformer_engine_spec().submodules) + + sharded_state_dict = parallel_transformer_layer.sharded_state_dict() + + extra_states = {k: v for k, v in sharded_state_dict.items() if k.endswith('extra_state')} + sharded_tensors = {k: v for k, v in sharded_state_dict.items() if not k.endswith('extra_state')} + assert all(isinstance(t, ShardedObject) for t in extra_states.values()) + assert all(isinstance(t, ShardedTensor) for t in sharded_tensors.values()) + + # Test all local shapes + tensor_local_shapes = {k: v.local_shape for k, v in sharded_tensors.items()} + tp_size = parallel_state.get_tensor_model_parallel_world_size() + assert tensor_local_shapes == get_tensor_shapes_for_tp(transformer_config, tp_size) + + # Test all global shapes. Prepend num layers in front of expected shapes + tensor_global_shapes = {k: v.global_shape for k, v in sharded_tensors.items()} + expected_global_shapes = get_tensor_shapes_for_tp(transformer_config, 1) + assert tensor_global_shapes == expected_global_shapes + + # Test ShardedTensor keys + for state_dict_key, sh_ten in sharded_tensors.items(): + assert state_dict_key == sh_ten.key + + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(1, 1) + + +def get_tensor_shapes_for_tp(transformer_config, tp_size): + hs = transformer_config.hidden_size + return { + 'mlp.linear_fc1.layer_norm_weight': (hs,), + 'mlp.linear_fc1.layer_norm_bias': (hs,), + 'mlp.linear_fc1.weight': (hs * 4 // tp_size, hs), + 'mlp.linear_fc1.bias': (hs * 4 // tp_size,), + 'mlp.linear_fc2.weight': (hs, hs * 4 // tp_size), + 'mlp.linear_fc2.bias': (hs,), + 'self_attention.linear_proj.weight': (hs, hs // tp_size), + 'self_attention.linear_proj.bias': (hs,), + 'self_attention.linear_qkv.layer_norm_weight': (hs,), + 'self_attention.linear_qkv.layer_norm_bias': (hs,), + 'self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs), + 'self_attention.linear_qkv.bias': (hs * 3 // tp_size,), + } diff --git a/tools/autoformat.sh b/tools/autoformat.sh new file mode 100644 index 0000000..784a784 --- /dev/null +++ b/tools/autoformat.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -euox pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +CHECK_ONLY=${CHECK_ONLY:-false} +CHANGED_FILES=$(git diff --name-only --diff-filter=d --merge-base origin/main megatron/core | grep '\.py$' || true) +ADDITIONAL_ARGS="" +ADDITIONAL_BLACK_ARGS="" + +if [[ $CHECK_ONLY == true ]]; then + ADDITIONAL_ARGS="--check" + ADDITIONAL_BLACK_ARGS="--diff" +fi + +# for now we just format core +if [[ -n "$CHANGED_FILES" ]]; then + black $ADDITIONAL_ARGS $ADDITIONAL_BLACK_ARGS --verbose $CHANGED_FILES + isort $ADDITIONAL_ARGS $CHANGED_FILES +else + echo Changeset is empty, all good. +fi diff --git a/tools/bert_embedding/__init__.py b/tools/bert_embedding/__init__.py new file mode 100644 index 0000000..766a66b --- /dev/null +++ b/tools/bert_embedding/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from .embed import BertEmbedder, DiskDataParallelBertEmbedder diff --git a/tools/bert_embedding/dataset.py b/tools/bert_embedding/dataset.py new file mode 100644 index 0000000..da165b8 --- /dev/null +++ b/tools/bert_embedding/dataset.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import numpy as np +import torch + +from megatron.training import get_args, get_tokenizer + + +class BertEmbeddingDataset(torch.utils.data.Dataset): + '''Dataset to convert a text dataset to Bert tokens.''' + + def __init__(self, text_dataset, max_seq_length): + + super().__init__() + + args = get_args() + + # Dataset, tokenizer. + self.text_dataset = text_dataset + self.max_seq_length = max_seq_length + self.bert_tokenizer = get_tokenizer() + + def __len__(self): + return len(self.text_dataset) + + @classmethod + def build_sample(cls, tokenizer, token_ids): + get_constant_array = lambda c : np.full((len(token_ids) + 2,), c, "int64") + return { + "text" : np.array([ tokenizer.cls, *token_ids, tokenizer.sep ], dtype="int64"), + "types" : get_constant_array(0), + "labels" : get_constant_array(-1), + "is_random" : 0, + "loss_mask" : get_constant_array(0), + "padding_mask" : get_constant_array(1), + "truncated" : 0, + } + + def __getitem__(self, idx): + + # Text. + text_sample = self.text_dataset[idx] + text = text_sample["text"] + text = text.replace("<|endoftext|>", "") + + # Bert/Wordpiece tokens (+truncate). + bert_token_ids = self.bert_tokenizer.tokenize(text) + bert_token_ids = bert_token_ids[:self.max_seq_length - 2] # cls+sep. + if not bert_token_ids: + bert_token_ids = [ self.bert_tokenizer.pad_id ] # hack when empty seq + + # Bert sample. + sample = self.build_sample(self.bert_tokenizer, bert_token_ids) + + return sample diff --git a/tools/bert_embedding/embed.py b/tools/bert_embedding/embed.py new file mode 100644 index 0000000..2236182 --- /dev/null +++ b/tools/bert_embedding/embed.py @@ -0,0 +1,278 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from functools import partial +import numpy as np +import os +import time +import torch +from torch.utils.data import BatchSampler, DataLoader, SequentialSampler, Subset +from torch.utils.data._utils.collate import default_collate +from tqdm import tqdm + +from megatron.training import get_args, get_tokenizer, print_rank_0 +from megatron import core +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.datasets.retro.utils import get_blocks_by_rank +from megatron.core.enums import ModelType +from megatron.core.pipeline_parallel import get_forward_backward_func +from megatron.legacy.model import BertModel +from megatron.training.training import setup_model_and_optimizer +from pretrain_bert import model_provider, get_batch, loss_func, forward_step + +from .dataset import BertEmbeddingDataset +from .external_libs import h5py +from .huggingface import HuggingfaceEmbedder + + +def collate_batch(samples): + """Collate samples of various lengths. + + This collate function handles samples with various sequence lengths, by + padding 'text' arrays with pad_id, and other arrays with 0. + """ + + n_samples = len(samples) + keys = list(samples[0].keys()) + tokenizer = get_tokenizer() + + # Max sample length across all samples. + max_length_map = { key:0 for key in keys } + for sample in samples: + for key in keys: + value_length = \ + len(sample[key]) if isinstance(sample[key], np.ndarray) else None + max_length_map[key] = None \ + if value_length is None else \ + max(max_length_map[key], value_length) + + # Pad samples. + padded_samples = [] + for sample in samples: + padded_sample = {} + for key in keys: + padded_sample[key] = \ + np.pad( + sample[key], + (0, max_length_map[key] - len(sample[key])), + mode="constant", + constant_values=tokenizer.pad_id if key == "text" else 0, + ) \ + if isinstance(sample[key], np.ndarray) else \ + sample[key] + padded_samples.append(padded_sample) + + # Build batch with padded samples. + batch = default_collate(padded_samples) + + return batch + + +def get_data_loader(dataset, batch_size): + """Build data loader over data subset. + + Get a subset of the dataset (from start_idx -> end_idx), and wrap it in + a sequential sampler and data loader. + """ + + args = get_args() + + # Sequential & batch samplers. + batch_sampler = BatchSampler( + sampler=SequentialSampler(dataset), + batch_size=batch_size, + drop_last=False, + ) + + # Data loader. + data_loader = DataLoader(dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True, + collate_fn=collate_batch) + + return data_loader + + +def embed_data_loader(models, data_loader, tag): + '''Iterate data loader and compute embeddings.''' + + # Verify no model parallelism. + args = get_args() + assert args.tensor_model_parallel_size == 1 and \ + args.pipeline_model_parallel_size == 1, \ + "since we call forward_step directly, only tp == pp == 1 allowed." + + # Data iterator. + data_iterator = iter(data_loader) + + # Eval mode. + for m in models: + m.eval() + + # Embed. + embeddings = [] + for _ in tqdm( + range(len(data_loader)), + " embed%s" % ("" if tag is None else " / '%s'" % tag), + miniters=len(data_loader) // 10, + disable=torch.distributed.get_rank() != 0, + ): + with torch.no_grad(): + result = forward_step(data_iterator, models[0]) + embeddings.append(result[0].detach().cpu().numpy()) + + # Concatenate embeddings. + embeddings = np.concatenate(embeddings, axis=0) + + return embeddings + + +class TextDataset(torch.utils.data.Dataset): + '''Dataset that holds a list of strings.''' + + def __init__(self, texts): + assert isinstance(texts, list) + for t in texts: + assert isinstance(t, str) + self.texts = texts + + def __len__(self): + return len(self.texts) + + def __getitem__(self, i): + return {"text": self.texts[i]} + + +class BertEmbedder: + '''Compute Bert embeddings, from a text dataset.''' + + def __init__(self, batch_size, max_bert_seq_length, embedder_type, warmup=True): + + args = get_args() + + assert args.output_bert_embeddings + + self.models, optimizer, opt_param_scheduler = \ + setup_model_and_optimizer(model_provider, + ModelType.encoder_or_decoder) + self.batch_size = batch_size + self.max_bert_seq_length = max_bert_seq_length + + # Init Huggingface, if in use. + if embedder_type == "megatron": + self.huggingface_embedder = None + elif embedder_type == "huggingface": + self.huggingface_embedder = HuggingfaceEmbedder(batch_size, + max_bert_seq_length) + else: + raise Exception("specialize for embedder type '%s'." % embedder_type) + + # Warm-up JIT. + # - Important to separately warm up: + # 1. batch_size == 1 + # 2. batch_size > 1 + if warmup: + warmup_dataset = TextDataset([ + "great fleas have lesser fleas, upon their backs to bite’em,", + "and lesser fleas have lesser fleas, and so, ad infinitum,", + "and those great fleas, themselves, in turn have greater fleas to go on,", + "while those again have greater still, and greater still, and so on.", + ]) + print_rank_0("bert / warmup single.") + for _ in range(3): + self.embed_text("hi, bert.") # batch size == 1 + print_rank_0("bert / warmup batch.") + for _ in range(3): + self.embed_text_dataset(warmup_dataset) # batch size > 1 + + def embed_text_dataset(self, text_dataset, tag=None): + '''Embed a text dataset.''' + + # Huggingface. + if self.huggingface_embedder: + return self.huggingface_embedder.embed_text_dataset(text_dataset) + + # Wrap in a BertEmbeddingDataset to tokenize samples. + bert_dataset = BertEmbeddingDataset(text_dataset, + self.max_bert_seq_length) + + # Embed. + data_loader = get_data_loader(bert_dataset, self.batch_size) + embeddings = embed_data_loader(self.models, data_loader, tag) + + return embeddings + + def embed_text(self, text): + '''Embed a single text string. + + Primarily used for on-the-fly embeddings, particularly during + analysis or debugging. For large scale, use 'embed_text_dataset()'. + ''' + + # Embed text. + text_ds = TextDataset([ text ]) + embed = self.embed_text_dataset(text_ds)[0] + + return embed + + +class DiskDataParallelBertEmbedder: + '''Process embeddings in blocks & save to disk.''' + + def __init__(self, embedder, block_size): + assert isinstance(embedder, BertEmbedder) + self.embedder = embedder + self.block_size = block_size + + def embed_text_blocks(self, name, dirname, text_dataset, + missing_embedding_blocks): + '''Process a text dataset in blocks.''' + + # Iterate blocks. + for block_index, block_info in enumerate(missing_embedding_blocks): + + # Missing block lists are extended with None to have equal-length + # lists. Skip the Nones. + if block_info is not None: + + # Progress. (*note*: move world progress to here.) + print_rank_0("embed '%s' block %d / %d ... %s." % ( + name, + block_index, + len(missing_embedding_blocks), + block_info["path"], + )) + + # Embed block. + sub_dataset = Subset(text_dataset, range(*block_info["range"])) + embeddings = self.embedder.embed_text_dataset(sub_dataset) + + # Save embeddings. + f = h5py.File(block_info["path"], "w") + f.create_dataset("data", data=embeddings) + f.close() + + # Synchronize progress across all ranks. (for easier observation) + print_rank_0(" > waiting for other ranks to finish block.") + torch.distributed.barrier() + + def embed_text_dataset(self, name, dirname, text_dataset): + '''Embed a text dataset.''' + + # Dataset dir. + os.makedirs(dirname, exist_ok=True) + + # Missing embedding blocks (stored on disk). + def validate(f): + assert f["data"].shape[1] == 1024 + blocks = get_blocks_by_rank( + dirname, + len(text_dataset), + self.block_size, + validate=validate) + + # Prevent missing file race condition. + torch.distributed.barrier() + + # Embed batches. + self.embed_text_blocks(name, dirname, text_dataset, blocks.missing) diff --git a/tools/bert_embedding/external_libs.py b/tools/bert_embedding/external_libs.py new file mode 100644 index 0000000..fb8e69f --- /dev/null +++ b/tools/bert_embedding/external_libs.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import importlib + +required_libs = [ + "h5py", + "transformers", # for huggingface bert +] + +for lib in required_libs: + try: + globals()[lib] = importlib.import_module(lib) + except ImportError as e: + raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. Tried importing '{lib}'.") diff --git a/tools/bert_embedding/huggingface.py b/tools/bert_embedding/huggingface.py new file mode 100644 index 0000000..1a08a80 --- /dev/null +++ b/tools/bert_embedding/huggingface.py @@ -0,0 +1,126 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import numpy as np +import torch +from tqdm import tqdm + +from .external_libs import transformers + + +class IterableTextDataset(torch.utils.data.IterableDataset): + '''Iterable over a text dataset.''' + + def __init__(self, text_dataset): + self.text_dataset = text_dataset + + def __iter__(self): + '''Remove 'endoftext' string.''' + for sample_idx in range(len(self.text_dataset)): + sample = self.text_dataset[sample_idx] + text = sample["text"].replace("<|endoftext|>", "") + yield text + + +class MyFeatureExtractionPipeline(transformers.FeatureExtractionPipeline): + def _forward(self, model_inputs): + + # Embed inputs. + model_outputs = self.model(**model_inputs) + + # Attention mask. + embeddings = model_outputs[0] + masks = torch.sum(model_inputs['attention_mask'], dim=1) + + # Collect embeddings & check for nan. + outputs = [] + for embedding, mask in zip(embeddings, masks): + output = torch.mean(embedding[1: mask - 1], dim=0) + + # Nans due to empty input sequences; so only check first element. + if torch.isnan(output.view(-1)[0]).any(): + output.zero_() + + outputs.append(output) + + # Sample. + data = { + "input" : model_inputs["input_ids"], + "output" : outputs, + } + + return data + + def postprocess(self, model_outputs): + # Return input for analysis. + return { + "input" : model_outputs["input"].numpy(), + "output" : model_outputs["output"].numpy(), + } + + +class HuggingfaceEmbedder: + + def __init__(self, batch_size, max_seq_length): + + # Model, tokenizer. + self.model = transformers.BertModel.from_pretrained("bert-large-cased") + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + "bert-large-cased", model_max_length=max_seq_length) + + # Feature extraction pipeline. + self.pipe = MyFeatureExtractionPipeline( + model=self.model, + tokenizer=self.tokenizer, + device=torch.cuda.current_device(), + truncation=True, + max_length=max_seq_length, + ) + + self.batch_size = batch_size + + def embed_text_dataset(self, text_dataset, verbose=True): + + # Wrap dataset in iterable. + dataset = IterableTextDataset(text_dataset) + + # Allocate output array. + n_samples = len(text_dataset) + embeddings = np.zeros((n_samples, 1024), dtype="f4") + start_idx = 0 + + # Wrap iterator in tqdm for verbose output. + _iter = self.pipe(dataset, batch_size=self.batch_size) + if verbose: + _iter = tqdm(_iter, "hf embed", total=n_samples) + + # Embed dataset. + for idx, out_dict in enumerate(_iter): + inp = out_dict["input"] + out = out_dict["output"] + embeddings[start_idx] = out + start_idx += 1 + + return embeddings + + def embed_text(self, text): + '''Embed a single text string. + + Primarily used for on-the-fly embeddings, particularly during + analysis or debugging. For large scale, use 'embed_text_dataset()'. + ''' + + class SingleTextDataset(torch.utils.data.Dataset): + '''Dataset that holds single string.''' + def __init__(self, text): + assert isinstance(text, str) + self.text = text + def __len__(self): + return 1 + def __getitem__(self, i): + return {"text": self.text} + + # Embed text. + text_ds = SingleTextDataset(text) + embed = self.embed_text_dataset(text_ds, verbose=False)[0] + + return embed diff --git a/tools/checkpoint/convert.py b/tools/checkpoint/convert.py new file mode 100644 index 0000000..935613b --- /dev/null +++ b/tools/checkpoint/convert.py @@ -0,0 +1,154 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import argparse +import importlib +import torch.multiprocessing as mp +import sys + +# A loader is a python file with at least two functions +# - add_arguments - takes in a parser and adds any arguments needed +# - load_checkpoint - takes in the queue and parsed arguments + +# A saver is similar but has save_checkpoint instead of +# load_checkpoint + +# The loader and saver process are each given a queue, the loader +# should load the checkpoint and send the weights in messages in the +# following order, the saver should receive them in this order and +# save the checkpoints. A message consists of a python dictionary with +# a "name" for error checking and an entry for each tensor as +# indicated below. Note that the weight sent over the queue are the +# full model weights, nothing split. + +# If the loader ever sends "exit" to the queue, that means something +# went wrong and it is exiting. + +# - Metadata Namespace with the following attributes: +# model_type - GPT, BERT, T5, etc. (Part of protocol to allow this to be deduced later instead of given on command line) +# num_layers - Number of transformer layers +# hidden_size +# seq_length +# num_attention_heads +# max_position_embeddings +# tokenizer_type +# iteration +# params_dtype +# bert_binary_head - Used only if model_type is BERT +# previous_tensor_parallel_size - Optional +# previous_pipeline_parallel_size - Optional +# true_vocab_size +# make_vocab_size_divisble_by +# consumed_train_samples +# consumed_valid_samples +# messages +# { +# "name": "embeddings" +# "position embeddings" +# "word embeddings" +# } +# (for each transformer layer): +# { +# "name": "transformer layer N" +# "input norm weight" +# "input norm bias" +# "qkv weight" +# "qkv bias" +# "dense weight" +# "dense bias" +# "post norm weight" +# "post norm bias" +# "mlp l0 weight" +# "mlp l0 bias" +# "mlp l1 weight" +# "mlp l1 bias" +# } +# { +# "name": "final layer norm" +# "weight" +# "bias" +# } +# if present (i.e. for BERT): +# { +# "name": "pooler" +# "weight" +# "bias" +# } +# { +# "name": "lm head" +# "dense weight" +# "dense bias" +# "norm weight" +# "norm bias" +# } +# { +# "name": "binary head" +# "weight" +# "bias" +# } +# - "done" + +def load_plugin(plugin_type, name): + module_name = f"{plugin_type}_{name}" + try: + plugin = importlib.import_module(module_name) + except ModuleNotFoundError as e: + print(e) + module_name = name + try: + plugin = importlib.import_module(module_name) + except ModuleNotFoundError as e: + print(e) + sys.exit(f"Unable to load {plugin_type} plugin {name}. Exiting.") + + if not hasattr(plugin, 'add_arguments'): + sys.exit(f"{module_name} module is not a plugin. Exiting.") + + print(f"Loaded {module_name} as the {plugin_type}.") + return plugin + +def main(): + import argparse + parser = argparse.ArgumentParser(description="Megatron Checkpoint Converter Arguments", + allow_abbrev=False, conflict_handler='resolve') + + parser.add_argument('--model-type', type=str, required=True, + choices=['GPT', 'BERT'], + help='Type of the model') + parser.add_argument('--loader', type=str, default='megatron', + help='Module name to load checkpoint, should be on python path') + parser.add_argument('--saver', type=str, default='megatron', + help='Module name to save checkpoint, should be on python path') + parser.add_argument('--load-dir', type=str, required=True, + help='Directory to load model checkpoint from') + parser.add_argument('--save-dir', type=str, required=True, + help='Directory to save model checkpoint to') + parser.add_argument('--max-queue-size', type=int, default=50, + help='Maximum number of tensors in the queue') + parser.add_argument('--no-checking', action='store_false', + help='Do not perform checking on the name and ordering of weights', + dest='checking') + + known_args, _ = parser.parse_known_args() + loader = load_plugin('loader', known_args.loader) + saver = load_plugin('saver', known_args.saver) + + loader.add_arguments(parser) + saver.add_arguments(parser) + + args = parser.parse_args() + + queue = mp.Queue(maxsize=args.max_queue_size) + + print("Starting saver...") + saver_proc = mp.Process(target=saver.save_checkpoint, args=(queue, args)) + saver_proc.start() + + print("Starting loader...") + loader.load_checkpoint(queue, args) + + print("Waiting for saver to complete...") + saver_proc.join() + + +if __name__ == '__main__': + main() diff --git a/tools/checkpoint/hybrid_conversion.py b/tools/checkpoint/hybrid_conversion.py new file mode 100644 index 0000000..737fac6 --- /dev/null +++ b/tools/checkpoint/hybrid_conversion.py @@ -0,0 +1,398 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +# Note (rwaleffe): This is a temporary file for hybrid mamba-transformer model checkpoint conversion. +# This functionality should be integrated with the megatron core checkpoint loader/saver. + + +import copy +import os +import re +import shutil +from collections import OrderedDict + +import torch +import argparse + + +tp_split_dim = { + 'word_embeddings.weight': 0, + 'norm.weight': -1, + 'final_norm.weight': -1, + 'output_layer.weight': 0, + # mamba1/2 + 'A_log': 0, + 'D': 0, + 'dt_bias': 0, + 'in_proj.weight': 0, + 'conv1d.weight': 0, + 'conv1d.bias': 0, + 'x_proj.weight': 1, + 'dt_proj.weight': 0, + 'dt_proj.bias': 0, + 'out_proj.weight': 1, + 'mixer.norm.weight': 0, + # mlp + 'linear_fc1.layer_norm_weight': -1, + 'linear_fc1.weight': 0, + 'linear_fc2.weight': 1, + # attention + 'self_attention.linear_proj.weight': 1, + 'self_attention.linear_qkv.layer_norm_weight': -1, + 'self_attention.linear_qkv.weight': 0, +} + + +def get_split_dim(tensor_name): + # norm.weight will match tensor_name of mixer.norm.weight and norm.weight, need to distinguish + if 'norm.weight' in tensor_name: + if 'mixer.norm.weight' in tensor_name: + return tp_split_dim['mixer.norm.weight'] + else: + return tp_split_dim['norm.weight'] + + for key in tp_split_dim.keys(): + if key in tensor_name: + return tp_split_dim[key] + raise Exception("Unknown tensor name {}".format(tensor_name)) + + +def combine_tp_tensors(params, key, dim, tensors): + tp_size = len(tensors) + + if 'mixer.in_proj.weight' in key and params.mamba_version == 1: + xs = []; zs = [] + for tensor in tensors: + x, z = torch.split(tensor, [params.mamba_d_inner//tp_size, + params.mamba_d_inner//tp_size], dim=dim) + xs.append(x); zs.append(z) + return torch.cat([torch.cat(xs, dim=dim), torch.cat(zs, dim=dim)], dim=dim) + + elif 'mixer.in_proj.weight' in key and params.mamba_version == 2: + xs = []; zs = []; Bs = []; Cs = []; dts = [] + for tensor in tensors: + x, z, B, C, dt = torch.split(tensor, [params.mamba_d_inner // tp_size, + params.mamba_d_inner // tp_size, + (params.mamba2_n_groups // tp_size) * args.mamba_d_state, + (params.mamba2_n_groups // tp_size) * args.mamba_d_state, + params.mamba2_n_heads // tp_size], dim=dim) + xs.append(x); zs.append(z); Bs.append(B); Cs.append(C); dts.append(dt) + + for ii in range(len(Bs)): + Bs[ii] = torch.reshape(Bs[ii], (-1, params.mamba_d_state, Bs[ii].shape[-1])) + Cs[ii] = torch.reshape(Cs[ii], (-1, params.mamba_d_state, Cs[ii].shape[-1])) + B = torch.cat(Bs, dim=dim); C = torch.cat(Cs, dim=dim) + x = torch.cat(xs, dim=dim); z = torch.cat(zs, dim=dim); dt = torch.cat(dts, dim=dim) + + return torch.cat([x, z, B.flatten(0, 1), C.flatten(0, 1), dt], dim=dim) + + elif 'mixer.conv1d' in key and params.mamba_version == 2: + xs = []; Bs = []; Cs = [] + for tensor in tensors: + x, B, C = torch.split(tensor, [params.mamba_d_inner//tp_size, + (params.mamba2_n_groups // tp_size) * params.mamba_d_state, + (params.mamba2_n_groups // tp_size) * params.mamba_d_state], dim=dim) + xs.append(x); Bs.append(B); Cs.append(C) + + for ii in range(len(Bs)): + if 'weight' in key: + Bs[ii] = torch.reshape(Bs[ii], (-1, params.mamba_d_state, Bs[ii].shape[-2], Bs[ii].shape[-1])) + Cs[ii] = torch.reshape(Cs[ii], (-1, params.mamba_d_state, Cs[ii].shape[-2], Cs[ii].shape[-1])) + elif 'bias' in key: + Bs[ii] = torch.reshape(Bs[ii], (-1, params.mamba_d_state)) + Cs[ii] = torch.reshape(Cs[ii], (-1, params.mamba_d_state)) + else: + raise Exception("Unknown key") + B = torch.cat(Bs, dim=dim); C = torch.cat(Cs, dim=dim) + x = torch.cat(xs, dim=dim) + + return torch.cat([x, B.flatten(0, 1), C.flatten(0, 1)], dim=dim) + + else: + return torch.cat(tensors, dim=dim) + + +def split_tensor_for_tp(params, key, dim, tensor): + tp_size = params.target_tp_size + tensor_sliced = [] + + if 'mixer.in_proj.weight' in key and params.mamba_version == 1: + x, z = torch.split(tensor, [params.mamba_d_inner, params.mamba_d_inner], dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + z_sliced = torch.chunk(z, tp_size, dim=dim) + for (x, z) in zip(x_sliced, z_sliced): + tensor_sliced.append(torch.cat((x, z), dim=dim)) + + elif 'mixer.in_proj.weight' in key and params.mamba_version == 2: + x, z, B, C, dt = torch.split(tensor, [params.mamba_d_inner, params.mamba_d_inner, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_heads], dim=dim) + B = torch.reshape(B, (-1, params.mamba_d_state, B.shape[-1])) + C = torch.reshape(C, (-1, params.mamba_d_state, C.shape[-1])) + + B_sliced = torch.chunk(B, tp_size, dim=dim) + C_sliced = torch.chunk(C, tp_size, dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + z_sliced = torch.chunk(z, tp_size, dim=dim) + dt_sliced = torch.chunk(dt, tp_size, dim=dim) + + tensor_sliced = [] + for (x, z, B, C, dt) in zip(x_sliced, z_sliced, B_sliced, C_sliced, dt_sliced): + tensor_sliced.append(torch.cat((x, z, B.flatten(0, 1), C.flatten(0, 1), dt), dim=dim)) + + elif 'mixer.conv1d' in key and params.mamba_version == 2: + x, B, C = torch.split(tensor, [params.mamba_d_inner, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_groups * params.mamba_d_state], dim=dim) + if 'weight' in key: + B = torch.reshape(B, (-1, params.mamba_d_state, B.shape[-2], B.shape[-1])) + C = torch.reshape(C, (-1, params.mamba_d_state, C.shape[-2], C.shape[-1])) + elif 'bias' in key: + B = torch.reshape(B, (-1, params.mamba_d_state)) + C = torch.reshape(C, (-1, params.mamba_d_state)) + else: + raise Exception("Unknown key") + + B_sliced = torch.chunk(B, tp_size, dim=dim) + C_sliced = torch.chunk(C, tp_size, dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + + tensor_sliced = [] + for (x, B, C) in zip(x_sliced, B_sliced, C_sliced): + tensor_sliced.append(torch.cat((x, B.flatten(0, 1), C.flatten(0, 1)), dim=dim)) + + else: + tensor_sliced = torch.chunk(tensor, tp_size, dim=dim) + + return tensor_sliced + + +def finalize_checkpoint(sample_model, model, params, verbose=False): + # make sure the rest of the checkpoint is how we want it from the original (i.e., other than the 'model') + reset_iterations = params.reset_iterations + + # checkpoint 'args' + model['args'] = copy.deepcopy(sample_model['args']) + model['args'].tensor_model_parallel_size = params.target_tp_size + model['args'].pipeline_model_parallel_size = params.target_pp_size + if reset_iterations: + model['args'].iteration = 0 + model['args'].consumed_valid_samples = 0 + model['args'].consumed_train_samples = 0 + model['args'].train_iters = 0 + model['args'].train_samples = 0 + + # checkpoint 'checkpoint_version' + model['checkpoint_version'] = copy.deepcopy(sample_model['checkpoint_version']) + + # checkpoint 'iteration' + model['iteration'] = copy.deepcopy(sample_model['iteration']) + if reset_iterations: + model['iteration'] = 0 + + # checkpoint 'optimizer' + # ignore + + # checkpoint 'opt_param_scheduler' + if 'opt_param_scheduler' in sample_model.keys(): + model['opt_param_scheduler'] = copy.deepcopy(sample_model['opt_param_scheduler']) + + # checkpoint 'rng_state' + model['rng_state'] = copy.deepcopy(sample_model['rng_state']) + + # report on argument difference + if verbose: + original_args = sample_model['args'].__dict__ + final_args = model['args'].__dict__ + for key in original_args: + if key in final_args: + if final_args[key] != original_args[key]: + print("KEY MISMATCH: {}".format(key)) + print("\toriginal: {}\n\tfinal: {}".format(original_args[key], final_args[key])) + else: + print("KEY MISSING from final: {}, value {}".format(key, original_args[key])) + print("") + for key in final_args: + if key not in original_args: + print("KEY ADDED to final: {}, value {}".format(key, final_args[key])) + + return model + + +def main(args): + print("\n====RUNNING CHECKPOINT CONVERSION====\n") + + args.mamba_d_inner = args.d_model * 2 + args.mamba2_n_heads = args.mamba_d_inner // args.mamba2_head_dim + + # get the latest iteration + tracker_filename = os.path.join(args.load_dir, 'latest_checkpointed_iteration.txt') + with open(tracker_filename, 'r') as f: + metastring = f.read().strip() + try: + iteration = int(metastring) + except ValueError: + raise Exception("") + out_iteration = iteration if not args.reset_iterations else 0 + + # get model directory and model parallel ranks + input_model_dir = os.path.join(args.load_dir, 'iter_{:07d}'.format(iteration)) + input_sub_models = os.listdir(input_model_dir) + # input_sub_models = sorted(input_sub_models, key=lambda x: int(re.search(r'\d+', x).group())) + + # load one of the model parallel ranks to get arguments + sample_model_file = os.path.join(input_model_dir, input_sub_models[0], "model_optim_rng.pt") + sample_model = torch.load(sample_model_file) + print(f"Sample model {sample_model_file} is loaded.\n") + + # input tensor and pipeline parallel size + input_tp_rank = sample_model['args'].tensor_model_parallel_size + input_pp_rank = sample_model['args'].pipeline_model_parallel_size + num_layers_per_pipeline_rank = sample_model['args'].num_layers // input_pp_rank + + # construct full model + full_model = OrderedDict() + for pp in range(input_pp_rank): + print("[INFO] Processing input pipeline rank {}".format(pp)) + tp_models = [] + for tp in range(input_tp_rank): + dir_name = "mp_rank_{:02d}".format(tp) + if input_pp_rank > 1: + dir_name += "_{:03d}".format(pp) + model_file = os.path.join(input_model_dir, dir_name, "model_optim_rng.pt") + + tp_models.append(torch.load(model_file)) + print(f"Model {model_file} is loaded.") + + if input_tp_rank > 1: + combined_tp_model = OrderedDict() + for ii, (key, original_tensor) in enumerate(tp_models[0]['model'].items()): + if "_extra_state" in key: + combined_tp_model[key] = original_tensor + continue + + split_dim = get_split_dim(key) + original_shape = list(original_tensor.shape) + combined_shape = copy.deepcopy(original_shape) + combined_shape[split_dim] *= input_tp_rank + # print("{}, {}, {}".format(ii, key, split_dim)) + + if split_dim != -1: + # slice together model + # print("\tshape mismatch: original {}, combined {}".format(original_shape, combined_shape)) + combined_tensor = combine_tp_tensors(args, key, split_dim, + [tp_models[jj]['model'][key].cpu() for jj in range(input_tp_rank)]) + combined_tp_model[key] = combined_tensor + else: + # copy model + combined_tp_model[key] = original_tensor + else: + combined_tp_model = tp_models[0]['model'] + # print("Combined tp model: {}".format(combined_tp_model.keys())) + + for ii, (key, original_tensor) in enumerate(combined_tp_model.items()): + try: + layer_num = int(re.findall(r'\d+', key)[0]) + new_key = key.replace(str(layer_num), str(layer_num + pp*num_layers_per_pipeline_rank), 1) + except: + new_key = key + full_model[new_key] = original_tensor + # print("Combined model: {}".format(full_model.keys())) + print("\n[INFO] Loaded combined model\n") + + # sort by layer + # full_model_sorted = dict(sorted(people.items(), key=lambda item: item[1])) + + # create new split model + pp_offset = 0 + num_layers_per_pipeline_rank = sample_model['args'].num_layers // args.target_pp_size + + for pp in range(args.target_pp_size): + print("[INFO] Processing output pipeline rank {}".format(pp)) + tp_models = [] + for ii in range(args.target_tp_size): + tp_models.append({'model': OrderedDict()}) + + for ii, (key, original_tensor) in enumerate(full_model.items()): + try: + layer_num = int(re.findall(r'\d+', key)[0]) + if layer_num >= num_layers_per_pipeline_rank * (pp+1): + break + new_key = key.replace(str(layer_num), str(layer_num - (pp * num_layers_per_pipeline_rank)), 1) + except: + new_key = key + + if ii < pp_offset: + continue + else: + pp_offset += 1 + + if "_extra_state" in new_key: + # copy + for jj in range(args.target_tp_size): + tp_models[jj]['model'][new_key] = original_tensor + continue + + split_dim = get_split_dim(new_key) + original_shape = list(original_tensor.shape) + v0 = original_shape[split_dim] + split_size = v0 // args.target_tp_size + split_shape = copy.deepcopy(original_shape) + split_shape[split_dim] = split_size + # print("{}, {}, {}".format(ii, new_key, split_dim)) + + if split_dim != -1: + # split model + # print("\tshape mismatch: original {}, combined {}".format(original_shape, split_shape)) + tensor_sliced = split_tensor_for_tp(args, new_key, split_dim, original_tensor) + for jj in range(args.target_tp_size): + tp_models[jj]['model'][new_key] = tensor_sliced[jj] + else: + # copy model + for jj in range(args.target_tp_size): + tp_models[jj]['model'][new_key] = original_tensor + # print(tp_models[0]['model'].keys()) + + for tp in range(args.target_tp_size): + dir_name = "mp_rank_{:02d}".format(tp) + if args.target_pp_size > 1: + dir_name += "_{:03d}".format(pp) + + model = finalize_checkpoint(sample_model, tp_models[tp], args, verbose=False) + + save_dir = os.path.join(args.save_dir, 'iter_{:07d}'.format(out_iteration), dir_name) + os.makedirs(save_dir, exist_ok=True) + model_file = os.path.join(save_dir, "model_optim_rng.pt") + torch.save(model, model_file) + print(f"Model {model_file} is saved.") + + # shutil.copyfile(tracker_filename, os.path.join(args.save_dir, 'latest_checkpointed_iteration.txt')) + tracker_filename = os.path.join(args.save_dir, 'latest_checkpointed_iteration.txt') + with open(tracker_filename, 'w') as f: + f.write(str(out_iteration)) + + +if __name__ == "__main__": + # example run command: + # python hybrid_conversion.py + # --load-dir mamba2-840m-test/checkpoints/ + # --save-dir mamba2-840m-test-conversion/checkpoints/ + # --target-pp-size 1 + # --target-tp-size 1 + + parser = argparse.ArgumentParser() + parser.add_argument('--load-dir', type=str) + parser.add_argument('--save-dir', type=str) + parser.add_argument('--target-tp-size', type=int, default=1) + parser.add_argument('--target-pp-size', type=int, default=1) + parser.add_argument('--reset-iterations', action='store_true') + + parser.add_argument('--d-model', type=int, default=4096) + parser.add_argument('--mamba-version', type=int, default=2) + parser.add_argument('--mamba-d-state', type=int, default=128) + parser.add_argument('--mamba2-n-groups', type=int, default=8) + parser.add_argument('--mamba2-head-dim', type=int, default=64) + + args = parser.parse_args() + + main(args) \ No newline at end of file diff --git a/tools/checkpoint/loader_llama_mistral.py b/tools/checkpoint/loader_llama_mistral.py new file mode 100644 index 0000000..cf88099 --- /dev/null +++ b/tools/checkpoint/loader_llama_mistral.py @@ -0,0 +1,667 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import json +import os +import sys +import torch +try: + import transformers +except ImportError: + raise ImportError("The 'transformers' package is not installed.") +import gc +import shutil +from tqdm import tqdm +import types + + +def add_arguments(parser): + group = parser.add_argument_group(title='Llama/Mistral loader.') + + # TODO(jbarker): Need assertion to make sure *exactly* one of these is used + parser.add_argument('--model-size', type=str, required=True, + choices=['llama2-7B', 'llama2-13B', 'llama2-70B', 'llama2-7Bf', 'llama2-13Bf', 'llama2-70Bf', 'llama3-8B', 'llama3-70B', 'llama3-8Bf', 'llama3-70Bf', 'mistral-7B', 'mistral-7Bf', 'yi-34B'], + help='Model size can be `llama2-7B`, `llama2-13B`, `llama2-70B`, `llama3-8B`, `llama3-70B`, `mistral-7B` (for pretrained models), ' + 'and `llama2-7Bf`, `llama2-13Bf`, `llama2-70Bf`, `llama3-8Bf`, `llama3-70bf` and `mistral-7Bf` (for chat-finetuned models).') + parser.add_argument('--checkpoint-type', type=str, required=True, + help='Type of checkpoint to convert, options are "meta" or "hf"') + parser.add_argument('--bf16', action='store_true', help='Whether to load weights in bf16.') + parser.add_argument('--fp16', action='store_true', help='Whether to load weights in fp16.') + group.add_argument('--true-vocab-size', type=int, default=None, + help='original size of vocab, if specified will trim padding from embedding table.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file. If specified will use this to get vocab size and ' + 'trim padding from the embedding table.') + group.add_argument('--tokenizer-model', required=True, + help='Tokenizer model file.') + group.add_argument('--megatron-path', type=str, default=None, + help='Base directory of Megatron repository') + group.add_argument('--loader-transformer-impl', default='local', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') + + +def verify_transformers_version(): + major, minor, patch = map(int, transformers.__version__.split('.')) + assert major >= 4 and minor >= 31 + + +NUM_SHARDS = { + "llama2-7B": 1, + "llama2-7Bf": 1, + "llama2-13B": 2, + "llama2-13Bf": 2, + "llama2-70B": 8, + "llama2-70Bf": 8, + "llama3-8B": 1, + "llama3-8Bf": 1, + "llama3-70B": 8, + "llama3-70Bf": 8, + "mistral-7B": 1, + "mistral-7Bf": 1, + "yi-34B": 8, +} + + +def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): + return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) + + +def read_json(path): + with open(path, "r") as f: + return json.load(f) + + +def write_json(text, path): + with open(path, "w") as f: + json.dump(text, f) + + +# This conversion is adapted from +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py +def convert_to_hf(model_path, input_base_path, model_size, tokenizer_path): + + if "llama2" in model_size: + from transformers import LlamaConfig as ModelConfig + from transformers import LlamaTokenizer, LlamaTokenizerFast + elif "llama3" in model_size: + from transformers import LlamaConfig as ModelConfig + elif "mistral" in model_size: + from transformers import MistralConfig as ModelConfig + try: + from mistral_common.tokens.tokenizers.mistral import MistralTokenizer + except ImportError: + raise ImportError("Module 'mistral-common' is required but not installed.") + + + # for backward compatibility, before you needed the repo to be called `my_repo/model_size` + if not os.path.isfile(os.path.join(input_base_path, "params.json")): + input_base_path = os.path.join(input_base_path, model_size) + + os.makedirs(model_path, exist_ok=True) + + params = read_json(os.path.join(input_base_path, "params.json")) + num_shards = NUM_SHARDS[model_size] + params = params.get("model", params) + n_layers = params["n_layers"] + n_heads = params["n_heads"] + n_heads_per_shard = n_heads // num_shards + dim = params["dim"] + dims_per_head = dim // n_heads + base = params.get("rope_theta", 10000.0) + inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) + if base > 10000.0: + max_position_embeddings = 32768 if "mistral" in model_size else 16384 + else: + max_position_embeddings = 4096 if "mistral" in model_size else 2048 + + if "llama2" in model_size: + tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast + elif "llama3" in model_size: + try: + from llama.tokenizer import Tokenizer as Llama3Tokenizer + except ImportError: + raise AssertionError("Module 'llama' is required but not installed.") + tokenizer_class = Llama3Tokenizer + elif "mistral" in model_size: + tokenizer_class = MistralTokenizer + else: + raise AttributeError(f"model_size={model_size} not supported") + if tokenizer_path is not None: + if "llama" in model_size: + tokenizer = tokenizer_class(tokenizer_path) + if "llama2" in model_size: + tokenizer.save_pretrained(model_path) + vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000 + elif "mistral" in model_size: + tokenizer = tokenizer_class.from_file(tokenizer_path) + vocab_size = 32768 + else: + raise AttributeError(f"model_size={model_size} is not supported") + + if params.get("n_kv_heads", None) is not None: + num_key_value_heads = params["n_kv_heads"] # for GQA / MQA + num_local_key_value_heads = n_heads_per_shard // num_key_value_heads + key_value_dim = dim // num_key_value_heads + else: # compatibility with other checkpoints + num_key_value_heads = n_heads + num_local_key_value_heads = n_heads_per_shard + key_value_dim = dim + + # permute for sliced rotary + def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): + return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) + + print(f"Fetching all parameters from the checkpoint at {input_base_path}.") + # Load weights + if num_shards == 1: + # Not sharded + # (The sharded implementation would also work, but this is simpler.) + loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") + else: + # Sharded + loaded = [ + torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu") + for i in range(num_shards) + ] + param_count = 0 + index_dict = {"weight_map": {}} + for layer_i in range(n_layers): + filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" + if num_shards == 1: + # Unsharded + q_proj = loaded[f"layers.{layer_i}.attention.wq.weight"] + k_proj = loaded[f"layers.{layer_i}.attention.wk.weight"] + if ("llama2" in model_size) or ("mistral" in model_size): + q_proj = permute(q_proj) + k_proj = permute(k_proj) + state_dict = { + f"model.layers.{layer_i}.self_attn.q_proj.weight": q_proj, + f"model.layers.{layer_i}.self_attn.k_proj.weight": k_proj, + f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], + f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], + f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], + f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], + f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], + f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"], + f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"], + } + else: + # Sharded + # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share + # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is + # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. + + state_dict = { + f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ + f"layers.{layer_i}.attention_norm.weight" + ].clone(), + f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ + f"layers.{layer_i}.ffn_norm.weight" + ].clone(), + } + state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( + torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) + for i in range(num_shards) + ], + dim=0, + ).reshape(dim, dim) + ) + state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( + torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( + num_local_key_value_heads, dims_per_head, dim + ) + for i in range(num_shards) + ], + dim=0, + ).reshape(key_value_dim, dim), + num_key_value_heads, + key_value_dim, + dim, + ) + state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( + num_local_key_value_heads, dims_per_head, dim + ) + for i in range(num_shards) + ], + dim=0, + ).reshape(key_value_dim, dim) + + state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 + ) + state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 + ) + state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 + ) + state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 + ) + + state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(model_path, filename)) + + filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" + if num_shards == 1: + # Unsharded + state_dict = { + "model.embed_tokens.weight": loaded["tok_embeddings.weight"], + "model.norm.weight": loaded["norm.weight"], + "lm_head.weight": loaded["output.weight"], + } + else: + d = 0 if "llama3" in model_size else 1 + state_dict = { + "model.norm.weight": loaded[0]["norm.weight"], + "model.embed_tokens.weight": torch.cat( + [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=d + ), + "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), + } + + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(model_path, filename)) + + # Write configs + index_dict["metadata"] = {"total_size": param_count * 2} + write_json(index_dict, os.path.join(model_path, "pytorch_model.bin.index.json")) + ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 + multiple_of = params["multiple_of"] if "multiple_of" in params else 256 + config = ModelConfig( + hidden_size=dim, + intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), + num_attention_heads=params["n_heads"], + num_hidden_layers=params["n_layers"], + rms_norm_eps=params["norm_eps"], + num_key_value_heads=num_key_value_heads, + vocab_size=vocab_size, + rope_theta=base, + max_position_embeddings=max_position_embeddings, + ) + config.save_pretrained(model_path) + + # Make space so we can load the model properly now. + del state_dict + del loaded + gc.collect() + + return model_path + + +def load_args_from_checkpoint(args): + + # Read Llama args. + model_args_path = os.path.join(args.load, "config.json") + with open(model_args_path) as f: + model_args = json.load(f) + # Update Megatron args. + args.seq_length = 4096 + args.max_position_embeddings = model_args["max_position_embeddings"] + args.hidden_size = model_args["hidden_size"] + args.num_attention_heads = model_args["num_attention_heads"] + args.num_layers = model_args["num_hidden_layers"] + args.global_batch_size = 1024 + args.norm_epsilon = model_args["rms_norm_eps"] + args.iteration = 1 # '0', 'release' don't work + args.add_position_embedding = False + args.use_rotary_position_embeddings = True + args.swiglu = True + args.normalization = "RMSNorm" + args.add_bias_linear = False + args.untie_embeddings_and_output_weights = True + args.vocab_size = model_args["vocab_size"] + args.padded_vocab_size = model_args["vocab_size"] + args.ffn_hidden_size = model_args["intermediate_size"] + + if "num_key_value_heads" in model_args: + args.group_query_attention = True + args.num_query_groups = model_args["num_key_value_heads"] + + +def set_preprocess_state(args, model, hf_model): + '''Set embedding params.''' + model.language_model.embedding.word_embeddings.weight.data.copy_( + hf_model.model.embed_tokens.weight) + + +def set_postprocess_state(args, model, hf_model): + '''Set output layer & norm params.''' + model.language_model.encoder.final_norm.weight.data.copy_(hf_model.model.norm.weight) + model.language_model.output_layer.weight.data.copy_(hf_model.lm_head.weight) + + +def set_attn_state(args, layer, hf_layer): + '''Set self-attention params.''' + + # Get attention layer & state. + attn = layer.self_attention + hf_attn = hf_layer.self_attn + + # Reshape loaded weights. + tp = args.tensor_model_parallel_size + nh = args.num_attention_heads // tp + ng = (args.num_query_groups if args.group_query_attention \ + else args.num_attention_heads) // tp + dim = args.kv_channels + assert nh % ng == 0 + + # Copy weights (re-order dimensions for Megatron). + attn.query_key_value.weight.data.copy_(torch.cat([ + hf_attn.q_proj.weight.reshape((ng, dim*nh//ng, -1)), + hf_attn.k_proj.weight.reshape((ng, dim, -1)), + hf_attn.v_proj.weight.reshape((ng, dim, -1)), + ], dim=1).reshape((-1, args.hidden_size))) + attn.dense.weight.data.copy_(hf_attn.o_proj.weight) + + +def set_mlp_state(args, layer, hf_layer): + '''Set MLP params.''' + + mlp = layer.mlp + hf_mlp = hf_layer.mlp + + mlp.dense_h_to_4h.weight.data.copy_(torch.cat([ + hf_mlp.gate_proj.weight, + hf_mlp.up_proj.weight, + ], dim=0)) + mlp.dense_4h_to_h.weight.data.copy_(hf_mlp.down_proj.weight) + + +def set_layer_state(args, model, hf_model, layer_idx): + '''Set transformer layer params.''' + + layer = model.language_model.encoder.layers[layer_idx] + hf_layer = hf_model.model.layers[layer_idx] + + set_attn_state(args, layer, hf_layer) + set_mlp_state(args, layer, hf_layer) + layer.input_norm.weight.data.copy_(hf_layer.input_layernorm.weight) + layer.post_attention_norm.weight.data.copy_(hf_layer.post_attention_layernorm.weight) + + +def load_checkpoint_to_model(args): + '''Set model params.''' + + from pretrain_gpt import model_provider + if "llama" in args.model_size or "yi" in args.model_size: + from transformers import LlamaForCausalLM as ModelForCausalLM + elif "mistral" in args.model_size: + from transformers import MistralForCausalLM as ModelForCausalLM + else: + raise AttributeError(f"args.model_size={args.model_size} not supported") + + # Load Huggingface model. + hf_model = ModelForCausalLM.from_pretrained(args.load, torch_dtype=args.params_dtype, low_cpu_mem_usage=True, device_map="cpu") + + # Init Megatron model. + model = model_provider(True, True).to(args.params_dtype) + + # Set model state. + set_preprocess_state(args, model, hf_model) + set_postprocess_state(args, model, hf_model) + for layer_idx in tqdm(range(args.num_layers), "set layer states"): + set_layer_state(args, model, hf_model, layer_idx) + + return model + + +def _load_checkpoint(queue, args): + + verify_transformers_version() + + # Search in directory above this. + sys.path.append(os.path.abspath( + os.path.join(os.path.dirname(__file__), + os.path.pardir, + os.path.pardir))) + if args.megatron_path is not None: + sys.path.insert(0, args.megatron_path) + + # Convert Meta checkpoint to HF format as an intermediate step + if args.checkpoint_type == "meta": + model_tmp_path = convert_to_hf(model_path=os.path.join(args.save_dir, 'tmp'), input_base_path=args.load_dir, model_size=args.model_size, tokenizer_path=args.tokenizer_model) + args.load_dir = model_tmp_path + + try: + from megatron.training.arguments import parse_args, validate_args + from megatron.training.global_vars import set_args, set_global_variables + from megatron.legacy.model import module + from megatron.core import mpu + from megatron.core.enums import ModelType + from megatron.legacy import fused_kernels + except ModuleNotFoundError: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") + queue.put("exit") + exit(1) + + # We want all arguments to come from us. + sys.argv = ['script.py', + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--mock-data', # To pass the "blend data checks" in arguments.py + '--no-initialization', + '--load', args.load_dir + ] + + margs = parse_args() + margs.tokenizer_model = args.tokenizer_model + load_args_from_checkpoint(margs) + + if "llama2" in args.model_size or "yi" in args.model_size: + margs.tokenizer_type = "Llama2Tokenizer" + elif "llama3" in args.model_size: + margs.tokenizer_type = "Llama3Tokenizer" + elif "mistral" in args.model_size: + margs.tokenizer_type = "MistralTokenizer" + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes. + margs.world_size = margs.tensor_model_parallel_size * margs.pipeline_model_parallel_size + + margs = validate_args(margs) + + margs.use_legacy_models = True + margs.transformer_impl = args.loader_transformer_impl + + def check_for_arg(arg_name, default=None): + if getattr(margs, arg_name, None) is None: + if default is not None: + setattr(margs, arg_name, default) + else: + print(f"Checkpoint does not specify the argument {arg_name}. Exiting.") + print(f"Arguments: {margs}") + queue.put("exit") + exit(1) + + check_for_arg('tensor_model_parallel_size') + check_for_arg('pipeline_model_parallel_size') + check_for_arg('num_layers') + check_for_arg('hidden_size') + check_for_arg('seq_length') + check_for_arg('num_attention_heads') + check_for_arg('max_position_embeddings') + check_for_arg('position_embedding_type') + check_for_arg('iteration') + check_for_arg('bert_binary_head') + check_for_arg('disable_bias_linear', False) + check_for_arg('params_dtype') + check_for_arg('swiglu', False) + + # Determine how to make our models. + assert args.model_type == 'GPT', 'Llama-2, Llama-3 and Mistral are GPT models.' + margs.model_type = ModelType.encoder_or_decoder + margs.params_dtype = torch.bfloat16 if args.bf16 else torch.float16 if args.fp16 else torch.float32 + + # Suppress warning about torch.distributed not being initialized. + module.MegatronModule.embedding_warning_printed = True + + set_global_variables(margs, build_tokenizer=False) + mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) + mpu.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size) + mpu.set_virtual_pipeline_model_parallel_world_size(margs.virtual_pipeline_model_parallel_size) + fused_kernels.load(margs) + + # Short aliases. + tp_size = margs.tensor_model_parallel_size + pp_size = margs.pipeline_model_parallel_size + vp_size = margs.virtual_pipeline_model_parallel_size + if vp_size is None: + vp_size = 1 + + # Metadata. + md = types.SimpleNamespace() + md.model_type = args.model_type + md.num_layers = margs.num_layers + md.hidden_size = margs.hidden_size + md.seq_length = margs.seq_length + md.num_attention_heads = margs.num_attention_heads + md.max_position_embeddings = margs.max_position_embeddings + md.tokenizer_type = margs.tokenizer_type + md.iteration = margs.iteration + md.params_dtype = margs.params_dtype + md.bert_binary_head = margs.bert_binary_head + md.output_layer = margs.untie_embeddings_and_output_weights + md.position_embedding_type = margs.position_embedding_type + md.linear_bias = margs.add_bias_linear + md.norm_has_bias = False + md.swiglu = margs.swiglu + md.previous_tensor_parallel_size = margs.tensor_model_parallel_size + md.previous_pipeline_parallel_size = margs.pipeline_model_parallel_size + md.make_vocab_size_divisible_by = None + md.checkpoint_args = margs + md.consumed_train_samples = 0 + md.consumed_valid_samples = 0 + + margs.model_size = args.model_size + + # Get true (non-padded) vocab size + if margs.tokenizer_model is not None and "llama3" in args.model_size: + try: + from llama.tokenizer import Tokenizer as Llama3Tokenizer + except ImportError: + raise AssertionError("Module 'llama' is required but not installed.") + tokenizer = Llama3Tokenizer(margs.tokenizer_model) + md.true_vocab_size = tokenizer.vocab_size + else: + md.true_vocab_size = None + + # Get first pipe stage. + mpu.set_tensor_model_parallel_rank(0) + mpu.set_pipeline_model_parallel_rank(0) + model = load_checkpoint_to_model(margs) + + queue.put(md) + + def queue_put(name, msg): + print(f"sending {name}") + msg["name"] = name + queue.put(msg) + + # Send embeddings. + message = { + "word embeddings": model.language_model.embedding.word_embeddings.weight.data + } + if md.position_embedding_type == 'learned_absolute': + message["position embeddings"] = model.language_model.embedding.position_embeddings.weight.data + else: + assert not hasattr(model.language_model.embedding, 'position_embeddings') + + queue_put("embeddings", message) + + for layer_num in range(margs.num_layers): + message = {} + + # Get non-parallel tensors from tp_rank 0. + layer = model.language_model.encoder.layers[layer_num] + message["input norm weight"] = layer.input_norm.weight.data + message["post norm weight"] = layer.post_attention_norm.weight.data + if md.linear_bias: + message["dense bias"] = layer.self_attention.dense.bias.data + message["mlp l1 bias"] = layer.mlp.dense_4h_to_h.bias.data + + # Grab all parallel tensors for this layer. + qkv_weight = [] + qkv_bias = [] + dense_weight = [] + mlp_l0_weight = [] + mlp_l0_bias = [] + mlp_l1_weight = [] + layer = model.language_model.encoder.layers[layer_num] + qkv_weight.append(layer.self_attention.query_key_value.weight.data) + dense_weight.append(layer.self_attention.dense.weight.data) + mlp_l0_weight.append(layer.mlp.dense_h_to_4h.weight.data) + mlp_l1_weight.append(layer.mlp.dense_4h_to_h.weight.data) + if md.linear_bias: + qkv_bias.append(layer.self_attention.query_key_value.bias.data) + mlp_l0_bias.append(layer.mlp.dense_h_to_4h.bias.data) + + # Handle gated linear units. + if md.swiglu: + # Concat all the first halves ('W's) and all the second halves ('V's). + for tp_rank in range(tp_size): + mlp_l0_weight[tp_rank] = torch.chunk(mlp_l0_weight[tp_rank], 2, dim=0) + message["mlp l0 weight W"] = torch.cat([w[0] for w in mlp_l0_weight], dim=0) + message["mlp l0 weight V"] = torch.cat([w[1] for w in mlp_l0_weight], dim=0) + else: + message["mlp l0 weight"] = torch.cat(mlp_l0_weight, dim=0) + + # Simple concat of the rest. + message["qkv weight"] = torch.cat(qkv_weight, dim=0) + message["dense weight"] = torch.cat(dense_weight, dim=1) + message["mlp l1 weight"] = torch.cat(mlp_l1_weight, dim=1) + if md.linear_bias: + message["qkv bias"] = torch.cat(qkv_bias, dim=0) + if md.swiglu: + for tp_rank in range(tp_size): + mlp_l0_bias[tp_rank] = torch.chunk(mlp_l0_bias[tp_rank], 2, dim=0) + message["mlp l0 bias W"] = torch.cat([b[0] for b in mlp_l0_bias],dim=0) + message["mlp l0 bias V"] = torch.cat([b[1] for b in mlp_l0_bias],dim=0) + else: + message["mlp l0 bias"] = torch.cat(mlp_l0_bias, dim=0) + + queue_put(f"transformer layer {layer_num}", message) + + # Send final norm from tp_rank 0. + message = { + "weight": model.language_model.encoder.final_norm.weight.data, + } + queue_put("final norm", message) + + if md.output_layer: + message = { + "weight": model.language_model.output_layer.weight.data + } + queue_put("output layer", message) + + queue.put("done") + + if args.checkpoint_type == "meta": + shutil.rmtree(os.path.join(args.save_dir, 'tmp')) + + +def load_checkpoint(queue, args): + try: + _load_checkpoint(queue, args) + except: + queue.put("exit") + raise diff --git a/tools/checkpoint/loader_mcore.py b/tools/checkpoint/loader_mcore.py new file mode 100644 index 0000000..42d0a17 --- /dev/null +++ b/tools/checkpoint/loader_mcore.py @@ -0,0 +1,383 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import json +import os +import sys +import torch +import types + +from utils import get_mcore_transformer_block_key, print_memory_usage + + +def add_arguments(parser): + group = parser.add_argument_group(title='Megatron loader') + + group.add_argument('--true-vocab-size', type=int, default=None, + help='original size of vocab, if specified will trim padding from embedding table.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file. If specified will use this to get vocab size and ' + 'trim padding from the embedding table.') + group.add_argument('--megatron-path', type=str, default=None, + help='Base directory of Megatron repository') + group.add_argument('--position-embedding-type', + type=str, + default='learned_absolute', + choices=['learned_absolute', 'rope'], + help='Position embedding type.') + group.add_argument('--loader-transformer-impl', default='transformer_engine', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') + + +def _load_checkpoint(queue, args): + + # Search in directory above this + sys.path.append(os.path.abspath( + os.path.join(os.path.dirname(__file__), + os.path.pardir))) + if args.megatron_path is not None: + sys.path.insert(0, args.megatron_path) + + try: + from megatron.training.arguments import parse_args, validate_args + from megatron.training.global_vars import set_args, set_global_variables + from megatron.training.checkpointing import load_args_from_checkpoint, load_checkpoint + from megatron.legacy.model import module + from megatron.core import mpu + from megatron.core.enums import ModelType + from megatron.legacy import fused_kernels + except ModuleNotFoundError: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") + queue.put("exit") + exit(1) + + # We want all arguments to come from us + sys.argv = ['script.py', + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--no-initialization', + '--mock-data', # To pass the "blend data checks" in arguments.py + '--load', args.load_dir, + '--position-embedding-type', args.position_embedding_type, + ] + + margs = parse_args() + margs, checkpoint_args = load_args_from_checkpoint(margs, exit_on_missing_checkpoint=True) + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes + margs.world_size = margs.tensor_model_parallel_size * margs.pipeline_model_parallel_size + + # Explicitly copy data types from checkpoint. + margs.fp16 = checkpoint_args.fp16 + margs.bf16 = checkpoint_args.bf16 + + # Validate margs. + margs = validate_args(margs) + + margs.use_legacy_models = False + margs.transformer_impl = args.loader_transformer_impl + + def check_for_arg(arg_name, default=None): + if getattr(margs, arg_name, None) is None: + if default is not None: + setattr(margs, arg_name, default) + else: + print(f"Checkpoint does not specify the argument {arg_name}. Exiting.") + print(f"Arguments: {margs}") + queue.put("exit") + exit(1) + + check_for_arg('tensor_model_parallel_size') + check_for_arg('pipeline_model_parallel_size') + check_for_arg('num_layers') + check_for_arg('hidden_size') + check_for_arg('seq_length') + check_for_arg('num_attention_heads') + check_for_arg('max_position_embeddings') + check_for_arg('position_embedding_type') + check_for_arg('tokenizer_type') + check_for_arg('iteration') + check_for_arg('bert_binary_head') + check_for_arg('disable_bias_linear', False) + check_for_arg('params_dtype') + check_for_arg('swiglu', False) + + # Determine how to make our models + if args.model_type == 'GPT': + from pretrain_gpt import model_provider + margs.model_type = ModelType.encoder_or_decoder + elif args.model_type == 'BERT': + from pretrain_bert import model_provider + margs.model_type = ModelType.encoder_or_decoder + else: + raise Exception(f'unrecognized model type: {args.model_type}') + + # supress warning about torch.distributed not being initialized + module.MegatronModule.embedding_warning_printed = True + + consumed_train_samples = None + consumed_valid_samples = None + def get_models(count, dtype): + nonlocal consumed_train_samples + nonlocal consumed_valid_samples + model_array_len = margs.virtual_pipeline_model_parallel_size + if model_array_len is None: + model_array_len = 1 + models = [[] for _ in range(model_array_len)] + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + for rank in range(count): + mpu.set_tensor_model_parallel_rank(rank) + if margs.virtual_pipeline_model_parallel_size is not None: + model_ = [] + for i in range(margs.virtual_pipeline_model_parallel_size): + mpu.set_virtual_pipeline_model_parallel_rank(i) + # Set pre_process and post_process only after virtual rank is set. + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + this_model = model_provider( + pre_process=pre_process, + post_process=post_process + ).to(dtype) + model_.append(this_model) + else: + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + model_rank = 0 + model_ = [model_provider(pre_process, post_process).to(dtype)] + margs.consumed_train_samples = 0 + margs.consumed_valid_samples = 0 + margs.exit_on_missing_checkpoint = True + load_checkpoint(model_, None, None) + + if consumed_train_samples is not None: + assert(margs.consumed_train_samples == consumed_train_samples) + else: + consumed_train_samples = margs.consumed_train_samples + if consumed_valid_samples is not None: + assert(margs.consumed_valid_samples == consumed_valid_samples) + else: + consumed_valid_samples = margs.consumed_valid_samples + for vp_rank in range(model_array_len): + models[vp_rank].append(model_[vp_rank]) + + # Print memory usage. + print_memory_usage("loader", rank, count) + + return models + + set_global_variables(margs, build_tokenizer=False) + mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) + mpu.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size) + mpu.set_virtual_pipeline_model_parallel_world_size(margs.virtual_pipeline_model_parallel_size) + fused_kernels.load(margs) + + # Get true (non-padded) vocab size + if args.true_vocab_size is not None: + true_vocab_size = args.true_vocab_size + elif args.vocab_file is not None: + vocab = json.load(open(args.vocab_file)) + true_vocab_size = len(vocab) + if args.true_vocab_size is not None and true_vocab_size != args.true_vocab_size: + print("Both --true-vocab-size and --vocab-file specified and the vocab size does not match, aborting.") + queue.put("exit") + exit(1) + else: + true_vocab_size = None + + # short aliases + tp_size = margs.tensor_model_parallel_size + pp_size = margs.pipeline_model_parallel_size + vp_size = margs.virtual_pipeline_model_parallel_size + if vp_size is None: + vp_size = 1 + + # Layernorm has bias; RMSNorm does not. + if hasattr(checkpoint_args, 'normalization'): + norm_has_bias = checkpoint_args.normalization == "LayerNorm" + else: + # older models only supported LayerNorm + norm_has_bias = True + + # metadata + md = types.SimpleNamespace() + md.model_type = args.model_type + md.num_layers = margs.num_layers + md.hidden_size = margs.hidden_size + md.seq_length = margs.seq_length + md.num_attention_heads = margs.num_attention_heads + md.max_position_embeddings = margs.max_position_embeddings + md.tokenizer_type = margs.tokenizer_type + md.iteration = margs.iteration + md.params_dtype = margs.params_dtype + md.bert_binary_head = margs.bert_binary_head + md.output_layer = margs.untie_embeddings_and_output_weights + md.position_embedding_type = margs.position_embedding_type + md.linear_bias = margs.add_bias_linear + md.norm_has_bias = norm_has_bias + md.swiglu = margs.swiglu + md.previous_tensor_parallel_size = margs.tensor_model_parallel_size + md.previous_pipeline_parallel_size = margs.pipeline_model_parallel_size + md.true_vocab_size = true_vocab_size + md.make_vocab_size_divisible_by = margs.make_vocab_size_divisible_by + md.checkpoint_args = checkpoint_args + md.use_legacy_models = margs.use_legacy_models + + # Get transformer block (named either 'encoder' or 'decoder'). + transformer_block_key = get_mcore_transformer_block_key(md.model_type) + def get_transformer_block(_model): + return getattr(_model, transformer_block_key) + + # Get first pipe stage + mpu.set_pipeline_model_parallel_rank(0) + all_models = [get_models(tp_size, md.params_dtype)] + models = all_models[0][0] + + md.consumed_train_samples = consumed_train_samples + md.consumed_valid_samples = consumed_valid_samples + queue.put(md) + + def queue_put(name, msg): + print(f"sending {name}") + msg["name"] = name + queue.put(msg) + + # Send embeddings + message = { + "word embeddings": torch.cat( + [models[tp_rank].embedding.word_embeddings.weight.data for tp_rank in range(tp_size)], + dim = 0) + } + if md.position_embedding_type == 'learned_absolute': + message["position embeddings"] = models[0].embedding.position_embeddings.weight.data + else: + assert not hasattr(models[0].embedding, 'position_embeddings') + + queue_put("embeddings", message) + + total_layer_num = 0 + for vp_rank in range(vp_size): + mpu.set_virtual_pipeline_model_parallel_rank(vp_rank) + for pp_rank in range(pp_size): + if pp_rank > 0: + mpu.set_pipeline_model_parallel_rank(pp_rank) + if vp_rank == 0: + all_models.append(get_models(tp_size, md.params_dtype)) + models = all_models[pp_rank][vp_rank] + for layer_num in range(len(get_transformer_block(models[0]).layers)): + message = {} + + # Get non-parallel tensors from tp_rank 0 + layer = get_transformer_block(models[0]).layers[layer_num] + message["input norm weight"] = layer.self_attention.linear_qkv.layer_norm_weight.data + if norm_has_bias: + message["input norm bias"] = layer.self_attention.linear_qkv.layer_norm_bias.data + message["post norm weight"] = layer.mlp.linear_fc1.layer_norm_weight.data + if norm_has_bias: + message["post norm bias"] = layer.mlp.linear_fc1.layer_norm_bias.data + if md.linear_bias: + message["dense bias"] = layer.self_attention.linear_proj.bias.data + message["mlp l1 bias"] = layer.mlp.linear_fc2.bias.data + + # Grab all parallel tensors for this layer + qkv_weight = [] + qkv_bias = [] + dense_weight = [] + mlp_l0_weight = [] + mlp_l0_bias = [] + mlp_l1_weight = [] + for tp_rank, model in enumerate(models): + layer = get_transformer_block(model).layers[layer_num] + qkv_weight.append(layer.self_attention.linear_qkv.weight.data) + dense_weight.append(layer.self_attention.linear_proj.weight.data) + mlp_l0_weight.append(layer.mlp.linear_fc1.weight.data) + mlp_l1_weight.append(layer.mlp.linear_fc2.weight.data) + if md.linear_bias: + qkv_bias.append(layer.self_attention.linear_qkv.bias.data) + mlp_l0_bias.append(layer.mlp.linear_fc1.bias.data) + + # Handle gated linear units + if md.swiglu: + # concat all the first halves ('W's) and all the second halves ('V's) + for tp_rank in range(tp_size): + mlp_l0_weight[tp_rank] = torch.chunk(mlp_l0_weight[tp_rank], 2, dim=0) + message["mlp l0 weight W"] = torch.cat([w[0] for w in mlp_l0_weight], dim=0) + message["mlp l0 weight V"] = torch.cat([w[1] for w in mlp_l0_weight], dim=0) + else: + message["mlp l0 weight"] = torch.cat(mlp_l0_weight, dim=0) + + # simple concat of the rest + message["qkv weight"] = torch.cat(qkv_weight, dim=0) + message["dense weight"] = torch.cat(dense_weight, dim=1) + message["mlp l1 weight"] = torch.cat(mlp_l1_weight, dim=1) + if md.linear_bias: + message["qkv bias"] = torch.cat(qkv_bias, dim=0) + if md.swiglu: + for tp_rank in range(tp_size): + mlp_l0_bias[tp_rank] = torch.chunk(mlp_l0_bias[tp_rank], 2, dim=0) + message["mlp l0 bias W"] = torch.cat([b[0] for b in mlp_l0_bias],dim=0) + message["mlp l0 bias V"] = torch.cat([b[1] for b in mlp_l0_bias],dim=0) + else: + message["mlp l0 bias"] = torch.cat(mlp_l0_bias, dim=0) + + queue_put(f"transformer layer {total_layer_num}", message) + + total_layer_num = total_layer_num + 1 + + # Send final norm from tp_rank 0 + message = { + "weight": get_transformer_block(models[0]).final_layernorm.weight.data, + } + if norm_has_bias: + message["bias"] = get_transformer_block(models[0]).final_layernorm.bias.data + queue_put("final norm", message) + + if md.output_layer: + message = { + "weight": torch.cat( + [models[tp_rank].output_layer.weight.data for tp_rank in range(tp_size)], + dim = 0) + } + queue_put("output layer", message) + + + # Send BERT lm head and binary head if it exists + if md.model_type == 'BERT': + message = { + "weight": models[0].pooler.dense.weight.data, + "bias": models[0].pooler.dense.bias.data + } + queue_put("pooler", message) + + message = { + "dense weight": models[0].lm_head.dense.weight.data, + "dense bias": models[0].lm_head.dense.bias.data, + "norm weight": models[0].lm_head.layer_norm.weight.data, + } + if norm_has_bias: + message["norm bias"] = models[0].lm_head.layer_norm.bias.data + queue_put("lm head", message) + + if md.bert_binary_head: + message = { + "weight": models[0].binary_head.weight.data, + "bias": models[0].binary_head.bias.data + } + queue_put("binary head", message) + queue.put("done") + +def load_checkpoint(queue, args): + try: + _load_checkpoint(queue, args) + except: + queue.put("exit") + raise diff --git a/tools/checkpoint/loader_megatron.py b/tools/checkpoint/loader_megatron.py new file mode 100644 index 0000000..e6a465b --- /dev/null +++ b/tools/checkpoint/loader_megatron.py @@ -0,0 +1,371 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import json +import os +import sys +import types + +import torch + + +def add_arguments(parser): + group = parser.add_argument_group(title='Megatron loader') + + group.add_argument('--true-vocab-size', type=int, default=None, + help='original size of vocab, if specified will trim padding from embedding table.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file. If specified will use this to get vocab size and ' + 'trim padding from the embedding table.') + group.add_argument('--megatron-path', type=str, default=None, + help='Base directory of Megatron repository') + group.add_argument('--position-embedding-type', + type=str, + default='learned_absolute', + choices=['learned_absolute', 'rope'], + help='Position embedding type.') + group.add_argument('--loader-transformer-impl', default='local', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') + +def _load_checkpoint(queue, args): + + # Search in directory above this + sys.path.append(os.path.abspath( + os.path.join(os.path.dirname(__file__), + os.path.pardir))) + if args.megatron_path is not None: + sys.path.insert(0, args.megatron_path) + + try: + from megatron.training.arguments import parse_args, validate_args + from megatron.training.global_vars import set_args, set_global_variables + from megatron.training.checkpointing import load_args_from_checkpoint, load_checkpoint + from megatron.legacy.model import module + from megatron.core import mpu + from megatron.core.enums import ModelType + from megatron.legacy import fused_kernels + except ModuleNotFoundError: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") + queue.put("exit") + exit(1) + + # We want all arguments to come from us + sys.argv = ['script.py', + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--mock-data', # To pass the "blend data checks" in arguments.py + '--no-initialization', + '--load', args.load_dir, + '--position-embedding-type', args.position_embedding_type, + ] + + margs = parse_args() + margs, checkpoint_args = load_args_from_checkpoint(margs, exit_on_missing_checkpoint=True) + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes + margs.world_size = margs.tensor_model_parallel_size * margs.pipeline_model_parallel_size + + # Explicitly copy data types from checkpoint. + margs.fp16 = checkpoint_args.fp16 + margs.bf16 = checkpoint_args.bf16 + + # Validate margs. + margs = validate_args(margs) + + margs.use_legacy_models = True + margs.transformer_impl = args.loader_transformer_impl + + def check_for_arg(arg_name, default=None): + if getattr(margs, arg_name, None) is None: + if default is not None: + setattr(margs, arg_name, default) + else: + print(f"Checkpoint does not specify the argument {arg_name}. Exiting.") + print(f"Arguments: {margs}") + queue.put("exit") + exit(1) + + check_for_arg('tensor_model_parallel_size') + check_for_arg('pipeline_model_parallel_size') + check_for_arg('num_layers') + check_for_arg('hidden_size') + check_for_arg('seq_length') + check_for_arg('num_attention_heads') + check_for_arg('max_position_embeddings') + check_for_arg('position_embedding_type') + check_for_arg('tokenizer_type') + check_for_arg('iteration') + check_for_arg('bert_binary_head') + check_for_arg('disable_bias_linear', False) + check_for_arg('params_dtype') + check_for_arg('swiglu', False) + + # Determine how to make our models + if args.model_type == 'GPT': + from pretrain_gpt import model_provider + margs.model_type = ModelType.encoder_or_decoder + elif args.model_type == 'BERT': + from pretrain_bert import model_provider + margs.model_type = ModelType.encoder_or_decoder + else: + raise Exception(f'unrecognized model type: {args.model_type}') + + # supress warning about torch.distributed not being initialized + module.MegatronModule.embedding_warning_printed = True + + consumed_train_samples = None + consumed_valid_samples = None + def get_models(count, dtype): + nonlocal consumed_train_samples + nonlocal consumed_valid_samples + model_array_len = margs.virtual_pipeline_model_parallel_size + if model_array_len is None: + model_array_len = 1 + models = [[] for _ in range(model_array_len)] + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + for rank in range(count): + mpu.set_tensor_model_parallel_rank(rank) + if margs.virtual_pipeline_model_parallel_size is not None: + model_ = [] + for i in range(margs.virtual_pipeline_model_parallel_size): + mpu.set_virtual_pipeline_model_parallel_rank(i) + # Set pre_process and post_process only after virtual rank is set. + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + this_model = model_provider( + pre_process=pre_process, + post_process=post_process + ).to(dtype) + model_.append(this_model) + else: + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + model_rank = 0 + model_ = [model_provider(pre_process, post_process).to(dtype)] + margs.consumed_train_samples = 0 + margs.consumed_valid_samples = 0 + margs.exit_on_missing_checkpoint = True + load_checkpoint(model_, None, None) + + if consumed_train_samples is not None: + assert(margs.consumed_train_samples == consumed_train_samples) + else: + consumed_train_samples = margs.consumed_train_samples + if consumed_valid_samples is not None: + assert(margs.consumed_valid_samples == consumed_valid_samples) + else: + consumed_valid_samples = margs.consumed_valid_samples + for vp_rank in range(model_array_len): + models[vp_rank].append(model_[vp_rank]) + return models + + set_global_variables(margs, build_tokenizer=False) + mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) + mpu.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size) + mpu.set_virtual_pipeline_model_parallel_world_size(margs.virtual_pipeline_model_parallel_size) + fused_kernels.load(margs) + + # Get true (non-padded) vocab size + if args.true_vocab_size is not None: + true_vocab_size = args.true_vocab_size + elif args.vocab_file is not None: + vocab = json.load(open(args.vocab_file)) + true_vocab_size = len(vocab) + if args.true_vocab_size is not None and true_vocab_size != args.true_vocab_size: + print("Both --true-vocab-size and --vocab-file specified and the vocab size does not match, aborting.") + queue.put("exit") + exit(1) + else: + true_vocab_size = None + + # short aliases + tp_size = margs.tensor_model_parallel_size + pp_size = margs.pipeline_model_parallel_size + vp_size = margs.virtual_pipeline_model_parallel_size + if vp_size is None: + vp_size = 1 + + # Layernorm has bias; RMSNorm does not. + if hasattr(checkpoint_args, 'normalization'): + norm_has_bias = checkpoint_args.normalization == "LayerNorm" + else: + # older models only supported LayerNorm + norm_has_bias = True + + # metadata + md = types.SimpleNamespace() + md.model_type = args.model_type + md.num_layers = margs.num_layers + md.hidden_size = margs.hidden_size + md.seq_length = margs.seq_length + md.num_attention_heads = margs.num_attention_heads + md.max_position_embeddings = margs.max_position_embeddings + md.tokenizer_type = margs.tokenizer_type + md.iteration = margs.iteration + md.params_dtype = margs.params_dtype + md.bert_binary_head = margs.bert_binary_head + md.output_layer = margs.untie_embeddings_and_output_weights + md.position_embedding_type = margs.position_embedding_type + md.linear_bias = margs.add_bias_linear + md.norm_has_bias = norm_has_bias + md.swiglu = margs.swiglu + md.previous_tensor_parallel_size = margs.tensor_model_parallel_size + md.previous_pipeline_parallel_size = margs.pipeline_model_parallel_size + md.true_vocab_size = true_vocab_size + md.make_vocab_size_divisible_by = margs.make_vocab_size_divisible_by + md.checkpoint_args = checkpoint_args + + # Get first pipe stage + mpu.set_pipeline_model_parallel_rank(0) + all_models = [get_models(tp_size, md.params_dtype)] + models = all_models[0][0] + + md.consumed_train_samples = consumed_train_samples + md.consumed_valid_samples = consumed_valid_samples + queue.put(md) + + def queue_put(name, msg): + print(f"sending {name}") + msg["name"] = name + queue.put(msg) + + # Send embeddings + message = { + "word embeddings": torch.cat( + [models[tp_rank].language_model.embedding.word_embeddings.weight.data for tp_rank in range(tp_size)], + dim = 0) + } + if md.position_embedding_type == 'learned_absolute': + message["position embeddings"] = models[0].language_model.embedding.position_embeddings.weight.data + else: + assert not hasattr(models[0].language_model.embedding, 'position_embeddings') + + queue_put("embeddings", message) + + total_layer_num = 0 + for vp_rank in range(vp_size): + mpu.set_virtual_pipeline_model_parallel_rank(vp_rank) + for pp_rank in range(pp_size): + if pp_rank > 0: + mpu.set_pipeline_model_parallel_rank(pp_rank) + if vp_rank == 0: + all_models.append(get_models(tp_size, md.params_dtype)) + models = all_models[pp_rank][vp_rank] + for layer_num in range(len(models[0].language_model.encoder.layers)): + message = {} + + # Get non-parallel tensors from tp_rank 0 + layer = models[0].language_model.encoder.layers[layer_num] + message["input norm weight"] = layer.input_norm.weight.data + if norm_has_bias: + message["input norm bias"] = layer.input_norm.bias.data + message["post norm weight"] = layer.post_attention_norm.weight.data + if norm_has_bias: + message["post norm bias"] = layer.post_attention_norm.bias.data + if md.linear_bias: + message["dense bias"] = layer.self_attention.dense.bias.data + message["mlp l1 bias"] = layer.mlp.dense_4h_to_h.bias.data + + # Grab all parallel tensors for this layer + qkv_weight = [] + qkv_bias = [] + dense_weight = [] + mlp_l0_weight = [] + mlp_l0_bias = [] + mlp_l1_weight = [] + for tp_rank, model in enumerate(models): + layer = model.language_model.encoder.layers[layer_num] + qkv_weight.append(layer.self_attention.query_key_value.weight.data) + dense_weight.append(layer.self_attention.dense.weight.data) + mlp_l0_weight.append(layer.mlp.dense_h_to_4h.weight.data) + mlp_l1_weight.append(layer.mlp.dense_4h_to_h.weight.data) + if md.linear_bias: + qkv_bias.append(layer.self_attention.query_key_value.bias.data) + mlp_l0_bias.append(layer.mlp.dense_h_to_4h.bias.data) + + # Handle gated linear units + if md.swiglu: + # concat all the first halves ('W's) and all the second halves ('V's) + for tp_rank in range(tp_size): + mlp_l0_weight[tp_rank] = torch.chunk(mlp_l0_weight[tp_rank], 2, dim=0) + message["mlp l0 weight W"] = torch.cat([w[0] for w in mlp_l0_weight], dim=0) + message["mlp l0 weight V"] = torch.cat([w[1] for w in mlp_l0_weight], dim=0) + else: + message["mlp l0 weight"] = torch.cat(mlp_l0_weight, dim=0) + + # simple concat of the rest + message["qkv weight"] = torch.cat(qkv_weight, dim=0) + message["dense weight"] = torch.cat(dense_weight, dim=1) + message["mlp l1 weight"] = torch.cat(mlp_l1_weight, dim=1) + if md.linear_bias: + message["qkv bias"] = torch.cat(qkv_bias, dim=0) + if md.swiglu: + for tp_rank in range(tp_size): + mlp_l0_bias[tp_rank] = torch.chunk(mlp_l0_bias[tp_rank], 2, dim=0) + message["mlp l0 bias W"] = torch.cat([b[0] for b in mlp_l0_bias],dim=0) + message["mlp l0 bias V"] = torch.cat([b[1] for b in mlp_l0_bias],dim=0) + else: + message["mlp l0 bias"] = torch.cat(mlp_l0_bias, dim=0) + + queue_put(f"transformer layer {total_layer_num}", message) + + total_layer_num = total_layer_num + 1 + + # Send final norm from tp_rank 0 + message = { + "weight": models[0].language_model.encoder.final_norm.weight.data, + } + if norm_has_bias: + message["bias"] = models[0].language_model.encoder.final_norm.bias.data + queue_put("final norm", message) + + if md.output_layer: + message = { + "weight": torch.cat( + [models[tp_rank].language_model.output_layer.weight.data for tp_rank in range(tp_size)], + dim = 0) + } + queue_put("output layer", message) + + + # Send BERT lm head and binary head if it exists + if md.model_type == 'BERT': + message = { + "weight": models[0].language_model.pooler.dense.weight.data, + "bias": models[0].language_model.pooler.dense.bias.data + } + queue_put("pooler", message) + + message = { + "dense weight": models[0].lm_head.dense.weight.data, + "dense bias": models[0].lm_head.dense.bias.data, + "norm weight": models[0].lm_head.norm.weight.data, + } + if norm_has_bias: + message["norm bias"] = models[0].lm_head.norm.bias.data + queue_put("lm head", message) + + if md.bert_binary_head: + message = { + "weight": models[0].binary_head.weight.data, + "bias": models[0].binary_head.bias.data + } + queue_put("binary head", message) + queue.put("done") + +def load_checkpoint(queue, args): + try: + _load_checkpoint(queue, args) + except: + queue.put("exit") + raise diff --git a/tools/checkpoint/loader_mixtral_hf.py b/tools/checkpoint/loader_mixtral_hf.py new file mode 100644 index 0000000..a53f94e --- /dev/null +++ b/tools/checkpoint/loader_mixtral_hf.py @@ -0,0 +1,335 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import json +import os +import sys +import torch +import transformers +from tqdm import tqdm +import types + + +def add_arguments(parser): + group = parser.add_argument_group(title='Mixtral HF loader.') + + group.add_argument('--true-vocab-size', type=int, default=None, + help='original size of vocab, if specified will trim padding from embedding table.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file. If specified will use this to get vocab size and ' + 'trim padding from the embedding table.') + group.add_argument('--tokenizer-model', required=True, + help='Sentencepiece tokenizer model.') + group.add_argument('--megatron-path', type=str, default=None, + help='Base directory of deepspeed repository') + + +def load_args_from_checkpoint(args): + # Read Mixtral 8x7B args. + from transformers import MixtralConfig + mixtral_config = MixtralConfig.from_pretrained(args.load) + + # Update Megatron args. + args.untie_embeddings_and_output_weights = True + args.seq_length = 4096 + args.global_batch_size = 1024 + args.iteration = 1 # '0', 'release' don't work + args.add_position_embedding = False + args.use_rotary_position_embeddings = True + args.swiglu = True + args.bf16 = True + args.add_bias_linear = False + args.normalization = "RMSNorm" + args.tokenizer_type = "Llama2Tokenizer" + args.disable_bias_linear = True + + args.max_position_embeddings = mixtral_config.max_position_embeddings + args.hidden_size = mixtral_config.hidden_size + args.num_attention_heads = mixtral_config.num_attention_heads + args.num_layers = mixtral_config.num_hidden_layers + args.norm_epsilon = mixtral_config.rms_norm_eps + args.vocab_size = mixtral_config.vocab_size + args.padded_vocab_size = mixtral_config.vocab_size + args.mixtral = mixtral_config + args.ffn_hidden_size = mixtral_config.intermediate_size + args.num_experts = mixtral_config.num_local_experts + args.sequence_parallel = True + + if mixtral_config.num_key_value_heads: + args.group_query_attention = True + args.num_query_groups = mixtral_config.num_key_value_heads + +def verify_transformers_version(): + major, minor, patch = map(int, transformers.__version__.split('.')) + assert major >= 4 and minor >= 36 + +def set_preprocess_state(args, model, hf_model): + '''Set embedding params.''' + model.embedding.word_embeddings.weight.data.copy_( + hf_model.model.embed_tokens.weight) + +def set_postprocess_state(args, model, hf_model): + '''Set output layer & norm params.''' + model.decoder.final_layernorm.weight.data.copy_(hf_model.model.norm.weight) + model.output_layer.weight.data.copy_(hf_model.lm_head.weight) + +def set_attn_state(args, layer, hf_layer): + '''Set self-attention params.''' + + # Get attention layer & state. + attn = layer.self_attention + hf_attn = hf_layer.self_attn + + # Reshape loaded weights. + tp = args.tensor_model_parallel_size + num_heads = args.num_attention_heads // tp + num_query_groups = (args.num_query_groups if args.group_query_attention else args.num_attention_heads) // tp + num_querys_per_group = num_heads // num_query_groups + dim = args.kv_channels + assert num_heads % num_querys_per_group == 0 + + # Copy weights (re-order dimensions for Megatron). + attn.linear_qkv.weight.data.copy_(torch.cat([ + hf_attn.q_proj.weight.reshape((num_query_groups, num_querys_per_group*dim, -1)), + hf_attn.k_proj.weight.reshape((num_query_groups, dim, -1)), + hf_attn.v_proj.weight.reshape((num_query_groups, dim, -1)), + ], dim=1).reshape((-1, args.hidden_size))) + attn.linear_proj.weight.data.copy_(hf_attn.o_proj.weight) + +def set_mlp_state(args, layer, hf_layer): + '''Set MLP params.''' + + layer.mlp.router.weight.data.copy_(hf_layer.block_sparse_moe.gate.weight) + + mcore_experts = layer.mlp.experts.local_experts + hf_experts = hf_layer.block_sparse_moe.experts + for expert_idx in range(args.num_experts): + mcore_experts[expert_idx].linear_fc1.weight.data.copy_( + torch.cat([ + hf_experts[expert_idx].w1.weight, + hf_experts[expert_idx].w3.weight + ], dim=0) + ) + mcore_experts[expert_idx].linear_fc2.weight.data.copy_( + hf_experts[expert_idx].w2.weight + ) + +def set_layer_state(args, model, hf_model, layer_idx): + '''Set transformer layer params.''' + + layer = model.decoder.layers[layer_idx] + hf_layer = hf_model.model.layers[layer_idx] + + set_attn_state(args, layer, hf_layer) + set_mlp_state(args, layer, hf_layer) + + layer.self_attention.linear_qkv.layer_norm_weight.data.copy_(hf_layer.input_layernorm.weight) + layer.pre_mlp_layernorm.weight.data.copy_(hf_layer.post_attention_layernorm.weight) + +def load_checkpoint_to_model(args): + '''Set model params.''' + + from pretrain_gpt import model_provider + from transformers import MixtralForCausalLM, MixtralConfig + + # Load Huggingface model. + + hf_model = MixtralForCausalLM.from_pretrained(args.load, device_map="cpu") + + # Init Megatron model. + model = model_provider(True, True).to(args.params_dtype) + + # Set model state. + set_preprocess_state(args, model, hf_model) + set_postprocess_state(args, model, hf_model) + for layer_idx in tqdm(range(args.num_layers), "set layer states"): + set_layer_state(args, model, hf_model, layer_idx) + return model + + +def _load_checkpoint(queue, args): + + # Llama-2 requires HF transformers >=4.31.0. + verify_transformers_version() + + # Search in directory above this. + sys.path.append(os.path.abspath( + os.path.join(os.path.dirname(__file__), + os.path.pardir, + os.path.pardir))) + if args.megatron_path is not None: + sys.path.insert(0, args.megatron_path) + + try: + from megatron.training.arguments import parse_args, validate_args + from megatron.training.global_vars import set_args, set_global_variables + from megatron.legacy.model import module + from megatron.core import mpu + from megatron.core.enums import ModelType + from megatron.legacy import fused_kernels + except ModuleNotFoundError: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") + queue.put("exit") + exit(1) + + # We want all arguments to come from us. + sys.argv = ['script.py', + '--use-mcore-models', + '--disable-bias-linear', + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--no-initialization', + '--mock-data', # To pass the "blend data checks" in arguments.py + '--transformer-impl', 'transformer_engine', + '--load', args.load_dir + ] + + margs = parse_args() + margs.tokenizer_model = args.tokenizer_model + load_args_from_checkpoint(margs) + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes. + margs.world_size = margs.tensor_model_parallel_size * margs.pipeline_model_parallel_size + + margs = validate_args(margs) + + def check_for_arg(arg_name, default=None): + if getattr(margs, arg_name, None) is None: + if default is not None: + setattr(margs, arg_name, default) + else: + print(f"Checkpoint does not specify the argument {arg_name}. Exiting.") + print(f"Arguments: {margs}") + queue.put("exit") + exit(1) + + check_for_arg('tensor_model_parallel_size') + check_for_arg('pipeline_model_parallel_size') + check_for_arg('num_layers') + check_for_arg('hidden_size') + check_for_arg('seq_length') + check_for_arg('num_attention_heads') + check_for_arg('max_position_embeddings') + check_for_arg('position_embedding_type') + check_for_arg('tokenizer_type') + check_for_arg('iteration') + check_for_arg('disable_bias_linear') + check_for_arg('params_dtype') + check_for_arg('swiglu') + + # Determine how to make our models. + assert args.model_type == 'GPT', 'Llama-2 is a GPT model.' + margs.model_type = ModelType.encoder_or_decoder + + # Suppress warning about torch.distributed not being initialized. + module.MegatronModule.embedding_warning_printed = True + + set_global_variables(margs, build_tokenizer=False) + mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) + mpu.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size) + mpu.set_virtual_pipeline_model_parallel_world_size(margs.virtual_pipeline_model_parallel_size) + mpu.set_expert_model_parallel_world_size(margs.expert_model_parallel_size) + fused_kernels.load(margs) + + # Metadata. + md = types.SimpleNamespace() + md.model_type = args.model_type + md.num_layers = margs.num_layers + md.hidden_size = margs.hidden_size + md.seq_length = margs.seq_length + md.num_attention_heads = margs.num_attention_heads + md.max_position_embeddings = margs.max_position_embeddings + md.tokenizer_type = margs.tokenizer_type + md.iteration = margs.iteration + md.params_dtype = margs.params_dtype + md.bert_binary_head = margs.bert_binary_head + md.output_layer = margs.untie_embeddings_and_output_weights + md.position_embedding_type = margs.position_embedding_type + md.linear_bias = margs.add_bias_linear + md.norm_has_bias = False + md.swiglu = margs.swiglu + md.previous_tensor_parallel_size = margs.tensor_model_parallel_size + md.previous_pipeline_parallel_size = margs.pipeline_model_parallel_size + md.true_vocab_size = margs.vocab_size # skips padding in saver + md.make_vocab_size_divisible_by = None + md.checkpoint_args = margs + md.consumed_train_samples = 0 + md.consumed_valid_samples = 0 + md.num_experts = margs.num_experts + + # Get first pipe stage. + mpu.set_tensor_model_parallel_rank(0) + mpu.set_pipeline_model_parallel_rank(0) + mpu.set_expert_model_parallel_rank(0) + model = load_checkpoint_to_model(margs) + + queue.put(md) + + def queue_put(name, msg): + print(f"sending {name}") + msg["name"] = name + queue.put(msg) + + # Send embeddings. + message = { + "word embeddings": model.embedding.word_embeddings.weight.data + } + if md.position_embedding_type == 'learned_absolute': + message["position embeddings"] = model.embedding.position_embeddings.weight.data + else: + assert not hasattr(model.embedding, 'position_embeddings') + + queue_put("embeddings", message) + + for layer_idx in range(margs.num_layers): + message = {} + + # Get non-parallel tensors from tp_rank 0. + layer = model.decoder.layers[layer_idx] + message["input norm weight"] = layer.self_attention.linear_qkv.layer_norm_weight.data + message["post norm weight"] = layer.pre_mlp_layernorm.weight.data + + # Simple concat of the rest. + message["qkv weight"] = layer.self_attention.linear_qkv.weight.data + message["dense weight"] = layer.self_attention.linear_proj.weight.data + + # Grab all parallel tensors for this layer. + layer = model.decoder.layers[layer_idx] + experts = layer.mlp.experts.local_experts + + message["router weight"] = layer.mlp.router.weight.data + if md.swiglu: + chunked_mlp_l0_weight = [torch.chunk(local_expert.linear_fc1.weight.data, 2, dim=0) for local_expert in experts] + message["mlp l0 weight W"] = torch.stack([local_weight[0] for local_weight in chunked_mlp_l0_weight], dim=0) + message["mlp l0 weight V"] = torch.stack([local_weight[1] for local_weight in chunked_mlp_l0_weight], dim=0) + else: + message["mlp l0 weight"] = torch.stack([local_expert.linear_fc1.weight.data for local_expert in experts]) + message["mlp l1 weight"] = torch.stack([local_expert.linear_fc2.weight.data for local_expert in experts], dim=0) + + queue_put(f"transformer layer {layer_idx}", message) + + queue_put("final norm", { + "weight": model.decoder.final_layernorm.weight.data, + }) + + if md.output_layer: + queue_put("output layer", { + "weight": model.output_layer.weight.data + }) + + queue.put("done") + +def load_checkpoint(queue, args): + try: + _load_checkpoint(queue, args) + except: + queue.put("exit") + raise diff --git a/tools/checkpoint/saver_mcore.py b/tools/checkpoint/saver_mcore.py new file mode 100644 index 0000000..fbfd061 --- /dev/null +++ b/tools/checkpoint/saver_mcore.py @@ -0,0 +1,796 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import os +import sys +import torch +from importlib.metadata import version +from pkg_resources import packaging + +from setter import ModelSetter +from utils import get_mcore_transformer_block_key, print_memory_usage + + +class MCoreSetter(ModelSetter): + + transformer_block_key = None + + @classmethod + def get_transformer_block(cls, model): + return getattr(model, cls.transformer_block_key) + + @classmethod + def has_position_embeddings(cls, model): + return hasattr(model.embedding, "position_embeddings") + + @classmethod + def set_embeddings( + cls, + model, + word=None, + pos=None, + ): + cls.set_tensor(model.embedding.word_embeddings.weight, word) + if pos is not None: + cls.set_tensor(model.embedding.position_embeddings.weight, pos) + + @classmethod + def set_final_norm( + cls, + model, + weight=None, + bias=None, + ): + block = cls.get_transformer_block(model) + cls.set_tensor(block.final_layernorm.weight, weight) + if bias is not None: + cls.set_tensor(block.final_layernorm.bias, bias) + + @classmethod + def set_output_word_embeddings( + cls, + model, + emb=None, + ): + cls.set_tensor(model.embedding.word_embeddings.weight, emb) + + @classmethod + def set_output_layer( + cls, + model, + weight=None, + ): + cls.set_tensor(model.output_layer.weight, weight) + + @classmethod + def set_pooler( + cls, + model, + weight=None, + bias=None, + ): + cls.set_tensor(model.pooler.dense.weight, weight) + if bias is not None: + cls.set_tensor(model.pooler.dense.bias, bias) + + @classmethod + def set_lm_head( + cls, + model, + dense_weight=None, + dense_bias=None, + norm_weight=None, + norm_bias=None, + ): + + cls.set_tensor(model.lm_head.dense.weight, dense_weight) + if dense_bias is not None: + cls.set_tensor(model.lm_head.dense.bias, dense_bias) + + cls.set_tensor(model.lm_head.layer_norm.weight, norm_weight) + if norm_bias is not None: + cls.set_tensor(model.lm_head.layer_norm.bias, norm_bias) + + @classmethod + def set_binary_head( + cls, + model, + weight=None, + bias=None, + ): + cls.set_tensor(model.binary_head.weight, weight) + if bias is not None: + cls.set_tensor(model.binary_head.bias, bias) + + +class MCoreLocalSetter(MCoreSetter): + + @classmethod + def set_layer( + cls, + model, + layer_idx, + self_attn_norm_weight=None, + self_attn_norm_bias=None, + self_attn_qkv_weight=None, + self_attn_qkv_bias=None, + self_attn_proj_weight=None, + self_attn_proj_bias=None, + mlp_norm_weight=None, + mlp_norm_bias=None, + mlp_fc1_weight=None, + mlp_fc1_bias=None, + mlp_fc2_weight=None, + mlp_fc2_bias=None, + ): + + block = cls.get_transformer_block(model) + l = block.layers[layer_idx] + + # Self attention. + cls.set_tensor(l.input_layernorm.weight, self_attn_norm_weight) + if self_attn_norm_bias is not None: + cls.set_tensor(l.input_layernorm.bias, self_attn_norm_bias) + + cls.set_tensor(l.self_attention.linear_qkv.weight, self_attn_qkv_weight) + if self_attn_qkv_bias is not None: + cls.set_tensor(l.self_attention.linear_qkv.bias, self_attn_qkv_bias) + + cls.set_tensor(l.self_attention.linear_proj.weight, self_attn_proj_weight) + if self_attn_proj_bias is not None: + cls.set_tensor(l.self_attention.linear_proj.bias, self_attn_proj_bias) + + # MLP. + cls.set_tensor(l.pre_mlp_layernorm.weight, mlp_norm_weight) + if mlp_norm_bias is not None: + cls.set_tensor(l.pre_mlp_layernorm.bias, mlp_norm_bias) + + cls.set_tensor(l.mlp.linear_fc1.weight, mlp_fc1_weight) + if mlp_fc1_bias is not None: + cls.set_tensor(l.mlp.linear_fc1.bias, mlp_fc1_bias) + + cls.set_tensor(l.mlp.linear_fc2.weight, mlp_fc2_weight) + if mlp_fc2_bias is not None: + cls.set_tensor(l.mlp.linear_fc2.bias, mlp_fc2_bias) + + +class MCoreTESetter(MCoreSetter): + + @classmethod + def set_layer( + cls, + model, + layer_idx, + self_attn_norm_weight=None, + self_attn_norm_bias=None, + self_attn_qkv_weight=None, + self_attn_qkv_bias=None, + self_attn_proj_weight=None, + self_attn_proj_bias=None, + mlp_norm_weight=None, + mlp_norm_bias=None, + mlp_fc1_weight=None, + mlp_fc1_bias=None, + mlp_fc2_weight=None, + mlp_fc2_bias=None, + ): + + block = cls.get_transformer_block(model) + l = block.layers[layer_idx] + + # Self attention. + cls.set_tensor(l.self_attention.linear_qkv.layer_norm_weight, self_attn_norm_weight) + if self_attn_norm_bias is not None: + cls.set_tensor(l.self_attention.linear_qkv.layer_norm_bias, self_attn_norm_bias) + + cls.set_tensor(l.self_attention.linear_qkv.weight, self_attn_qkv_weight) + if self_attn_qkv_bias is not None: + cls.set_tensor(l.self_attention.linear_qkv.bias, self_attn_qkv_bias) + + cls.set_tensor(l.self_attention.linear_proj.weight, self_attn_proj_weight) + if self_attn_proj_bias is not None: + cls.set_tensor(l.self_attention.linear_proj.bias, self_attn_proj_bias) + + # MLP. + cls.set_tensor(l.mlp.linear_fc1.layer_norm_weight, mlp_norm_weight) + if mlp_norm_bias is not None: + cls.set_tensor(l.mlp.linear_fc1.layer_norm_bias, mlp_norm_bias) + + cls.set_tensor(l.mlp.linear_fc1.weight, mlp_fc1_weight) + if mlp_fc1_bias is not None: + cls.set_tensor(l.mlp.linear_fc1.bias, mlp_fc1_bias) + + cls.set_tensor(l.mlp.linear_fc2.weight, mlp_fc2_weight) + if mlp_fc2_bias is not None: + cls.set_tensor(l.mlp.linear_fc2.bias, mlp_fc2_bias) + +class MCoreMoETESetter(MCoreSetter): + + @classmethod + def set_layer( + cls, + model, + layer_idx, + router_weight=None, + self_attn_norm_weight=None, + self_attn_norm_bias=None, + self_attn_qkv_weight=None, + self_attn_qkv_bias=None, + self_attn_proj_weight=None, + self_attn_proj_bias=None, + mlp_norm_weight=None, + mlp_norm_bias=None, + mlp_fc1_weight=None, + mlp_fc1_bias=None, + mlp_fc2_weight=None, + mlp_fc2_bias=None, + ): + + block = cls.get_transformer_block(model) + l = block.layers[layer_idx] + + # Self attention. + cls.set_tensor(l.self_attention.linear_qkv.layer_norm_weight, self_attn_norm_weight) + if self_attn_norm_bias is not None: + cls.set_tensor(l.self_attention.linear_qkv.layer_norm_bias, self_attn_norm_bias) + cls.set_tensor(l.self_attention.linear_qkv.weight, self_attn_qkv_weight) + if self_attn_qkv_bias is not None: + cls.set_tensor(l.self_attention.linear_qkv.bias, self_attn_qkv_bias) + cls.set_tensor(l.self_attention.linear_proj.weight, self_attn_proj_weight) + if self_attn_proj_bias is not None: + cls.set_tensor(l.self_attention.linear_proj.bias, self_attn_proj_bias) + + # MLP. + cls.set_tensor(l.pre_mlp_layernorm.weight, mlp_norm_weight) + if model.config.normalization == "LayerNorm": + cls.set_tensor(l.pre_mlp_layernorm.bias, mlp_norm_bias) + + cls.set_tensor(l.mlp.router.weight, router_weight) + + num_local_experts = mlp_fc1_weight.shape[0] + for expert_idx in range(num_local_experts): + cls.set_tensor(l.mlp.experts.local_experts[expert_idx].linear_fc1.weight, mlp_fc1_weight[expert_idx]) + cls.set_tensor(l.mlp.experts.local_experts[expert_idx].linear_fc2.weight, mlp_fc2_weight[expert_idx]) + + +def get_model_setter(model_type, transformer_impl, num_experts=0): + if num_experts is not None and num_experts > 0: + # Only support TE setter for MOE + assert transformer_impl == "transformer_engine" + setter = MCoreMoETESetter + else: + setter = { + "local" : MCoreLocalSetter, + "transformer_engine" : MCoreTESetter, + }[transformer_impl] + setter.transformer_block_key = get_mcore_transformer_block_key(model_type) + return setter + + +def add_arguments(parser): + group = parser.add_argument_group(title='M-Core saver') + + group.add_argument('--megatron-path', type=str, default=None, + help='Base directory of Megatron repository') + + group.add_argument('--target-tensor-parallel-size', type=int, + help='Target tensor model parallel size, defaults to the tensor parallel size ' + 'in the input checkpoint if provided by the loader, otherwise to 1') + group.add_argument('--target-pipeline-parallel-size', type=int, + help='Target tensor model parallel size, default to the pipeline parall size ' + 'in the input checkpoint if provided by the loader, otherwise to 1') + group.add_argument('--saver-transformer-impl', default='transformer_engine', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') + group.add_argument('--target-expert-parallel-size', type=int, default=1, + help='Target expert model parallel size, default to 1') + + +def save_checkpoint(queue, args): + + # Transformer engine >= 0.12.0, for CPU initialization. + te_version = packaging.version.Version(version("transformer-engine")) + assert te_version >= packaging.version.Version("0.12.0"), \ + "transformer engine version: %s (>=0.12.0 required)." % te_version + + # Search in directory above this + sys.path.append(os.path.abspath( + os.path.join(os.path.dirname(__file__), + os.path.pardir, + os.path.pardir))) + if args.megatron_path is not None: + sys.path.insert(0, args.megatron_path) + + try: + from megatron.training.arguments import (parse_args, validate_args) + from megatron.training.checkpointing import save_checkpoint + from megatron.training.global_vars import set_global_variables, get_args + from megatron.core.enums import ModelType + from megatron.training.tokenizer.tokenizer import _vocab_size_with_padding + from megatron.legacy import fused_kernels + from megatron.core import mpu + except ModuleNotFoundError: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") + exit(1) + + def queue_get(name=None): + val = queue.get() + if val == "exit": + print("Loader exited, exiting saver") + exit(1) + if name is not None and args.checking and val["name"] != name: + val_name = val["name"] + print(f'Unexpected message. Expecting "{name}" but got "{val_name}". Exiting saver.') + exit(1) + if name is not None: + print(f"received {name}") + return val + + def check_message(msg): + if not args.checking: + return + msg_name = msg.pop("name") + if len(msg.keys()) > 0: + print(f"Unexpected values in {msg_name}:") + for key in msg.keys(): + print(f" {key}") + print(f"Exiting. If you want to ignore this, use the argument --no-checking.") + exit(1) + + + md = queue_get() + + if args.target_tensor_parallel_size is None: + if hasattr(md, 'previous_tensor_parallel_size'): + args.target_tensor_parallel_size = md.previous_tensor_parallel_size + else: + print("loader did not provide a tensor parallel size and --target-tensor-parallel-size not provided on command line. " + "Default to 1.") + args.target_tensor_parallel_size = 1 + + if args.target_pipeline_parallel_size is None: + if hasattr(md, 'previous_pipeline_parallel_size'): + args.target_pipeline_parallel_size = md.previous_pipeline_parallel_size + else: + print("loader did not provide a pipeline parallel size and --target-pipeline-parallel-size not provided on command line. " + "Default to 1.") + args.target_pipeline_parallel_size = 1 + + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes + if args.target_tensor_parallel_size is not None and args.target_pipeline_parallel_size is not None: + if args.target_expert_parallel_size is not None: + os.environ["WORLD_SIZE"] = f'{args.target_tensor_parallel_size * args.target_pipeline_parallel_size * args.target_expert_parallel_size}' + else: + os.environ["WORLD_SIZE"] = f'{args.target_tensor_parallel_size * args.target_pipeline_parallel_size}' + + # We want all arguments to come from us + sys.argv = ['script.py', + '--num-layers', str(md.num_layers), + '--hidden-size', str(md.hidden_size), + '--seq-length', str(md.seq_length), + '--num-experts', str(getattr(md, "num_experts", 0)), + '--num-attention-heads', str(md.num_attention_heads), + '--max-position-embeddings', str(md.max_position_embeddings), + '--position-embedding-type', str(md.position_embedding_type), + '--tokenizer-type', str(md.tokenizer_type), + '--tensor-model-parallel-size', str(args.target_tensor_parallel_size), + '--pipeline-model-parallel-size', str(args.target_pipeline_parallel_size), + '--expert-model-parallel-size', str(args.target_expert_parallel_size), + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--no-initialization', + '--save-interval', '1', + '--save', args.save_dir + ] + + if md.make_vocab_size_divisible_by is not None: + sys.argv.extend(['--make-vocab-size-divisible-by', str(md.make_vocab_size_divisible_by)]) + if md.params_dtype == torch.float16: + sys.argv.append('--fp16') + elif md.params_dtype == torch.bfloat16: + sys.argv.append('--bf16') + + if md.output_layer: + sys.argv.append('--untie-embeddings-and-output-weights') + if not md.linear_bias: + sys.argv.append('--disable-bias-linear') + + if md.model_type == 'BERT' and not md.bert_binary_head: + sys.argv.append('--bert-no-binary-head') + + margs = parse_args() + + if hasattr (md, 'checkpoint_args'): + # These are arguments that we are either changing, or cause problems for validation if they are set + # Note that some of these deal with T5 so will need to be changed if we support T5. + args_to_keep = ['tensor_model_parallel_size', 'pipeline_model_parallel_size', 'expert_model_parallel_size', 'world_size', 'params_dtype', + 'num_layers_per_virtual_pipeline_stage', 'virtual_pipeline_model_parallel_size', + 'masked_softmax_fusion', 'bias_gelu_fusion', 'bias_dropout_fusion', + 'sequence_parallel', 'async_tensor_model_parallel_allreduce', + 'no_load_optim', 'no_load_rng', 'no_save_optim', 'no_save_rng', + 'vocab_file', 'tokenizer_model', + 'save_interval', 'save', + 'perform_initialization', 'use_cpu_initialization', + 'recompute_granularity', 'recompute_num_layers', 'recompute_method', + 'encoder_num_layers', 'encoder_seq_length', + 'distribute_saved_activations', + 'train_iters', 'lr_decay_iters', 'lr_warmup_iters', 'lr_warmup_fraction', + 'start_weight_decay', 'end_weight_decay'] + + for arg, value in vars(md.checkpoint_args).items(): + if arg in args_to_keep: + continue + if not hasattr(margs, arg): + print(f"Checkpoint had argument {arg} but new arguments does not have this.") + continue + if getattr(margs, arg) != value: + print(f"Overwriting default {arg} value {getattr(margs, arg)} with value from checkpoint {value}.") + setattr(margs, arg, value) + + # Explicitly copy sequence_parallel, apply_query_key_layer_scaling. + margs.sequence_parallel = md.checkpoint_args.sequence_parallel + margs.apply_query_key_layer_scaling = md.checkpoint_args.apply_query_key_layer_scaling + + # Sequence parallel is required if use both tensor-parallel and Moe. + if margs.num_experts is not None and args.target_tensor_parallel_size is not None: + if margs.num_experts > 1 and args.target_tensor_parallel_size > 1: + margs.sequence_parallel = True + + validate_args(margs) + + # Use M-core models & unset loaded paths. + margs.use_legacy_models = False + margs.blendable_index_path = None + margs.data_path = [] + margs.load = None + margs.save = args.save_dir + margs.tensorboard_dir = None + margs.tokenizer_model = None + margs.transformer_impl = args.saver_transformer_impl + + set_global_variables(margs, build_tokenizer=False) + + # Megatron args. (i.e., 'margs') + margs = get_args() + + if hasattr(md, 'consumed_train_samples'): + margs.consumed_train_samples = md.consumed_train_samples + margs.consumed_valid_samples = md.consumed_valid_samples + print(f"Setting consumed_train_samples to {margs.consumed_train_samples}" + f" and consumed_valid_samples to {margs.consumed_valid_samples}") + else: + print("consumed_train_samples not provided.") + + # Determine how to make our models + if md.model_type == 'GPT': + from pretrain_gpt import model_provider + margs.model_type = ModelType.encoder_or_decoder + elif md.model_type == 'BERT': + from pretrain_bert import model_provider + margs.model_type = ModelType.encoder_or_decoder + else: + raise Exception(f'unrecognized model type: {args.model_type}') + + # fake initializing distributed + mpu.set_tensor_model_parallel_world_size(args.target_tensor_parallel_size) + mpu.set_pipeline_model_parallel_world_size(args.target_pipeline_parallel_size) + mpu.set_expert_model_parallel_world_size(args.target_expert_parallel_size) + mpu.set_tensor_model_parallel_rank(0) + mpu.set_pipeline_model_parallel_rank(0) + mpu.set_expert_model_parallel_rank(0) + fused_kernels.load(margs) + + # Embeddings + #----------- + embeddings_msg = queue_get("embeddings") + + pos_embed = None + if md.position_embedding_type == 'learned_absolute': + pos_embed = embeddings_msg.pop("position embeddings") + orig_word_embed = embeddings_msg.pop("word embeddings") + check_message(embeddings_msg) + + # Deal with padding + def pad_weight(orig_word_embed, true_vocab_size): + if true_vocab_size is not None: + # figure out what our padded vocab size is + orig_vocab_size = orig_word_embed.shape[0] + margs.padded_vocab_size = _vocab_size_with_padding(true_vocab_size, margs) + + # Cut out extra padding we don't need + if orig_vocab_size > margs.padded_vocab_size: + full_word_embed = orig_word_embed[0:margs.padded_vocab_size,:] + + # Expanding embedding to larger size by replicating final entry + elif orig_vocab_size < margs.padded_vocab_size: + padding_size = margs.padded_vocab_size - orig_vocab_size + + full_word_embed = torch.cat(( + orig_word_embed, + orig_word_embed[-1].unsqueeze(0).expand(padding_size, -1))) + + # Same size! + else: + full_word_embed = orig_word_embed + else: + print("Original vocab size not specified, leaving embedding table as-is. " + "If you've changed the tensor parallel size this could cause problems.") + margs.padded_vocab_size = orig_word_embed.shape[0] + full_word_embed = orig_word_embed + return full_word_embed + + full_word_embed = pad_weight(orig_word_embed, md.true_vocab_size) + + # Split into new tensor model parallel sizes + out_word_embed = torch.chunk(full_word_embed, args.target_tensor_parallel_size, dim=0) + + # Parameter setter class. + setter = get_model_setter(md.model_type, margs.transformer_impl, margs.num_experts) + + # Construct a 3D(PPxEPxTP) arry for models, fill it with None + models = [[[None for _ in range(args.target_tensor_parallel_size)] for _ in range(args.target_expert_parallel_size)] for _ in range(args.target_pipeline_parallel_size)] + + # Model is lazy instantiated at firstly using + def get_local_model(pp_rank, ep_rank, tp_rank): + if models[pp_rank][ep_rank][tp_rank] is None: + pre_process = True if pp_rank == 0 else False + post_process = True if pp_rank == args.target_pipeline_parallel_size - 1 else False + models[pp_rank][ep_rank][tp_rank] = model_provider(pre_process, post_process).to(md.params_dtype) + return models[pp_rank][ep_rank][tp_rank] + + # Set embeddings. + # -------------- + for ep_rank in range(args.target_expert_parallel_size): + for tp_rank in range(args.target_tensor_parallel_size): + model = get_local_model(0, ep_rank, tp_rank) + if pos_embed is None: + assert not setter.has_position_embeddings(model) + setter.set_embeddings( + model, + word=out_word_embed[tp_rank], + pos=pos_embed, + ) + + def chunk_weight(weight, parallel_mode, tp_size=1, ep_size=1): + assert parallel_mode in ["row", "column"] + if weight.dim() == 3: + num_experts, out_features, in_features = weight.shape + if parallel_mode == "column": + weight = weight.reshape(ep_size, num_experts // ep_size, tp_size, out_features // tp_size, in_features) + weight = weight.permute(0, 2, 1, 3, 4) + else: + weight = weight.reshape(ep_size, num_experts // ep_size, out_features, tp_size, in_features // tp_size) + weight = weight.permute(0, 3, 1, 2, 4) + return weight # (ep_size, tp_size, local_eps, output_features, in_features) + else: + out_features, in_features = weight.shape + if parallel_mode == "column": + weight = weight.reshape(tp_size, out_features // tp_size, in_features) + else: + weight = weight.reshape(out_features, tp_size, in_features // tp_size).permute(1, 0, 2) + return weight # (tp_size, output_features, in_features) + + def chunk_bias(bias, parallel_mode, tp_size=1, ep_size=1): + assert parallel_mode in ["row", "column"] + if bias.dim() == 2: + num_experts, hidden_size = bias.shape + if parallel_mode == 'column': + bias = bias.reshape(ep_size, num_experts // ep_size, tp_size, hidden_size // tp_size) + bias = bias.permute(0, 2, 1, 3) # (ep_size, tp_size, local_eps, hidden_size) + else: + bias = bias.reshape(ep_size, num_experts // ep_size, hidden_size) # (ep_size, local_eps, hidden_size) + return bias + else: + hidden_size = bias.shape + if parallel_mode == "column": + bias = bias.reshape(tp_size, hidden_size[0] // tp_size) # (tp_size, hidden_size) + return bias + + # Transformer layers. + # ------------------ + total_layer_num = 0 + for pp_rank in range(args.target_pipeline_parallel_size): + # initial the first module in pp stage to get the layer_num, pooler, lm_head. binary_head + get_local_model(pp_rank,0,0) + for layer_id in range(len(setter.get_transformer_block(models[pp_rank][0][0]).layers)): + msg = queue_get(f"transformer layer {total_layer_num}") + + # duplicated tensors + input_norm_weight = msg.pop("input norm weight") + post_norm_weight = msg.pop("post norm weight") + if md.norm_has_bias: + input_norm_bias = msg.pop("input norm bias") + post_norm_bias = msg.pop("post norm bias") + + # Split up the parallel tensors + qkv_weight = chunk_weight(msg.pop("qkv weight"), "column", args.target_tensor_parallel_size) + dense_weight = chunk_weight(msg.pop("dense weight"), "row", args.target_tensor_parallel_size) + mlp_l1_weight = chunk_weight(msg.pop("mlp l1 weight"), "row", args.target_tensor_parallel_size, args.target_expert_parallel_size) + + if margs.num_experts: + router = msg.pop("router weight") + + # Special handling for swiglu + if md.swiglu: + mlp_l0_weight_W = chunk_weight(msg.pop("mlp l0 weight W"), "column", args.target_tensor_parallel_size, args.target_expert_parallel_size) + mlp_l0_weight_V = chunk_weight(msg.pop("mlp l0 weight V"), "column", args.target_tensor_parallel_size, args.target_expert_parallel_size) + mlp_l0_weight = torch.cat((mlp_l0_weight_W, mlp_l0_weight_V), dim=-2) + else: + mlp_l0_weight = chunk_weight(msg.pop("mlp l0 weight"), "column", args.target_tensor_parallel_size, args.target_expert_parallel_size) + + if md.linear_bias: + dense_bias = msg.pop("dense bias") + mlp_l1_bias = chunk_bias(msg.pop("mlp l1 bias"), 'row', args.target_tensor_parallel_size, args.target_expert_parallel_size) + qkv_bias = chunk_bias(msg.pop("qkv bias"), 'column', args.target_tensor_parallel_size) + if md.swiglu: + mlp_l0_bias_W = chunk_bias(msg.pop("mlp l0 bias W"), 'column', args.target_tensor_parallel_size, args.target_expert_parallel_size) + mlp_l0_bias_V = chunk_bias(msg.pop("mlp l0 bias V"), 'column', args.target_tensor_parallel_size, args.target_expert_parallel_size) + mlp_l0_bias = torch.cat((mlp_l0_bias_W, mlp_l0_bias_V), dim=-1) + else: + mlp_l0_bias = chunk_bias(msg.pop("mlp l0 bias"), 'column', args.target_tensor_parallel_size, args.target_expert_parallel_size) + + # Save them to the model + for ep_rank in range(args.target_expert_parallel_size): + for tp_rank in range(args.target_tensor_parallel_size): + params_dict = { + "self_attn_norm_weight" : input_norm_weight, + "self_attn_qkv_weight" : qkv_weight[tp_rank], + "self_attn_proj_weight" : dense_weight[tp_rank], + "mlp_norm_weight" : post_norm_weight + } + if margs.num_experts: + params_dict.update({ + "mlp_fc1_weight" : mlp_l0_weight[ep_rank][tp_rank], + "mlp_fc2_weight" : mlp_l1_weight[ep_rank][tp_rank] + }) + else: + params_dict.update({ + "mlp_fc1_weight" : mlp_l0_weight[tp_rank], + "mlp_fc2_weight" : mlp_l1_weight[tp_rank] + }) + params_dict.update({ + "self_attn_norm_bias" : input_norm_bias if md.norm_has_bias else None, + "mlp_norm_bias" : post_norm_bias if md.norm_has_bias else None, + }) + if md.linear_bias: + params_dict.update({ + "self_attn_qkv_bias" : qkv_bias[tp_rank], + "self_attn_proj_bias" : dense_bias + }) + if margs.num_experts: + params_dict.update({ + "mlp_fc1_bias" : mlp_l0_bias[ep_rank][tp_rank], + "mlp_fc2_bias" : mlp_l1_bias[ep_rank] + }) + else : + params_dict.update({ + "mlp_fc1_bias" : mlp_l0_bias[tp_rank], + "mlp_fc2_bias" : mlp_l1_bias + }) + if margs.num_experts: + params_dict.update({ + "router_weight": router + }) + model = get_local_model(pp_rank, ep_rank, tp_rank) + setter.set_layer(model, layer_id, **params_dict) + + total_layer_num = total_layer_num + 1 + check_message(msg) + + + if pp_rank == args.target_pipeline_parallel_size - 1: + msg = queue_get("final norm") + final_norm_weight = msg.pop("weight") + if md.norm_has_bias: + final_norm_bias = msg.pop("bias") + pp_local_models = [get_local_model(pp_rank, ep_rank, tp_rank) for ep_rank in range(args.target_expert_parallel_size) + for tp_rank in range(args.target_tensor_parallel_size)] + for eptp_rank, model in enumerate(pp_local_models): + tp_rank = eptp_rank % args.target_tensor_parallel_size + setter.set_final_norm( + model, + weight=final_norm_weight, + bias=final_norm_bias if md.norm_has_bias else None, + ) + if pp_rank != 0 and not md.output_layer: + # Copy word embeddings to final pipeline rank + setter.set_output_word_embeddings( + model, + emb=out_word_embed[tp_rank], + ) + del final_norm_weight + if md.norm_has_bias: + del final_norm_bias + check_message(msg) + + if md.output_layer: + msg = queue_get("output layer") + if not hasattr(pp_local_models[0], 'output_layer'): + print("ERROR: got an output layer, but model does not have one") + exit(1) + output_layer_weight = pad_weight(msg.pop("weight"), md.true_vocab_size) + output_layer_weight = torch.chunk(output_layer_weight, args.target_tensor_parallel_size, dim=0) + for eptp_rank, model in enumerate(pp_local_models): + tp_rank = eptp_rank % args.target_tensor_parallel_size + setter.set_output_layer(model, output_layer_weight[tp_rank]) + check_message(msg) + + msg = queue_get() + if msg != "done" and msg["name"] == "pooler": + if not hasattr(models[pp_rank][0][0], 'pooler'): + print("ERROR: got a pooler, but model does not have one") + exit(1) + print("received pooler") + pooler_weight = msg.pop("weight") + pooler_bias = msg.pop("bias") + for model in pp_local_models: + setter.set_pooler( + model=model, + weight=pooler_weight, + bias=pooler_bias, + ) + del pooler_weight + del pooler_bias + check_message(msg) + msg = queue_get() + + if msg != "done" and msg["name"] == "lm head": + if not hasattr(models[pp_rank][0][0], 'lm_head'): + print("ERROR: got an lm head, but model does not have one") + exit(1) + print("received lm head") + lm_head_dense_weight = msg.pop("dense weight") + lm_head_dense_bias = msg.pop("dense bias") + lm_head_norm_weight = msg.pop("norm weight") + if md.norm_has_bias: + lm_head_norm_bias = msg.pop("norm bias") + for model in pp_local_models: + setter.set_lm_head( + model=model, + dense_weight=lm_head_dense_weight, + dense_bias=lm_head_dense_bias, + norm_weight=lm_head_norm_weight, + norm_bias=lm_head_norm_bias if md.norm_has_bias else None, + ) + check_message(msg) + msg = queue_get() + + if msg != "done" and msg["name"] == "binary head": + if not hasattr(models[pp_rank][0][0], 'binary_head'): + print("ERROR: got a binary head, but model does not have one") + exit(1) + print("received binary head") + binary_head_weight = msg.pop("weight") + binary_head_bias = msg.pop("bias") + for model in pp_local_models: + setter.set_binary_head( + model=model, + weight=binary_head_weight, + bias=binary_head_bias, + ) + check_message(msg) + msg = queue_get() + + # TODO: delete weight when not used + if msg != "done": + print("ERROR: got some more data but was expecting to be done") + + for ep_rank in range(args.target_expert_parallel_size): + for tp_rank in range(args.target_tensor_parallel_size): + save_checkpoint(md.iteration, [get_local_model(pp_rank, ep_rank, tp_rank)], None, None, num_floating_point_operations_so_far=0, + pipeline_rank=pp_rank, pipeline_parallel=args.target_pipeline_parallel_size > 1, + expert_rank=ep_rank, expert_parallel=args.target_expert_parallel_size > 1, + tensor_rank=tp_rank) + # release the uselese model parts + models[pp_rank][ep_rank][tp_rank] = None + + print("Done!") diff --git a/tools/checkpoint/saver_megatron.py b/tools/checkpoint/saver_megatron.py new file mode 100644 index 0000000..38f80f1 --- /dev/null +++ b/tools/checkpoint/saver_megatron.py @@ -0,0 +1,410 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import os +import sys +import torch + + +def add_arguments(parser): + group = parser.add_argument_group(title='Megatron saver') + + group.add_argument('--megatron-path', type=str, default=None, + help='Base directory of Megatron repository') + + group.add_argument('--target-tensor-parallel-size', type=int, + help='Target tensor model parallel size, defaults to the tensor parallel size ' + 'in the input checkpoint if provided by the loader, otherwise to 1') + group.add_argument('--target-pipeline-parallel-size', type=int, + help='Target tensor model parallel size, default to the pipeline parall size ' + 'in the input checkpoint if provided by the loader, otherwise to 1') + group.add_argument('--saver-transformer-impl', default='local', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') + +def save_checkpoint(queue, args): + # Search in directory above this + sys.path.append(os.path.abspath( + os.path.join(os.path.dirname(__file__), + os.path.pardir, + os.path.pardir))) + if args.megatron_path is not None: + sys.path.insert(0, args.megatron_path) + + try: + from megatron.training.arguments import (parse_args, validate_args) + from megatron.training.checkpointing import save_checkpoint + from megatron.training.global_vars import set_global_variables, get_args + from megatron.core.enums import ModelType + from megatron.training.tokenizer.tokenizer import _vocab_size_with_padding + from megatron.legacy import fused_kernels + from megatron.core import mpu + except ModuleNotFoundError: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") + exit(1) + + def queue_get(name=None): + val = queue.get() + if val == "exit": + print("Loader exited, exiting saver") + exit(1) + if name is not None and args.checking and val["name"] != name: + val_name = val["name"] + print(f'Unexpected message. Expecting "{name}" but got "{val_name}". Exiting saver.') + exit(1) + if name is not None: + print(f"received {name}") + return val + + def check_message(msg): + if not args.checking: + return + msg_name = msg.pop("name") + if len(msg.keys()) > 0: + print(f"Unexpected values in {msg_name}:") + for key in msg.keys(): + print(f" {key}") + print(f"Exiting. If you want to ignore this, use the argument --no-checking.") + exit(1) + + md = queue_get() + + if args.target_tensor_parallel_size is None: + if hasattr(md, 'previous_tensor_parallel_size'): + args.target_tensor_parallel_size = md.previous_tensor_parallel_size + else: + print( + "loader did not provide a tensor parallel size and --target-tensor-parallel-size not provided on command line. " + "Default to 1.") + args.target_tensor_parallel_size = 1 + + if args.target_pipeline_parallel_size is None: + if hasattr(md, 'previous_pipeline_parallel_size'): + args.target_pipeline_parallel_size = md.previous_pipeline_parallel_size + else: + print( + "loader did not provide a pipeline parallel size and --target-pipeline-parallel-size not provided on command line. " + "Default to 1.") + args.target_pipeline_parallel_size = 1 + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes + if args.target_tensor_parallel_size is not None and args.target_pipeline_parallel_size is not None: + os.environ["WORLD_SIZE"] = f'{args.target_tensor_parallel_size * args.target_pipeline_parallel_size}' + + # We want all arguments to come from us + sys.argv = ['script.py', + '--num-layers', str(md.num_layers), + '--hidden-size', str(md.hidden_size), + '--seq-length', str(md.seq_length), + '--num-attention-heads', str(md.num_attention_heads), + '--max-position-embeddings', str(md.max_position_embeddings), + '--position-embedding-type', str(md.position_embedding_type), + '--tokenizer-type', str(md.tokenizer_type), + '--tensor-model-parallel-size', str(args.target_tensor_parallel_size), + '--pipeline-model-parallel-size', str(args.target_pipeline_parallel_size), + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--no-initialization', + '--save-interval', '1', + '--save', args.save_dir + ] + + if md.make_vocab_size_divisible_by is not None: + sys.argv.extend(['--make-vocab-size-divisible-by', str(md.make_vocab_size_divisible_by)]) + if md.params_dtype == torch.float16: + sys.argv.append('--fp16') + elif md.params_dtype == torch.bfloat16: + sys.argv.append('--bf16') + + if md.output_layer: + sys.argv.append('--untie-embeddings-and-output-weights') + if not md.linear_bias: + sys.argv.append('--disable-bias-linear') + + if md.model_type == 'BERT' and not md.bert_binary_head: + sys.argv.append('--bert-no-binary-head') + + margs = parse_args() + + if hasattr(md, 'checkpoint_args'): + # These are arguments that we are either changing, or cause problems for validation if they are set + # Note that some of these deal with T5 so will need to be changed if we support T5. + args_to_keep = ['tensor_model_parallel_size', 'pipeline_model_parallel_size', 'world_size', 'params_dtype', + 'num_layers_per_virtual_pipeline_stage', 'virtual_pipeline_model_parallel_size', + 'masked_softmax_fusion', 'bias_gelu_fusion', 'bias_dropout_fusion', + 'sequence_parallel', 'async_tensor_model_parallel_allreduce', + 'no_load_optim', 'no_load_rng', 'no_save_optim', 'no_save_rng', + 'vocab_file', 'tokenizer_model', + 'save_interval', 'save', + 'perform_initialization', 'use_cpu_initialization', + 'recompute_granularity', 'recompute_num_layers', 'recompute_method', + 'encoder_num_layers', 'encoder_seq_length', + 'distribute_saved_activations', + 'train_iters', 'lr_decay_iters', 'lr_warmup_iters', 'lr_warmup_fraction', + 'start_weight_decay', 'end_weight_decay', 'bf16', 'fp16'] + + + for arg, value in vars(md.checkpoint_args).items(): + if arg in args_to_keep: + continue + if not hasattr(margs, arg): + print(f"Checkpoint had argument {arg} but new arguments does not have this.") + continue + if getattr(margs, arg) != value: + print(f"Overwriting default {arg} value {getattr(margs, arg)} with value from checkpoint {value}.") + setattr(margs, arg, value) + + validate_args(margs) + + # Use MLM models. + margs.use_legacy_models = True + margs.transformer_impl = args.saver_transformer_impl + + # Do not instantiate Tensorboard + margs.tensorboard_dir = None + + set_global_variables(margs, build_tokenizer=False) + + # margs = megatron args + margs = get_args() + + if hasattr(md, 'consumed_train_samples'): + margs.consumed_train_samples = md.consumed_train_samples + margs.consumed_valid_samples = md.consumed_valid_samples + print(f"Setting consumed_train_samples to {margs.consumed_train_samples}" + f" and consumed_valid_samples to {margs.consumed_valid_samples}") + else: + print("consumed_train_samples not provided.") + + # Determine how to make our models + if md.model_type == 'GPT': + from pretrain_gpt import model_provider + margs.model_type = ModelType.encoder_or_decoder + elif md.model_type == 'BERT': + from pretrain_bert import model_provider + margs.model_type = ModelType.encoder_or_decoder + else: + raise Exception(f'unrecognized model type: {args.model_type}') + + def get_models(count, dtype, pre_process, post_process): + models = [model_provider(pre_process, post_process).to(dtype) for _ in range(count)] + return models + + # fake initializing distributed + mpu.set_tensor_model_parallel_world_size(args.target_tensor_parallel_size) + mpu.set_pipeline_model_parallel_world_size(args.target_pipeline_parallel_size) + mpu.set_tensor_model_parallel_rank(0) + mpu.set_pipeline_model_parallel_rank(0) + fused_kernels.load(margs) + + # Embeddings + # ----------- + embeddings_msg = queue_get("embeddings") + + pos_embed = None + if md.position_embedding_type == 'learned_absolute': + pos_embed = embeddings_msg.pop("position embeddings") + orig_word_embed = embeddings_msg.pop("word embeddings") + check_message(embeddings_msg) + + # Deal with padding + if md.true_vocab_size is not None: + # figure out what our padded vocab size is + orig_vocab_size = orig_word_embed.shape[0] + margs.padded_vocab_size = _vocab_size_with_padding(md.true_vocab_size, margs) + + # Cut out extra padding we don't need + if orig_vocab_size > margs.padded_vocab_size: + full_word_embed = orig_word_embed[0:margs.padded_vocab_size, :] + + # Expanding embedding to larger size by replicating final entry + elif orig_vocab_size < margs.padded_vocab_size: + padding_size = margs.padded_vocab_size - orig_vocab_size + + full_word_embed = torch.cat(( + orig_word_embed, + orig_word_embed[-1].unsqueeze(0).expand(padding_size, -1))) + + # Same size! + else: + full_word_embed = orig_word_embed + else: + print("Original vocab size not specified, leaving embedding table as-is. " + "If you've changed the tensor parallel size this could cause problems.") + margs.padded_vocab_size = orig_word_embed.shape[0] + full_word_embed = orig_word_embed + + # Split into new tensor model parallel sizes + out_word_embed = torch.chunk(full_word_embed, args.target_tensor_parallel_size, dim=0) + + # Make models for first pipeline stage and fill in embeddings + mpu.set_pipeline_model_parallel_rank(0) + post_process = args.target_pipeline_parallel_size == 1 + models = get_models(args.target_tensor_parallel_size, md.params_dtype, True, post_process) + for tp_rank, model in enumerate(models): + model.language_model.embedding.word_embeddings.weight.data.copy_(out_word_embed[tp_rank]) + if pos_embed is not None: + model.language_model.embedding.position_embeddings.weight.data.copy_(pos_embed) + else: + assert not hasattr(model.language_model.embedding, "position_embeddings") + + # Transformer layers + # ------------------- + total_layer_num = 0 + for pp_rank in range(args.target_pipeline_parallel_size): + # For later pipeline parallel ranks, make the new models + if pp_rank > 0: + mpu.set_pipeline_model_parallel_rank(pp_rank) + post_process = pp_rank == args.target_pipeline_parallel_size - 1 + models = get_models(args.target_tensor_parallel_size, md.params_dtype, False, post_process) + + for layer in range(len(models[0].language_model.encoder.layers)): + msg = queue_get(f"transformer layer {total_layer_num}") + + # duplicated tensors + input_norm_weight = msg.pop("input norm weight") + if md.norm_has_bias: + input_norm_bias = msg.pop("input norm bias") + post_norm_weight = msg.pop("post norm weight") + if md.norm_has_bias: + post_norm_bias = msg.pop("post norm bias") + if md.linear_bias: + dense_bias = msg.pop("dense bias") + mlp_l1_bias = msg.pop("mlp l1 bias") + + # Split up the parallel tensors + qkv_weight = torch.chunk(msg.pop("qkv weight"), args.target_tensor_parallel_size, dim=0) + dense_weight = torch.chunk(msg.pop("dense weight"), args.target_tensor_parallel_size, dim=1) + mlp_l1_weight = torch.chunk(msg.pop("mlp l1 weight"), args.target_tensor_parallel_size, dim=1) + + # Special handling for swiglu + if md.swiglu: + mlp_l0_weight_W = torch.chunk(msg.pop("mlp l0 weight W"), args.target_tensor_parallel_size, dim=0) + mlp_l0_weight_V = torch.chunk(msg.pop("mlp l0 weight V"), args.target_tensor_parallel_size, dim=0) + mlp_l0_weight = [torch.cat(weights, dim=0) for weights in zip(mlp_l0_weight_W, mlp_l0_weight_V)] + else: + mlp_l0_weight = torch.chunk(msg.pop("mlp l0 weight"), args.target_tensor_parallel_size, dim=0) + + if md.linear_bias: + qkv_bias = torch.chunk(msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0) + if md.swiglu: + mlp_l0_bias_W = torch.chunk(msg.pop("mlp l0 bias W"), args.target_tensor_parallel_size, dim=0) + mlp_l0_bias_V = torch.chunk(msg.pop("mlp l0 bias V"), args.target_tensor_parallel_size, dim=0) + mlp_l0_bias = [torch.cat(bias, dim=0) for bias in zip(mlp_l0_bias_W, mlp_l0_bias_V)] + else: + mlp_l0_bias = torch.chunk(msg.pop("mlp l0 bias"), args.target_tensor_parallel_size, dim=0) + + # Save them to the model + for tp_rank in range(args.target_tensor_parallel_size): + l = models[tp_rank].language_model.encoder.layers[layer] + l.input_norm.weight.data.copy_(input_norm_weight) + if md.norm_has_bias: + l.input_norm.bias.data.copy_(input_norm_bias) + l.self_attention.query_key_value.weight.data.copy_(qkv_weight[tp_rank]) + l.self_attention.dense.weight.data.copy_(dense_weight[tp_rank]) + l.post_attention_norm.weight.data.copy_(post_norm_weight) + if md.norm_has_bias: + l.post_attention_norm.bias.data.copy_(post_norm_bias) + l.mlp.dense_h_to_4h.weight.data.copy_(mlp_l0_weight[tp_rank]) + l.mlp.dense_4h_to_h.weight.data.copy_(mlp_l1_weight[tp_rank]) + if md.linear_bias: + l.self_attention.query_key_value.bias.data.copy_(qkv_bias[tp_rank]) + l.self_attention.dense.bias.data.copy_(dense_bias) + l.mlp.dense_h_to_4h.bias.data.copy_(mlp_l0_bias[tp_rank]) + l.mlp.dense_4h_to_h.bias.data.copy_(mlp_l1_bias) + + total_layer_num = total_layer_num + 1 + check_message(msg) + + if post_process: + msg = queue_get("final norm") + final_norm_weight = msg.pop("weight") + if md.norm_has_bias: + final_norm_bias = msg.pop("bias") + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].language_model.encoder.final_norm.weight.data.copy_(final_norm_weight) + if md.norm_has_bias: + models[tp_rank].language_model.encoder.final_norm.bias.data.copy_(final_norm_bias) + if pp_rank != 0 and not md.output_layer: + # Copy word embeddings to final pipeline rank + models[tp_rank].word_embeddings.weight.data.copy_(out_word_embed[tp_rank]) + del final_norm_weight + if md.norm_has_bias: + del final_norm_bias + check_message(msg) + + if md.output_layer: + msg = queue_get("output layer") + if not hasattr(models[0].language_model, 'output_layer'): + print("ERROR: got an output layer, but model does not have one") + exit(1) + output_layer_weight = torch.chunk(msg.pop("weight"), args.target_tensor_parallel_size, dim=0) + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].language_model.output_layer.weight.data.copy_(output_layer_weight[tp_rank]) + del output_layer_weight + check_message(msg) + + msg = queue_get() + if msg != "done" and msg["name"] == "pooler": + if not hasattr(models[0].language_model, 'pooler'): + print("ERROR: got a pooler, but model does not have one") + exit(1) + print("received pooler") + pooler_weight = msg.pop("weight") + pooler_bias = msg.pop("bias") + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].language_model.pooler.dense.weight.data.copy_(pooler_weight) + models[tp_rank].language_model.pooler.dense.bias.data.copy_(pooler_bias) + del pooler_weight + del pooler_bias + check_message(msg) + msg = queue_get() + + if msg != "done" and msg["name"] == "lm head": + if not hasattr(models[0], 'lm_head'): + print("ERROR: got an lm head, but model does not have one") + exit(1) + print("received lm head") + lm_head_dense_weight = msg.pop("dense weight") + lm_head_dense_bias = msg.pop("dense bias") + lm_head_norm_weight = msg.pop("norm weight") + if md.norm_has_bias: + lm_head_norm_bias = msg.pop("norm bias") + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].lm_head.dense.weight.data.copy_(lm_head_dense_weight) + models[tp_rank].lm_head.dense.bias.data.copy_(lm_head_dense_bias) + models[tp_rank].lm_head.norm.weight.data.copy_(lm_head_norm_weight) + if md.norm_has_bias: + models[tp_rank].lm_head.norm.bias.data.copy_(lm_head_norm_bias) + check_message(msg) + msg = queue_get() + + if msg != "done" and msg["name"] == "binary head": + if not hasattr(models[0], 'binary_head'): + print("ERROR: got a binary head, but model does not have one") + exit(1) + print("received binary head") + binary_head_weight = msg.pop("weight") + binary_head_bias = msg.pop("bias") + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].binary_head.weight.data.copy_(binary_head_weight) + models[tp_rank].binary_head.bias.data.copy_(binary_head_bias) + check_message(msg) + msg = queue_get() + + if msg != "done": + print("ERROR: got some more data but was expecting to be done") + + for tp_rank in range(args.target_tensor_parallel_size): + mpu.set_tensor_model_parallel_rank(tp_rank) + save_checkpoint(md.iteration, [models[tp_rank]], None, None, + num_floating_point_operations_so_far=0) + print("Done!") diff --git a/tools/checkpoint/setter.py b/tools/checkpoint/setter.py new file mode 100644 index 0000000..5e84cff --- /dev/null +++ b/tools/checkpoint/setter.py @@ -0,0 +1,113 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + + +class ModelSetter: + '''Model parameter setter. + + See convert.py for a full list of supported parameters and their names. + ''' + + @classmethod + def set_tensor(cls, dst, src): + '''Copy (in-place) src tensor to dst tensor.''' + if src is not None: + dst.data.copy_(src) + + @classmethod + def has_position_embeddings(cls, model): + ''' + Return True if learned parameters exist for position embeddings (e.g., + learned absolute), and False otherwise (e.g., RoPE). + ''' + raise NotImplementedError + + @classmethod + def set_embeddings( + cls, + model, + word=None, + pos=None, + ): + '''Set word and position embeddings.''' + raise NotImplementedError + + @classmethod + def set_output_word_embeddings( + cls, + model, + emb=None, + ): + '''Set output word embeddings for final pipeline stage.''' + raise NotImplementedError + + @classmethod + def set_layer( + cls, + model, + layer_idx, + self_attn_norm_weight=None, + self_attn_norm_bias=None, + self_attn_qkv_weight=None, + self_attn_qkv_bias=None, + self_attn_proj_weight=None, + self_attn_proj_bias=None, + mlp_norm_weight=None, + mlp_norm_bias=None, + mlp_fc1_weight=None, + mlp_fc1_bias=None, + mlp_fc2_weight=None, + mlp_fc2_bias=None, + ): + '''Set layer parameters.''' + raise NotImplementedError + + @classmethod + def set_final_norm( + cls, + model, + weight=None, + bias=None, + ): + '''Set final norm parameters (i.e., after last transformer layer).''' + raise NotImplementedError + + @classmethod + def set_output_layer( + cls, + model, + weight=None, + ): + '''Set output (i.e., 'dense') weights.''' + raise NotImplementedError + + @classmethod + def set_pooler( + cls, + model, + weight=None, + bias=None, + ): + '''Set pooler parameters (e.g., for Bert).''' + raise NotImplementedError + + @classmethod + def set_lm_head( + cls, + model, + dense_weight=None, + dense_bias=None, + norm_weight=None, + norm_bias=None, + ): + '''Set LM head parameters.''' + raise NotImplementedError + + @classmethod + def set_binary_head( + cls, + model, + weight=None, + bias=None, + ): + '''Set binary head parameters.''' + raise NotImplementedError diff --git a/tools/checkpoint/utils.py b/tools/checkpoint/utils.py new file mode 100644 index 0000000..a604619 --- /dev/null +++ b/tools/checkpoint/utils.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import psutil + + +def print_memory_usage(key, rank, num_ranks): + '''Print memory usage.''' + process = psutil.Process() + mem_info = process.memory_info() + print("> memory usage: '%s', rank %d / %d, mem %.1f/%.1f gb." % ( + key, + rank, + num_ranks, + mem_info.rss / 1024**3, + 100 * mem_info.rss / process.memory_percent() / 1024**3, + )) + + +def get_mcore_transformer_block_key(model_key): + return { + "GPT" : "decoder", + "BERT" : "encoder", + }[model_key] diff --git a/tools/linter.py b/tools/linter.py new file mode 100644 index 0000000..5b14007 --- /dev/null +++ b/tools/linter.py @@ -0,0 +1,36 @@ +import os +import os.path as osp +import pathlib +import subprocess + + +def recursively_lint_files(): + """Recursively lint all python files in chosen subdirectories of megatron-lm""" + + try: + import autopep8 + except ModuleNotFoundError: + print("Please first install autopep8 via `pip install autopep8`") + return + + # get all python file paths from top level directory + file_dir = str(pathlib.Path(__file__).parent.absolute()) + working_dir = osp.join(file_dir, os.pardir) + all_py_paths = set(os.path.join(working_dir, fname) + for fname in os.listdir(working_dir) if ".py" in fname) + + # get all python file paths from chosen subdirectories + check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks'] + for sub_dir in check_dirs: + for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)): + all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname)) + + print("Linting the following: ") + for py_path in all_py_paths: + print(py_path) + command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path) + subprocess.check_call(command) + + +if __name__ == "__main__": + recursively_lint_files() diff --git a/tools/merge_datasets.py b/tools/merge_datasets.py new file mode 100644 index 0000000..c615558 --- /dev/null +++ b/tools/merge_datasets.py @@ -0,0 +1,93 @@ +import os +import sys +import json +import argparse + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +) + +from megatron.core.datasets.indexed_dataset import ( + IndexedDataset, + IndexedDatasetBuilder, + get_bin_path, + get_idx_path, +) + + +def get_args(): + parser = argparse.ArgumentParser() + + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to directory containing all document files to merge", + ) + + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", + type=str, + required=True, + help="Path to binary output file without suffix", + ) + + group = parser.add_argument_group(title="miscellaneous") + group.add_argument( + "--multimodal", + action="store_true", + help="Whether the datasets are assumed to be multimodal" + ) + + args = parser.parse_args() + + assert os.path.isdir( + args.input + ), f"ERROR: {args.input} is not a directory or does not exist" + + assert os.path.isdir( + os.path.dirname(args.output_prefix) + ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" + + return args + + +def main(): + args = get_args() + + prefixes = set() + for basename in os.listdir(args.input): + prefix, ext = os.path.splitext(basename) + + if prefix in prefixes: + continue + + if not os.path.isfile(os.path.join(args.input, basename)): + continue + + ext_pair = ".bin" if ext == ".idx" else ".idx" + assert os.path.isfile( + os.path.join(args.input, prefix) + ext_pair + ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" + + prefixes.add(prefix) + + builder = None + for prefix in sorted(prefixes): + if builder is None: + dataset = IndexedDataset(os.path.join(args.input, prefix), multimodal=args.multimodal) + builder = IndexedDatasetBuilder( + get_bin_path(args.output_prefix), dtype=dataset.index.dtype, multimodal=args.multimodal + ) + del dataset + + builder.add_index(os.path.join(args.input, prefix)) + + builder.finalize(get_idx_path(args.output_prefix)) + + +if __name__ == '__main__': + + main() diff --git a/tools/openwebtext/README.md b/tools/openwebtext/README.md new file mode 100644 index 0000000..d7707c6 --- /dev/null +++ b/tools/openwebtext/README.md @@ -0,0 +1,59 @@ +The following steps show how to prepare training dataset to train the mode. + +# Libraries to install + +``` + pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract + git clone https://github.com/mattilyra/LSH + cd LSH + python setup.py install +``` + +# Download the dataset + +1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ) +2. Remove blacklisted URLs. +``` +python blacklist_urls.py +``` +3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). + +4. Merge the contents into one loose json file with 1 json per newline of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique. + +# Prepare the data for GPT training: + +1. Perform ftfy, english detection and remove documents with less than 128 tokens. This step can be sharded and run on shards. +``` +python cleanup_dataset.py +``` +Additional cleanup (e.g. remove documents less than 512 characters or dataset specific cleaning like stories, realnews datasets) can be done using `cleanup_fix_dataset.py`. More details can be found by running `python cleanup_fix_dataset.py --help`. +2. Using LSH, find possible duplicates and store then in a file for later processing. The code supports saving and loading fingerprints for recurrent deduplications, and is also multithreaded for faster processing. More details are can be found by `python find_duplicate.py --help`. +``` +python find_duplicates.py --inputs --output +``` +3. Based on similarity measure defind inside function `is_similar` (default: 0.9), group urls that are similar. Basically, for each group, only one url we should keep and remove the rest. +``` +python group_duplicate_urls.py +``` +4. Remove similar documents that were detected in the last step. +``` +python remove_group_duplicates.py +``` + +5. Shuffle the dataset. +``` +shuf -o train_data.json +``` + +# Deduplicating ngrams + +To deduplicate the downstream tasks (e.g. lambada, squad) from the training dataset, we run the following command. + +``` +python filter_ngrams.py --tasks --dedup-dataset --output +``` +We use 13-grams by default for the deduplication. When we find a 13-gram match in a training document, we split the document into two pieces and remove the 13-gram along with 200 characters from the both side of the 13-gram. We also remove any splitted document with less than 200 characters or if a document got splitted more than 10 times. These parameters can be changed using corresponding arguments. + +Only for the lambada task, we need to provide the path, `--lambada-path `. + +Several other features (e.g. save and load dictionary) have been added, look at `python filter_ngrams.py --help` for details. diff --git a/tools/openwebtext/add_id.py b/tools/openwebtext/add_id.py new file mode 100644 index 0000000..7bea7ee --- /dev/null +++ b/tools/openwebtext/add_id.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import argparse +import json +import os +import time + +""" +This code adds id to each json object in a json file. User can add prefix +to the ids. +""" + +if __name__ == '__main__': + + print('parsing the arguments ...') + + parser = argparse.ArgumentParser() + parser.add_argument('--input-file', type=str, default=None, help='Input'\ + ' json file where id needs to be added') + parser.add_argument('--output-file', type=str, default=None, help=\ + 'Output file name with id') + parser.add_argument('--id-prefix', type=str, default=None, help=\ + 'Id prefix') + parser.add_argument('--log-interval', type=int, default=100, + help='Log interval') + args = parser.parse_args() + + print('Adding ids to dataset ...') + + f_input = open(args.input_file, 'r', encoding='utf-8') + f_output = open(args.output_file, 'wb') + + unique_ids = 1 + start_time = time.time() + for row in f_input: + each_row = json.loads(row) + adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) + each_row['adlr_id'] = adlr_id_string + myjson = json.dumps(each_row, ensure_ascii=False) + + f_output.write(myjson.encode('utf-8')) + f_output.write('\n'.encode('utf-8')) + + if unique_ids % args.log_interval == 0: + print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ + unique_ids, time.time() - start_time), flush=True) + + unique_ids += 1 + + # Close the file. + f_input.close() + f_output.close() + + print('done :-)', flush=True) diff --git a/tools/openwebtext/blacklist_urls.py b/tools/openwebtext/blacklist_urls.py new file mode 100644 index 0000000..f54f661 --- /dev/null +++ b/tools/openwebtext/blacklist_urls.py @@ -0,0 +1,302 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +# WARNING! This file contains a blacklist of known malicious sites and thus contains some NSFW language. + + +import glob +import re +import time +import tldextract +import sys + + +# List of the domains to blacklist. +domain_blacklist = set([ + '500px', + 'aapks', + 'akamaihd', + 'amazon', + 'apple', + 'artifactfire', + 'artstation', + 'awwni', + 'bandcamp', + 'battleforthenet', + 'coinscalendar', + 'dailymotion', + 'deviantart', + 'discord', + 'discordapp', + 'dlapkandroid', + 'dropbox', + 'e621', + 'ebay', + 'edealinfo', + 'erome', + 'eroshare', + 'explosm', + 'facebook', + 'fbcdn', + 'flickr', + 'furaffinity', + 'futhead', + 'gatopardo', + 'gfycat', + 'gifsound', + 'gifsoup', + 'giphy', + 'github', + 'google', + 'gunprime', + 'gyazo', + 'horsefucker', + 'hotdealstar', + 'imagefap', + 'imageshack', + 'imgflip', + 'imgur', + 'instagram', + 'karmadecay', + 'kryptocal', + 'kym-cdn', + 'liveleak', + 'livememe', + 'lmgtfy', + 'magaimg', + 'memegenerator', + 'minorplanetcenter', + 'minus', + 'mobafire', + 'morejpeg', + 'nocookie', + 'pcpartpicker', + 'photobucket', + 'pinimg', + 'pinterest', + 'pixiv', + 'pornhub', + 'prntscr', + 'puu', + 'qkme', + 'quickmeme', + 'radd', + 'redd', + 'reddit', + 'reddit-stream', + 'redditlog', + 'redditmedia', + 'reddituploads', + 'redtube', + 'reupp', + 'reverb', + 'roanoke', + 'rollingstone', + 'sli', + 'soundcloud', + 'soundgasm', + 'spankbang', + 'spotify', + 'strawpoll', + 'streamable', + 'timeanddate', + 'tinypic', + 'touhouradio', + 'tumblr', + 'twimg', + 'twitch', + 'twitter', + 'vid', + 'vimeo', + 'vine', + 'vkaao', + 'vocaroo', + 'voyagefusion', + 'walmart', + 'wciu', + 'wikimedia', + 'wikipedia', + 'xhamster', + 'xkcd', + 'xvideos', + 'youtu', + 'youtube', + 'youtubedoubler', + 'ytimg', + 'zillexplorer', +]) + +def domain_is_in_blacklist(url): + domain = tldextract.extract(url).domain + return domain in domain_blacklist + + +# List of extentions to blacklist. +extentions_blacklist = ( + '.3gp', + '.7z' + '.ai', + '.aif', + '.apk', + '.app', + '.avi', + '.bin', + '.bmp', + '.bz2', + '.css', + '.csv', + '.dat', + '.deb', + '.dmg', + '.doc', + '.docx', + '.exe', + '.gif', + '.gifv', + '.gz', + '.iso', + '.jar', + '.jpeg', + '.jpg', + '.js', + '.log', + '.mid', + '.midi', + '.mkv', + '.mov', + '.mp3', + '.mp4', + '.mpeg', + '.mpg', + '.ogg', + '.ogv', + '.otf', + '.pdf', + '.pkg', + '.png', + '.pps', + '.ppt', + '.pptx', + '.psd', + '.py', + '.qt', + '.ram', + '.rar', + '.sql', + '.svg', + '.swf', + '.tar.gz', + '.tar', + '.tgz', + '.tiff', + '.ttf', + '.txt', + '.wav', + '.webm', + '.wma', + '.wmv', + '.xls', + '.xlsx', + '.xml', + '.xz', + '.zip', +) + +def extention_is_in_blacklist(url): + if url.split('?')[0].lower().endswith(extentions_blacklist): + return True + return False + + +# Malformed urls. +# This function is adapted from: +# https://stackoverflow.com/questions/7160737/python-how-to-validate-a-url-in-python-malformed-or-not +url_regex = re.compile( + r'^(?:http)s?://' # http:// or https:// + r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain... + r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip + r'(?::\d+)?' # optional port + r'(?:/?|[/?]\S+)$', re.IGNORECASE) +def url_is_malformed(url): + return re.match(url_regex, url) is None + + +def print_progress(prefix, start_time, urls_counter, + domain_blacklist_counter, + extention_blacklist_counter, + short_url_counter, malformed_url_counter, + duplicate_url_counter): + string = prefix + ' | ' + string += 'time elapsed (s): {:.2f} | '.format(time.time() - start_time) + string += 'number of urls: {} | '.format(urls_counter) + string += 'domain blacklisted: {} | '.format(domain_blacklist_counter) + string += 'extention blacklisted: {} | '.format(extention_blacklist_counter) + string += 'short urls (<=8): {} | '.format(short_url_counter) + string += 'malformed urls: {} | '.format(malformed_url_counter) + string += 'duplicate urls: {}'.format(duplicate_url_counter) + print(string, flush=True) + + +if __name__ == '__main__': + + + print('remove blacklisted urls ..') + + # Path to the url files. + path = sys.argv[1] + # Output url file. + output = sys.argv[2] + + # Get the list of url files. + files = glob.glob(path + '/*.txt') + print('> found {} files'.format(len(files))) + + urls = set() + urls_counter = 0 + domain_blacklist_counter = 0 + extention_blacklist_counter = 0 + short_url_counter = 0 + malformed_url_counter = 0 + duplicate_url_counter = 0 + start_time = time.time() + for filename in files: + with open(filename, 'r') as f: + for line in f: + url = line.strip() + urls_counter += 1 + if domain_is_in_blacklist(url): + print('[DOMAIN BLACKLIST]: {}'.format(url), flush=True) + domain_blacklist_counter += 1 + elif extention_is_in_blacklist(url): + print('[EXTENTION BLACKLIST]: {}'.format(url), flush=True) + extention_blacklist_counter += 1 + elif len(url) <= 8: + print('[SHORT URL]: {}'.format(url), flush=True) + short_url_counter += 1 + elif url_is_malformed(url): + print('[MALFORMED URL]: {}'.format(url), flush=True) + malformed_url_counter += 1 + elif url in urls: + print('[DUPLICATE URL]: {}'.format(url), flush=True) + duplicate_url_counter += 1 + else: + urls.add(url) + if urls_counter % 100000 == 0: + print_progress('PROGRESS', start_time, urls_counter, + domain_blacklist_counter, + extention_blacklist_counter, + short_url_counter, malformed_url_counter, + duplicate_url_counter) + + print_progress('FINAL', start_time, urls_counter, + domain_blacklist_counter, + extention_blacklist_counter, + short_url_counter, malformed_url_counter, + duplicate_url_counter) + + # Write the final set of urls. + print('> writing cleaned up url list to {}'.format(output)) + with open(output, 'w') as f: + for url in urls: + f.write(url + '\n') + + print('done :-)') diff --git a/tools/openwebtext/cleanup_dataset.py b/tools/openwebtext/cleanup_dataset.py new file mode 100644 index 0000000..3a2eba4 --- /dev/null +++ b/tools/openwebtext/cleanup_dataset.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + + +import ftfy +import json +from langdetect import detect +import numpy as np +import time +import os +import sys + +from tokenizer import Tokenizer + +MIN_DOCUMENT_LENGHT = 128 + + +def print_progress(prefix, start_time, num_docs, num_fixed_text, + num_non_english_docs, chars_non_english_docs, + num_small_docs, chars_small_docs): + + string = prefix + ' | ' + string += 'elapsed time: {:.2f} | '.format(time.time() - start_time) + string += 'documents: {} | '.format(num_docs) + string += 'fixed text: {} | '.format(num_fixed_text) + string += 'non-english: {} | '.format(num_non_english_docs) + string += 'non-english chars: {} | '.format(chars_non_english_docs) + string += 'small docs: {} | '.format(num_small_docs) + string += 'small docs chars: {}'.format(chars_small_docs) + print(string, flush=True) + + +def filter_corpus(filename, out_filename, print_interval=10000): + + print(' > filtering {}'.format(filename)) + + tokenizer = Tokenizer(cache_dir='./cache') + + num_docs = 0 + num_written_docs = 0 + num_small_docs = 0 + num_fixed_text = 0 + num_non_english_docs = 0 + chars_non_english_docs = 0 + chars_small_docs = 0 + start_time = time.time() + with open(out_filename, 'wb') as f: + with open(filename, 'r') as fin: + for line in fin: + try: + num_docs += 1 + myjson = json.loads(line) + # Fix text + text = ftfy.fix_text(myjson['text']) + if text != myjson['text']: + num_fixed_text += 1 + myjson['text'] = text + # Detect language. + if detect(text) != 'en': + print('[non-english text]', myjson) + num_non_english_docs += 1 + chars_non_english_docs += len(text) + continue + # On average each token is 5 characters so 8 is an + # upper bound. + if len(text) < (8 * MIN_DOCUMENT_LENGHT): + tokens = tokenizer.tokenize_document(text) + if len(tokens) < MIN_DOCUMENT_LENGHT: + print('[small document, skipping]:', myjson) + num_small_docs += 1 + chars_small_docs += len(text) + continue + myjson = json.dumps(myjson, ensure_ascii=False) + f.write(myjson.encode('utf-8')) + f.write('\n'.encode('utf-8')) + num_written_docs += 1 + if num_docs % print_interval == 0: + print_progress('[PROGRESS]', start_time, num_docs, + num_fixed_text, num_non_english_docs, + chars_non_english_docs, + num_small_docs, chars_small_docs) + except Exception as e: + print(' skipping ', line, e) + + print_progress('[FINAL]', start_time, num_docs, + num_fixed_text, num_non_english_docs, + chars_non_english_docs, + num_small_docs, chars_small_docs) + + +if __name__ == '__main__': + + print('building gpt2 dataset ...') + + input_filename = sys.argv[1] + output_filename = sys.argv[2] + + print('will be reading {}'.format(input_filename)) + print('and will write the results to {}'.format(output_filename)) + + filter_corpus(input_filename, output_filename) + + diff --git a/tools/openwebtext/cleanup_fix_dataset.py b/tools/openwebtext/cleanup_fix_dataset.py new file mode 100644 index 0000000..c7f6cf2 --- /dev/null +++ b/tools/openwebtext/cleanup_fix_dataset.py @@ -0,0 +1,178 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +""" +Filter and clean documents: +Capable to clean docs with less than 512 characters, less than +256 characters and contains javascript, fix text and dataset specific +cleaning like stories and realnews datasets. +Program arguments have the details. +""" + +import argparse +from functools import partial +import glob +import ftfy +import json +from langdetect import detect +import multiprocessing +import os +from pathlib import Path +import re +import time + +def process_doc(json_line, args): + + # Read the line. + document = json.loads(json_line) + text = document['text'] + + output = {'remove_512': False, 'remove_256_javascript': False, \ + 'remove_512_non_english': False, 'ftfy_fix_text': False, \ + 'general_cleaning': False} + + try: + # Reomove all docs with less than 512 characters + if "remove_512" in args.tasks: + if len(text) < 512: + output['remove_512'] = True + return output, text, document, True + + # Remove docs if less than 256 character length and contains Javascript + if "remove_256_javascript" in args.tasks: + if len(text) < 256 and 'javascript' in text.lower(): + output['remove_256_javascript'] = True + return output, text, document, True + + # Remove docs < 512 and nonenglish + if "remove_512_non_english" in args.tasks: + if len(text) < 512 and detect(text) != 'en': + output['remove_512_non_english'] = True + return output, text, document, True + + # Fix the text using ftfy, don't remove the text, hence return False + if "ftfy_fix_text" in args.tasks: + fixed_text = ftfy.fix_text(text) + output['ftfy_fix_text'] = True + return output, fixed_text, document, False + + # Cleaning extra spaces and newlines + if "general_cleaning" in args.tasks: + cleaned_text = re.sub(r" +|\b\n+ |\b\n+", " ", text) + #cleaned_text = re.sub(r"\n\n+", "\n\n", text) # used this for Gutenberg dataset + #cleaned_text = re.sub(r"\n", "\n\n", text) # Used this for realnews + + # stories datasets + #cleaned_text = re.sub(r" \'", "'", text) + #cleaned_text = re.sub(r" \!", "!", cleaned_text) + #cleaned_text = re.sub(r" \.", ".", cleaned_text) + #cleaned_text = re.sub(r" \?", "?", cleaned_text) + #cleaned_text = re.sub(r" - ", "-", cleaned_text) + ##cleaned_text = re.sub(r"\" ", "\"", cleaned_text) + #cleaned_text = re.sub(r" @ ", "@", cleaned_text) + + output['general_cleaning'] = True + return output, cleaned_text, document, False + + except Exception as e: + print('Error: *************************\n{}\ntext: {}'.format(e, \ + text), flush=True) + return output, text, document, True + + # don't remove + return output, text, document, False + + +def process_set(args, input_file, output_f_cleaned, output_f_filtered): + + print(' > working on {} ...'.format(input_file), flush=True) + + num_docs = num_remove_512 = num_remove_java = num_remove_512_non_english \ + = num_ftfy_fix_text = num_general_cleaning = 0 + + # Output file and counters. + output_cleaned = open(output_f_cleaned, 'wb') + output_filtered = open(output_f_filtered, 'wb') + + start_time = time.time() + + # Setup multi-processing. + num_workers = 40 + fin = open(input_file, 'r', encoding='utf-8') + pool = multiprocessing.Pool(num_workers) + process_doc_partial = partial(process_doc, args=args) + processed_docs = pool.imap(process_doc_partial, fin, 500) + + # Process documents. + for output, text, document, to_filter in processed_docs: + num_docs += 1 + + num_remove_512 += 1 if output['remove_512'] else 0 + num_remove_java += 1 if output['remove_256_javascript'] else 0 + num_remove_512_non_english += 1 if output['remove_512_non_english'] \ + else 0 + num_ftfy_fix_text += 1 if output['ftfy_fix_text'] else 0 + num_general_cleaning += 1 if output['general_cleaning'] else 0 + + document['text'] = text + myjson = json.dumps(document, ensure_ascii=False) + + if to_filter: + output_filtered.write(myjson.encode('utf-8')) + output_filtered.write('\n'.encode('utf-8')) + else: + output_cleaned.write(myjson.encode('utf-8')) + output_cleaned.write('\n'.encode('utf-8')) + + if num_docs % args.log_interval == 0: + print(' processed {:9d} documents in {:.2f} seconds ...'.format( + num_docs, time.time() - start_time), flush=True) + + # Close the file. + output_cleaned.close() + output_filtered.close() + fin.close() + + # Print stats. + print(' >> total docs: {} remove_512 {} remove_256_javascript {} '\ + 'remove_512_non_english {} ftfy_fix_text {} general_cleaning {}'.\ + format(num_docs, num_remove_512, num_remove_java,\ + num_remove_512_non_english, num_ftfy_fix_text, \ + num_general_cleaning), flush=True) + +if __name__ == '__main__': + + + print('parsing the arguments ...') + + parser = argparse.ArgumentParser() + parser.add_argument('--input-files', nargs = '*', required=True, default=\ + None, help = 'Input json files that needs to be'\ + ' cleaned') + parser.add_argument('--tasks', nargs = '*', required=True, default=None,\ + help = 'Tasks to perform on the input files, ' \ + 'such as remove_512, remove_256_javascript, ' \ + 'remove_512_non_english, ftfy_fix_text, and ' \ + 'general_cleaning. 256 or 512 means the number' \ + ' of characters.') + + parser.add_argument('--output-path', type=str, default=None, + help='Directory where the output should go') + parser.add_argument('--log-interval', type=int, default=100, + help='Log interval') + + args = parser.parse_args() + + print('cleanup dataset ...') + + for input_file in args.input_files: + input_filename, input_filename_ext = os.path.splitext(Path(input_file)\ + .name) + + output_f_cleaned = os.path.join(args.output_path, input_filename + \ + "_cleaned" + input_filename_ext) + output_f_filtered = os.path.join(args.output_path, input_filename + \ + "_filtered" + input_filename_ext) + + process_set(args, input_file, output_f_cleaned, output_f_filtered) + + print('done :-)', flush=True) diff --git a/tools/openwebtext/filter_ngrams.py b/tools/openwebtext/filter_ngrams.py new file mode 100644 index 0000000..7327a16 --- /dev/null +++ b/tools/openwebtext/filter_ngrams.py @@ -0,0 +1,479 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +""" +Deduplicate downstream tasks from training dataset. 13-grams have been used. +All split documents with less than 200 characters got filtered. Any document +with more than 10 splits got filtered as well. +""" + +import argparse +from functools import partial +import json +import multiprocessing +import nltk +import pickle +import re +import string +import sys +import time + +def get_words(text): + # get all the lowercase words from text + words, positions = [], [] + for match in re.finditer(r'\w+', text.lower()): + words.append(match.group(0)) + positions.append(match.start()) + return words, positions + +# splits the text +def split_text(text, start_position, remove_char_each_side, seq): + # first part of the text + punctuations = ".!?" + pos = start_position - remove_char_each_side + text_first = "" + while pos > 0 and not text[pos] in punctuations: + pos -= 1 + if pos > 0: + text_first = text[0:pos+1] + + # add length of seq and remove_char_each_side + pos = start_position + len(seq) + remove_char_each_side + + # last part of the text + text_second = "" + while pos < len(text) and not text[pos] in punctuations: + pos += 1 + if pos + 1 < len(text): + text_second = text[pos+1:len(text)] + + return text_first, text_second + +def check_and_clean_text(args, words, ngrams, text, start_position, \ + text_buf_ngram_free, text_buf, local_ngram): + + seq = " ".join(words) + if seq in ngrams: + print(" [matched]: {}".format(seq), flush=True) + + if args.get_ngram_freq_only: + # increase freq of this seq and then only consider the later part + # of the text for further processing + if seq in local_ngram: + local_ngram[seq] += 1 + else: + local_ngram[seq] = 1 + #print(" [increased]: {} {}".format(seq, ngrams[seq]), flush=True) + if (start_position + len(seq) + 1) < len(text): + text_buf.append(text[start_position + len(seq) + 1:len(text)]) + return False + + # split the text + text_first, text_second = split_text(text, start_position, \ + args.remove_char_each_side, seq) + + # first part of ngrams free + if len(text_first) > args.filter_text_char_len: + text_buf_ngram_free.append(text_first) + + # add second part for further processing + if len(text_second) > args.filter_text_char_len: + text_buf.append(text_second) + + return False # not ngram free + + # ngram free + return True + + +def free_ngram(line, args, key, ngrams, ngrams_freq_sorted): + # remove all the ngrams + + try: + myjson = json.loads(line) + text_buf = [myjson[key]] + except Exception as e: + print("Error: {}".format(e), flush=True) + text_buf = [] + + text_buf_ngram_free = [] + local_ngram = {} + while len(text_buf) > 0: + + # get the first one from the buffer + text = text_buf.pop(0) + words, positions = get_words(text) + + ngram_free = True + # find each max n-grams and check dictionary + for i in range(len(words) - args.max_ngram_size + 1): + check_ngram_free = check_and_clean_text(args, words[i:\ + i+args.max_ngram_size], ngrams, text, positions[i], \ + text_buf_ngram_free, text_buf, local_ngram) + + # the seq is ngram free? if yes, break + if not check_ngram_free: + ngram_free = False + break + + # if max ngrams doesn't match, check if any other lower n-grams + # within max ngram macthes + for ngram_len, _ in ngrams_freq_sorted: + check_ngram_free = check_and_clean_text(args, words[i:\ + i+ngram_len], ngrams, text, positions[i], \ + text_buf_ngram_free, text_buf, local_ngram) + + # same check as above + if not check_ngram_free: + ngram_free = False + break + + # check break from lower than max ngram loop above + if not ngram_free: + break + + # for the last max n-gram, check all the lower ngrams in it + if ngram_free and len(words) - args.max_ngram_size > 0: + # get the last words of the lax max ngram + last_seq_words = words[(len(words)-args.max_ngram_size):len(words)] + last_seq_start_position = len(words) - args.max_ngram_size + + # check all n-grams lower than the max + for pos, (ngram_len, _) in enumerate(ngrams_freq_sorted): + + # ignore the max ngram as has been considered already + if ngram_len == args.max_ngram_size: + continue + + # find each ngram of ngram_len in max n-grams and check + for i in range(len(last_seq_words) - ngram_len + 1): + check_ngram_free = check_and_clean_text(args, \ + last_seq_words[i:i+ngram_len], ngrams, text,\ + positions[last_seq_start_position+i], \ + text_buf_ngram_free, text_buf, local_ngram) + + if not check_ngram_free: + ngram_free = False + break + + if not ngram_free: + break + + # texts are ngram free + if ngram_free and not args.get_ngram_freq_only: + text_buf_ngram_free.append(text) + + # check if the text has only been trimmed + trimmed = 0 + if not args.get_ngram_freq_only and len(text_buf_ngram_free) == 1 and \ + len(text_buf_ngram_free[0]) < len(myjson[key]): + trimmed = 1 + + return text_buf_ngram_free, trimmed, myjson, local_ngram + +# insert word sequence into dictionary +def insert_dict(words, ngrams, pos): + seq = " ".join(words) + if seq not in ngrams: + ngrams[seq] = 0 + #ngrams[seq] = pos + +# insert each ngram from text into the ngrams dictionary +def compute_ngrams_insert_dict(args, text, ngrams): + words, positions = get_words(text) + if len(words) < args.min_ngram_size: + return + + if len(words) < args.max_ngram_size: + insert_dict(words, ngrams, positions[0]) + + for i in range(len(words) - args.max_ngram_size+1): + insert_dict(words[i:i+args.max_ngram_size], ngrams, positions[i]) + + +# Build ngrams for the lambada dataset +def process_task_lambda(args, task_file, ngrams): + print(' reading from {} and computing ngrams'.format(task_file)) + with open(task_file, 'r') as f: + for line in f: + try: + myjson = json.loads(line) + text = myjson['text'] + compute_ngrams_insert_dict(args, text, ngrams) + except Exception as e: + print('Error:', e) + print(" Entities in ngrams {}".format(len(ngrams)), flush=True) + + +# Build ngrams for the dataset of the given task +def process_task(args, task_name, ngrams): + + print(' reading from {} and computing ngrams'.format('import datasets')) + print(" Current entities in ngrams {}".format(len(ngrams)), flush=True) + # using validation/test data from datasets + from datasets import load_dataset + + entities_in_ngrams = len(ngrams) + + # load the dataset + if task_name == 'squad': + dataset = load_dataset('squad_v2', split='validation') + elif task_name == 'natural_questions': + dataset = load_dataset('natural_questions', split='validation') + elif task_name == 'triviaqa': + dataset = load_dataset('trivia_qa', 'unfiltered', split='test') + elif task_name == 'webqa': + dataset = load_dataset('web_questions', split='test') + elif task_name == 'race': + dataset = load_dataset('race', 'all', split='test') + elif task_name == 'drop': + dataset = load_dataset('drop', split='validation') + elif task_name == 'coqa': + dataset = load_dataset('coqa', split='validation') + elif task_name == 'piqa': + dataset = load_dataset('piqa', split='test') + else: + print("Invalid task name: {}".format(task_name), flush=True) + return + + # read the dataset and add to ngrams + for line in dataset: + try: + if task_name in ['squad', 'triviaqa', 'webqa', 'race', 'drop']: + text = line['question'] + compute_ngrams_insert_dict(args, text, ngrams) + elif task_name == 'natural_questions': + text = line['question']['text'] + compute_ngrams_insert_dict(args, text, ngrams) + elif task_name == 'coqa': + all_questions = line['questions'] + for question in all_questions: + compute_ngrams_insert_dict(args, question, ngrams) + elif task_name == 'piqa': + text = line['goal'] + compute_ngrams_insert_dict(args, text, ngrams) + except Exception as e: + print('Error:', e) + + print(" After task {} entities in ngrams {}, added {}".format(task_name, \ + len(ngrams), len(ngrams) - entities_in_ngrams), flush=True) + +def compute_tasks_ngrams(args, ngrams): + start_time = time.time() + for _, task_name in enumerate(args.tasks): + print('Task: {}'.format(task_name), flush=True) + if task_name == 'lambada': + assert args.lambada_path is not None + process_task_lambda(args, args.lambada_path, ngrams) + else: + process_task(args, task_name, ngrams) + print(" Taken time to compute ngrams {:.2f}".format(time.time() - \ + start_time), flush=True) + +def compute_ngram_freq_sorted(args, ngrams): + ngrams_freq = {} + for ngram_key in ngrams.keys(): + length = len(ngram_key.split()) + ngrams_freq[length] = ngrams_freq[length] + 1 if length in \ + ngrams_freq else 1 + + ngrams_freq_sorted = sorted(ngrams_freq.items(), key=lambda item: item[0]) + print(" Ngram frequencies: {}".format(ngrams_freq_sorted), flush=True) + print(" Entities in ngrams {} min_ngram_size {} max_ngram_size {}".format(\ + len(ngrams), ngrams_freq_sorted[0][0], ngrams_freq_sorted[len(\ + ngrams_freq_sorted) -1 ][0]), flush=True) + return ngrams_freq_sorted + +def get_ngrams_below_threshold(args, ngrams, ngrams_below_threshold, \ + dedup_file, dedup_key, ngrams_freq_sorted): + + start_time = time.time() + # get the ngrams frequency + args.get_ngram_freq_only = True + + # Open the large file to process in parallel + num_workers = args.num_threads + pool = multiprocessing.Pool(num_workers) + fin = open(dedup_file, 'r', encoding='utf-8') + free_ngram_abt_partial=partial(free_ngram, args=args, key=dedup_key, \ + ngrams=ngrams, ngrams_freq_sorted=ngrams_freq_sorted) + free_ngrams_abt = pool.imap(free_ngram_abt_partial, fin, 500) + + counter = 0 + for _, _, _, local_ngram in free_ngrams_abt: + counter += 1 + if counter % 1000 == 0: + print(' [compute_stat]> processed {} documents in {:.2f} seconds ...'. + format(counter, time.time() - start_time), flush=True) + for local_key in local_ngram: + if local_key in ngrams: + ngrams[local_key] += 1 + local_ngram = {} + + print(' Time taken to compute statistics {:.2f} seconds'.format(time.time() - \ + start_time), flush=True) + pool.close() + pool.join() + + start_time = time.time() + counter_threshold = 0 + # Get ngram below theadhold + for local_key, local_val in ngrams.items(): + if ngrams[local_key] < args.key_threshold: + print(" [threshold] {} {}".format(local_key, local_val), flush=True) + counter_threshold += 1 + ngrams_below_threshold[local_key] = 1 + + print(' Ngrams below threshold {}'.format(counter_threshold), flush=True) + fin.close() + +def clean_ngrams_below_threshold(args, ngrams_below_threshold, dedup_file, \ + dedup_key): + + start_time = time.time() + # Now actually filter the dataset + args.get_ngram_freq_only = False + #id_prefix = '-'.join(args.tasks[::2]) + id_prefix = '-'.join(args.tasks[::1]) + + # get the range of the size of the ngrams + ngrams_freq_sorted = compute_ngram_freq_sorted(args, ngrams_below_threshold) + + # Open the large file to process in parallel + counter = splitted = ignored = split_mt_thld = trimmed_count = 0 + num_workers = args.num_threads + pool = multiprocessing.Pool(num_workers) + fin = open(dedup_file, 'r', encoding='utf-8') + free_ngram_clean_partial=partial(free_ngram, args=args, key=dedup_key, \ + ngrams=ngrams_below_threshold, ngrams_freq_sorted=ngrams_freq_sorted) + free_ngrams_clean = pool.imap(free_ngram_clean_partial, fin, 500) + + out_f = open(args.output, 'wb') + + for text_buf_ngram_free, trimmed, myjson, _ in free_ngrams_clean: + counter += 1 + try: + + trimmed_count += trimmed + + if len(text_buf_ngram_free) > 1: + splitted += 1 + if len(text_buf_ngram_free) == 0: + ignored += 1 + # more than 10 splits ignored + if len(text_buf_ngram_free) > args.splits_count: + text_buf_ngram_free = [] + split_mt_thld += 1 + + if args.output is not None: + if "split_id" in myjson: + use_prefix = myjson["split_id"] + "-" + else: + use_prefix = "" + + for i in range(len(text_buf_ngram_free)): + split_id_string = id_prefix + '-{:010d}'.format(int(\ + counter)) + '-{:04d}'.format(int(i)) + myjson[dedup_key] = text_buf_ngram_free[i] + myjson["split_id"] = use_prefix + split_id_string + outjson = json.dumps(myjson, ensure_ascii=False) + #outjson = json.dumps({"text":text_buf_ngram_free[i], + # id_prefix+"_split_id":split_id_string}, + # ensure_ascii=False) + out_f.write(outjson.encode('utf-8')) + out_f.write('\n'.encode('utf-8')) + + if counter % 1000 == 0: + print(' [final]> processed {} documents in {:.2f} seconds ...'. + format(counter, time.time() - start_time), flush=True) + except Exception as e: + print('Error:', e) + + print(' [final]> processed {} documents in {:.2f} seconds ...'. + format(counter, time.time() - start_time), flush=True) + + print(' Total docs {} splitted {} ignored {} splits > theshold {} trimmed'\ + ' {}'.format(counter, splitted, ignored, split_mt_thld, trimmed_count)\ + , flush=True) + + pool.close() + pool.join() + + out_f.close() + fin.close() + +if __name__ == '__main__': + + # we use 13-grams, any text less than 200 characters got removed + # any text splitted more than 10 got removed as well + + print('parsing the arguments ...') + + parser = argparse.ArgumentParser() + parser.add_argument('--tasks', nargs = '*', required=True, default=None, \ + help = 'Tasks to use for deduplication: currently ' + ' suuport [lambada, squad, natural_questions,' + ' triviaqa, webqa, race, drop, coqa, and piqa]') + parser.add_argument('--lambada-path', type=str, default=None, + help='Only Lambada task needs the path') + parser.add_argument('--dedup-dataset', nargs = '*', default=None, + help='Dataset to deduplicate with the key to use' + ' e.g. cc.json text') + parser.add_argument('--output', type=str, default=None, + help='Output file name to save dedup dataset') + parser.add_argument('--num-threads', type=int, default=40, + help='Number of threads to use') + # Default dedup values + parser.add_argument('--max-ngram-size', type=int, default=13, + help='Maximum size of ngram to use.') + parser.add_argument('--min-ngram-size', type=int, default=8, + help='Minimum size of ngram to use.') + parser.add_argument('--filter-text-char-len', type=int, default=200, + help='Remove any text below this length.') + parser.add_argument('--key-threshold', type=int, default=10, + help='Number of keys to consider as threshold') + parser.add_argument('--save-dictionary', type=str, default=None, + help='Save the dictionary') + parser.add_argument('--load-dictionary', type=str, default=None, + help='Load the dictionary') + parser.add_argument('--splits-count', type=int, default=10, + help='Remove any documents more than this many splits') + parser.add_argument('--remove-char-each-side', type=int, default=200, + help='Maximum size of ngram to use.') + + args = parser.parse_args() + + assert len(args.dedup_dataset) == 2 + dedup_file = args.dedup_dataset[0] + dedup_key = args.dedup_dataset[1] + + # Setup multi-processing + num_workers = args.num_threads + if args.load_dictionary is None: + + # Build ngrams + ngrams = {} + compute_tasks_ngrams(args, ngrams) + + # get the range of the size of the ngrams + ngrams_freq_sorted = compute_ngram_freq_sorted(args, ngrams) + + # get ngram freq from large file in parallel + # get ngrams below threshold + ngrams_below_threshold = {} + get_ngrams_below_threshold(args, ngrams, ngrams_below_threshold, \ + dedup_file, dedup_key, ngrams_freq_sorted) + + # save the dictionary if needed + if args.save_dictionary is not None: + with open(args.save_dictionary, 'wb') as save_dict_handle: + pickle.dump(ngrams_below_threshold, save_dict_handle) + else: + with open(args.load_dictionary, 'rb') as load_dict_handle: + ngrams_below_threshold = pickle.load(load_dict_handle) + + # filter the large file + if args.output is not None: + clean_ngrams_below_threshold(args, ngrams_below_threshold, \ + dedup_file, dedup_key) + + print('done :-)') diff --git a/tools/openwebtext/find_duplicates.py b/tools/openwebtext/find_duplicates.py new file mode 100644 index 0000000..7f6cb57 --- /dev/null +++ b/tools/openwebtext/find_duplicates.py @@ -0,0 +1,292 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import argparse +from functools import partial +import itertools +import json +from lsh import cache, minhash +import multiprocessing +import numpy as np +import time +import pickle +import sys +import os + +# This function is adapted from: +# https://github.com/mattilyra/LSH/blob/master/examples/Introduction.ipynb +def shingles(text, char_ngram=5): + return set(text[head:head + char_ngram] + for head in range(0, len(text) - char_ngram)) + + +# This function is adapted from: +# https://github.com/mattilyra/LSH/blob/master/examples/Introduction.ipynb +def jaccard(set_a, set_b, args): + if len(set_a) < 1 or len(set_b) < 1: + return 0.0 + + intersection = set_a & set_b + union = set_a | set_b + + if args.jaccard == 'min': + return len(intersection) / min(len(set_a), len(set_b)) + elif args.jaccard == 'max': + return len(intersection) / max(len(set_a), len(set_b)) + else: + return len(intersection) / len(union) + +def compute_fingerprint(line, key): + try: + myjson = json.loads(line) + url = myjson[key] + text = myjson['text'] + fingerprint = hasher.fingerprint(text) + except Exception as e: + print('Error:', e) + return None, None, None, False + + return url, text, fingerprint, True + +def url_pairs_to_remove(args, bucket_urls, url_doc): + remove_urls_list = [] + deduped_local, counter_local = 0, 0 + iteration = 0 + while len(bucket_urls) > 1: + if args.heuristic_iter != -1 and \ + iteration == args.heuristic_iter: + break + + items = list(bucket_urls) + remove_urls = [] + main_url = items[np.random.randint(0, len(items))] + main_dhingles = shingles(url_doc[main_url]) + + for i in range(0, len(items)): + counter_local += 1 + other_url = items[i] + if other_url == main_url: + continue + other_shingles = shingles(url_doc[other_url]) + try: + jaccard_sim = jaccard(main_dhingles, other_shingles, args) + except Exception as e: + print('Error:', e) + jaccard_sim = 0.0 + if jaccard_sim > 0.5: + remove_urls.append({other_url: jaccard_sim}) + deduped_local += 1 + bucket_urls.remove(other_url) + + bucket_urls.remove(main_url) + if len(remove_urls) > 0: + remove_urls_list.append({main_url: remove_urls}) + iteration += 1 + return remove_urls_list, deduped_local, counter_local + +def write_remove_urls_list(remove_urls_list, f_out): + if len(remove_urls_list) > 0: + for each_url_remove in remove_urls_list: + myjson = json.dumps(each_url_remove, ensure_ascii=False) + f_out.write(myjson.encode('utf-8')) + f_out.write('\n'.encode('utf-8')) + +def compute_jaccard(each_bin, num_bins, start_time_local): + + remove_urls_list = [] + deduped_local, counter_local, bucket_local = 0, 0, 0 + + for bucket_id in each_bin: + bucket_local += 1 + if os.getpid() % num_bins == 0 and bucket_local % 100000 == 0: + print("Counter {}, progress {:.2f} time {:.2f}".\ + format(bucket_local, float(bucket_local)/float(len(each_bin)),\ + time.time() - start_time_local), flush=True) + + if len(each_bin[bucket_id]) <= 1: + continue + + bucket_urls = each_bin[bucket_id].copy() + remove_urls_list_sub, deduped_local_sub, counter_local_sub = \ + url_pairs_to_remove(args, bucket_urls, url_doc) + + deduped_local += deduped_local_sub + counter_local += counter_local_sub + if len(remove_urls_list_sub) > 0: + remove_urls_list.extend(remove_urls_list_sub) + + return remove_urls_list, deduped_local, counter_local + +def find_pair_urls_parallel(args, lshcache, url_doc): + start_time = time.time() + f_out = open(args.output, 'wb') + deduped, counter = 0, 0 + + # compute jaccards of buckets in bin in parallel (parallelism + # limited to # of bins) + num_bins = len(lshcache.bins) + pool = multiprocessing.Pool(num_bins) + compute_jaccard_partial = partial(compute_jaccard, num_bins=num_bins, \ + start_time_local=start_time) + # don't need to pass args and url_doc as they are already shared + compute_jaccard_iter = pool.imap(compute_jaccard_partial, lshcache.bins) + + print("multiprocessing init took {:.2f}".format(time.time() - start_time),\ + flush=True) + for remove_urls_list, deduped_local, counter_local in compute_jaccard_iter: + deduped += deduped_local + counter += counter_local + write_remove_urls_list(remove_urls_list, f_out) + print(' [write]> processed {} documents in {:.2f} ' + 'seoncds and deduped {} documents ...'.format(counter, time.time()\ + - start_time, deduped), flush=True) + + pool.close() + pool.join() + f_out.close() + + print(' Taken time for jaccard similariries {:.2f} seconds'.format(\ + time.time() - start_time), flush=True) + +def find_pair_urls_sequential(args, lshcache, url_doc): + start_time = time.time() + f_out = open(args.output, 'wb') + deduped, counter = 0, 0 + for b in lshcache.bins: + for bucket_id in b: + if len(b[bucket_id]) <= 1: + continue + + bucket_urls = b[bucket_id].copy() + remove_urls_list_sub, deduped_local_sub, counter_local_sub = \ + url_pairs_to_remove(args, bucket_urls, url_doc) + + deduped += deduped_local_sub + counter += counter_local_sub + write_remove_urls_list(remove_urls_list_sub, f_out) + if counter % 10000 == 0: + print(' [write]> processed {} documents in {:.2f} ' + 'seoncds and deduped {} documents ...'. + format(counter, time.time() - start_time, + deduped), flush=True) + f_out.close() + print(' [write]> processed {} documents in {:.2f} ' + 'seoncds and deduped {} documents ...'. + format(counter, time.time() - start_time, + deduped), flush=True) + +if __name__ == '__main__': + + print('parsing the arguments ...') + + parser = argparse.ArgumentParser() + parser.add_argument('--seed', type=int, default=1234, + help='Random seed used for python, numpy') + parser.add_argument('--inputs', nargs = '*', default=None, help = \ + 'Pairwise list of the input files and keys, ' + 'e.g. --inputs cc.json cc_id news.json news_id') + parser.add_argument('--load-fingerprints', nargs = '*', default=None, + help='Load fingerprints from a list of pickle files,' + ' e.g. cc.pkl news.pkl') + parser.add_argument('--save-fingerprints', type=str, default=None, + help='Save the fingerprints of the inputs.') + parser.add_argument('--output', type=str, default=None, + help='Output file name that consists of all ids' + ' with matching similarities') + parser.add_argument('--jaccard', type=str, default='union', + choices=['union', 'min', 'max'], help='Jaccard'\ + ' similarity computation') + parser.add_argument('--heuristic-iter', type=int, default=1, + help='Number of iterations to run the heuristics' + ': use -1 for exact') + parser.add_argument('--num-bands', type=int, default=10, + help='Number of bands to use in cache') + parser.add_argument('--num-seeds', type=int, default=100, + help='Number of seeds to use for minhash. Note that' + ' this value should be divisible by num-bands') + parser.add_argument('--jaccard-parallel', action='store_true', + help='Use this to process large number of documents.') + args = parser.parse_args() + + print('finding possible duplicate content ...') + + # set seed and get an array of seeds of 100 integers + np.random.seed(args.seed) + seeds = np.random.randint(0, 1e6, size=args.num_seeds) + + # initialize minhash and lsh cache + hasher = minhash.MinHasher(seeds=seeds, char_ngram=5, hashbytes=4) + lshcache = cache.Cache(num_bands=args.num_bands, hasher=hasher) + + url_doc = {} + + # load fingerprints from pickle file if needed + if args.load_fingerprints is not None: + for count_fp, fp_file_name in enumerate(args.load_fingerprints): + print("Loading fingerprints from pickle file {}".format( + fp_file_name), flush=True) + fp = open(fp_file_name, "rb") + if count_fp == 0: + # assign directory for the first pkl + lshcache = pickle.load(fp) + url_doc = pickle.load(fp) + else: + # append these to lshcache and url_doc + local_lshcache = pickle.load(fp) + local_url_doc = pickle.load(fp) + for url in local_lshcache.fingerprints.keys(): + url_doc[url] = local_url_doc[url] + lshcache.add_fingerprint(local_lshcache.fingerprints[url], url) + fp.close() + + counter = 0 + start_time = time.time() + + # compute finger prints of the inputs if any + # input file and the key to use as id + if args.inputs is not None: + print("Computing fingerprints", flush=True) + assert len(args.inputs) % 2 == 0 + for input_file, key in zip(args.inputs[::2], args.inputs[1::2]): + print(' document processing {} with key {}'.format(input_file, key), + flush=True) + + # compute fingerprints in parallel + num_workers = 40 + pool = multiprocessing.Pool(num_workers) + fin = open(input_file, 'r', encoding='utf-8') + compute_fingerprint_partial = partial(compute_fingerprint, key=key) + compute_fingerprint_iter = pool.imap(compute_fingerprint_partial, + fin, 512) + # traverse all the texts and add fingerprints + for url, text, fingerprint, flag in compute_fingerprint_iter: + counter += 1 + if flag: + url_doc[url] = text + lshcache.add_fingerprint(fingerprint, url) + if counter % 10000 == 0: + print(' [read]> processed {} documents in {:.2f} ' + 'seconds ...'.format(counter, time.time() - \ + start_time), flush=True) + + fin.close() + pool.close() + pool.join() + + # Save the fingerprints if needed + if args.save_fingerprints is not None: + print("Saving fingerprints to pickle file {}".format( + args.save_fingerprints), flush=True) + with open(args.save_fingerprints, 'wb') as f_save: + pickle.dump(lshcache, f_save) + pickle.dump(url_doc, f_save) + + # compute jaccard index of the input texts and write to file if needed + if args.output is not None: + print("Compute jaccard similarity", flush=True) + if args.jaccard_parallel: + find_pair_urls_parallel(args, lshcache, url_doc) + else: + find_pair_urls_sequential(args, lshcache, url_doc) + + print('done :-)') + diff --git a/tools/openwebtext/group_duplicate_url.py b/tools/openwebtext/group_duplicate_url.py new file mode 100644 index 0000000..16a0354 --- /dev/null +++ b/tools/openwebtext/group_duplicate_url.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import json +import time +import sys + + +if __name__ == '__main__': + + + print('grouping duplicate urls ...') + + input = sys.argv[1] + output = sys.argv[2] + if len(sys.argv) > 3: + jaccard_similarity_threshold = float(sys.argv[3]) + else: + jaccard_similarity_threshold = 0.7 + + url_to_index = {} + index_to_urls = [] + counter = 0 + start_time = time.time() + with open(input, 'r') as f: + for line in f: + counter += 1 + myjson = json.loads(line) + urls = [] + for main_url in myjson.keys(): + urls.append(main_url) + for value in myjson[main_url]: + for other_url, js in value.items(): + if js >= jaccard_similarity_threshold: + urls.append(other_url) + current_index = -1 + other_indices = set() + for url in urls: + if url in url_to_index: + if current_index == -1: + current_index = url_to_index[url] + elif current_index != url_to_index[url]: + other_indices.add(url_to_index[url]) + if current_index == -1: + current_index = len(index_to_urls) + index_to_urls.append(set()) + for url in urls: + url_to_index[url] = current_index + index_to_urls[current_index].add(url) + for index in other_indices: + for url in index_to_urls[index]: + index_to_urls[current_index].add(url) + url_to_index[url] = current_index + index_to_urls[index] = None + + if counter % 100000 == 0: + print(' > processed {} lines in {} seconds ...'.format( + counter, time.time() - start_time)) + + + total_remove = 0 + total_remain = 0 + for urls in index_to_urls: + if urls is not None: + if len(urls) > 1: + total_remove += (len(urls) - 1) + total_remain += 1 + print('out of {} urls, only {} are unique and {} should be removed'.format( + total_remove+total_remain, total_remain, total_remove)) + + with open(output, 'wb') as f: + for i, urls in enumerate(index_to_urls): + if urls is not None: + if len(urls) > 1: + myjson = json.dumps({str(i): list(urls)}, + ensure_ascii=False) + f.write(myjson.encode('utf-8')) + f.write('\n'.encode('utf-8')) diff --git a/tools/openwebtext/merge_jsons.py b/tools/openwebtext/merge_jsons.py new file mode 100644 index 0000000..fb11fe4 --- /dev/null +++ b/tools/openwebtext/merge_jsons.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + + +import glob +import sys +import json +import argparse + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument("--json_path", type=str, default=".", + help="path where all the json files are located") + + parser.add_argument("--output_file", type=str, default="merged_output.json", + help="filename where the merged json should go") + + args = parser.parse_args() + + json_path = args.json_path + out_file = args.output_file + + json_files = glob.glob(json_path + '/*.json') + + counter = 0 + + with open(out_file, 'w') as outfile: + for fname in json_files: + counter += 1 + + if counter % 1024 == 0: + print("Merging at ", counter, flush=True) + + with open(fname, 'r') as infile: + for row in infile: + each_row = json.loads(row) + outfile.write(row) + + + print("Merged file", out_file, flush=True) + + diff --git a/tools/openwebtext/remove_group_duplicates.py b/tools/openwebtext/remove_group_duplicates.py new file mode 100644 index 0000000..44b62d6 --- /dev/null +++ b/tools/openwebtext/remove_group_duplicates.py @@ -0,0 +1,56 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + + +import json +import time +import sys + + +if __name__ == '__main__': + + url_filename = sys.argv[1] + data_filename = sys.argv[2] + output_filename = sys.argv[3] + + urls = set() + with open(url_filename, 'r') as f: + for line in f: + myjson = json.loads(line) + for key in myjson: + this_urls = myjson[key] + for i in range(1, len(this_urls)): + urls.add(this_urls[i]) + print('will be removing {} urls'.format(len(urls)), flush=True) + + written_docs = 0 + removed_docs = 0 + removed_chars = 0 + start_time = time.time() + with open(output_filename, 'wb') as fout: + with open(data_filename, 'r') as fin: + for line in fin: + try: + myjson = json.loads(line) + url = myjson['url'] + if url in urls: + print('removing', myjson) + removed_docs += 1 + removed_chars += len(myjson['text']) + continue + myjson = json.dumps(myjson, ensure_ascii=False) + fout.write(myjson.encode('utf-8')) + fout.write('\n'.encode('utf-8')) + written_docs += 1 + if written_docs % 10000 == 0: + print(' [PROCESSED] time (s): {:.2f} | written: {} ' + '| removed: {} (char: {})'.format( + time.time() - start_time, + written_docs, removed_docs, removed_chars)) + except Exception as e: + print('[SKIPPING]', line, e) + + print(' [PROCESSED] time (s): {:.2f} | written: {} ' + '| removed: {} (char: {})'.format( + time.time() - start_time, + written_docs, removed_docs, removed_chars)) + print('done :-)') diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py new file mode 100644 index 0000000..c1f2588 --- /dev/null +++ b/tools/preprocess_data.py @@ -0,0 +1,409 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Processing large data for pretraining.""" +import argparse +import math +import json +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir))) +import time +import gzip +import glob +import torch +import numpy as np +import multiprocessing +try: + import nltk + nltk_available = True +except ImportError: + nltk_available = False + +from megatron.training.tokenizer import build_tokenizer +from megatron.core.datasets import indexed_dataset + + +# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer +class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars): + + _period_context_fmt = r""" + \S* # some word material + %(SentEndChars)s # a potential sentence ending + \s* # <-- THIS is what I changed + (?=(?P + %(NonWord)s # either other punctuation + | + (?P\S+) # <-- Normally you would have \s+ here + ))""" + +class IdentitySplitter(object): + def tokenize(self, *text): + return text + + +class Encoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + # Use Encoder class as a container for global data + Encoder.tokenizer = build_tokenizer(self.args) + if self.args.split_sentences: + if not nltk_available: + print("NLTK is not available to split sentences.") + exit() + if os.environ.get("NLTK_DATA"): + library = os.path.join(os.environ.get("NLTK_DATA"), "tokenizers", "punkt", f"{self.args.lang}.pickle") + url = f"file:{library}" + else: + library = os.path.join("tokenizers", "punkt", f"{self.args.lang}.pickle") + url = f"nltk:{library}" + splitter = nltk.load(url) + if self.args.keep_newlines: + # this prevents punkt from eating newlines after sentences + Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer( + train_text = splitter._params, + lang_vars = CustomLanguageVars()) + else: + Encoder.splitter = splitter + + else: + Encoder.splitter = IdentitySplitter() + + def split(self, json_line): + data = json.loads(json_line) + output = {} + for key in self.args.json_keys: + text = data[key] + max_len = 1000000 + tokens_list = [Encoder.splitter.tokenize(text[i:i+max_len]) for i in range(0, len(text), max_len)] + output[key] = [tokens for partial in tokens_list for tokens in partial] + return json.dumps(output), len(json_line) + + def encode(self, json_line): + data = json.loads(json_line) + ids = {} + lens = {} + for key in self.args.json_keys: + text = data[key] + if isinstance(text, list): + sentences = text + else: + sentences = [text] + doc_ids = [] + sentence_lens = [] + for sentence in sentences: + sentence_ids = Encoder.tokenizer.tokenize(sentence) + if len(sentence_ids) > 0: + doc_ids.extend(sentence_ids) + sentence_lens.append(len(sentence_ids)) + if len(doc_ids) > 0 and self.args.append_eod: + doc_ids.append(Encoder.tokenizer.eod) + sentence_lens[-1] += 1 + ids[key] = doc_ids + lens[key] = sentence_lens + return ids, lens, len(json_line) + + +class Partition(object): + def __init__(self, args, workers): + self.args = args + self.workers = workers + + def print_processing_stats(self, count, proc_start, total_bytes_processed): + if count % self.args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = total_bytes_processed/elapsed/1024/1024 + print(f"Processed {count} documents", + f"({count/elapsed} docs/s, {mbs} MB/s).", + file=sys.stderr) + + def split_sentences(self, file_name): + input_file_name, output_file_name = file_name + print("Opening", input_file_name) + fin = open(input_file_name, 'r', encoding='utf-8') + fout = open(output_file_name, 'w') + + encoder = Encoder(self.args) + pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) + split_docs = pool.imap(encoder.split, fin, 32) + + proc_start = time.time() + total_bytes_processed = 0 + for i, (doc, bytes_processed) in enumerate(split_docs, start=1): + total_bytes_processed += bytes_processed + fout.write(doc + "\n") + self.print_processing_stats(i, proc_start, total_bytes_processed) + + fin.close() + fout.close() + + + def process_json_file(self, file_name): + input_file_name, output_prefix = file_name + print("Opening", input_file_name) + fin = open(input_file_name, 'r', encoding='utf-8') + + startup_start = time.time() + encoder = Encoder(self.args) + tokenizer = build_tokenizer(self.args) + pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) + encoded_docs = pool.imap(encoder.encode, fin, 32) + + level = "document" + if self.args.split_sentences: + level = "sentence" + + output_bin_files = {} + output_idx_files = {} + builders = {} + + for key in self.args.json_keys: + output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix, + key, level) + output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix, + key, level) + builders[key] = indexed_dataset.IndexedDatasetBuilder( + output_bin_files[key], + dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size), + ) + + startup_end = time.time() + proc_start = time.time() + total_bytes_processed = 0 + print("Time to startup:", startup_end - startup_start) + for i, (doc, sentence_lens, bytes_processed) in enumerate(encoded_docs, start=1): + total_bytes_processed += bytes_processed + for key in doc.keys(): + builders[key].add_document(doc[key], sentence_lens[key]) + self.print_processing_stats(i, proc_start, total_bytes_processed) + + fin.close() + builders[key].finalize(output_idx_files[key]) + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title='input data') + group.add_argument('--input', type=str, required=True, + help='Path to input JSON') + group.add_argument('--json-keys', nargs='+', default=['text'], + help='space separate listed of keys to extract from json') + group.add_argument('--split-sentences', action='store_true', + help='Split documents into sentences.') + group.add_argument('--keep-newlines', action='store_true', + help='Keep newlines between sentences when splitting.') + + group = parser.add_argument_group(title='tokenizer') + group.add_argument('--tokenizer-type', type=str, required=True, + choices=['BertWordPieceLowerCase','BertWordPieceCase', + 'GPT2BPETokenizer', 'SentencePieceTokenizer', + 'GPTSentencePieceTokenizer', 'Llama2Tokenizer', + 'Llama3Tokenizer', 'MistralTokenizer', 'NullTokenizer'], + help='What type of tokenizer to use.') + group.add_argument('--tokenizer-model', type=str, default=None, + help='YTTM tokenizer model.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file') + group.add_argument('--vocab-size', default=786, + help='size of vocab for use with NullTokenizer') + group.add_argument('--merge-file', type=str, default=None, + help='Path to the BPE merge file (if necessary).') + group.add_argument('--append-eod', action='store_true', + help='Append an token to the end of a document.') + group.add_argument('--lang', type=str, default='english', + help='Language to use for NLTK-powered sentence splitting.') + group = parser.add_argument_group(title='output data') + group.add_argument('--output-prefix', type=str, required=True, + help='Path to binary output file without suffix') + + group = parser.add_argument_group(title='runtime') + group.add_argument('--workers', type=int, required=True, + help=('Number of worker processes to launch.' + 'A good default for fast pre-processing ' + 'is: (workers * partitions) = available CPU cores.')) + group.add_argument('--partitions', type=int, default=1, + help='Number of file partitions') + group.add_argument('--log-interval', type=int, default=1000, + help='Interval between progress updates') + group.add_argument('--keep-sequential-samples', action='store_true', + help='Ensure ordering of samples in .jsonl files is ' + 'preserved when using partitions>1.') + args = parser.parse_args() + args.keep_empty = False + + if args.tokenizer_type.lower().startswith('bert') and not args.split_sentences: + print("Are you sure you don't want to split sentences?") + + # some default/dummy values for the tokenizer + args.rank = 1 + args.make_vocab_size_divisible_by = 128 + args.tensor_model_parallel_size = 1 + args.vocab_extra_ids = 0 + + return args + + +def get_file_name(args, file_id): + file_name, extension = os.path.splitext(args.input) + input_file_name = file_name + "_" + str(file_id) + extension + sentence_split_file = file_name + "_ss_" + str(file_id) + extension + output_prefix = args.output_prefix + "_" + str(file_id) + file_names = { + 'partition': input_file_name, + 'sentence_split': sentence_split_file, + 'output_prefix': output_prefix} + return file_names + + +def check_files_exist(in_ss_out_names, key, num_partitions): + for i in range(num_partitions): + if not os.path.exists(in_ss_out_names[i][key]): + return False + return True + + +def main(): + args = get_args() + + if args.split_sentences: + if nltk_available: + nltk.download("punkt", quiet=True, download_dir=os.environ.get("NLTK_DATA")) + else: + raise Exception( + "nltk library required for sentence splitting is not available.") + + in_ss_out_names = [] + if args.partitions == 1: + file_name, extension = os.path.splitext(args.input) + sentence_split_file = file_name + "_ss" + extension + file_names = { + 'partition': args.input, + 'sentence_split': sentence_split_file, + 'output_prefix': args.output_prefix} + in_ss_out_names.append(file_names) + else: + in_file_names = glob.glob(args.input) + + # Count total number of lines across .jsonl files + if args.keep_sequential_samples: + total_sample_count = 0 + for filename in in_file_names: + with open(filename, "r") as fin: + for fc, _ in enumerate(fin): + pass + total_sample_count += (fc + 1) + partition_size = math.ceil(total_sample_count / args.partitions) + + # create .jsonl parition files + for idx in range(args.partitions): + in_ss_out_name = get_file_name(args, idx) + in_ss_out_names.append(in_ss_out_name) + + # check to see if paritions were already created + partitions_present = check_files_exist(in_ss_out_names, 'partition', args.partitions) + + # check to see if paritions with split sentences already created + split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions) + + if not partitions_present and not split_sentences_present: + # populate .jsonl partition files from parent files + partitioned_input_files = [] + for idx in range(args.partitions): + partitioned_input_file = open(in_ss_out_names[idx]['partition'], 'w') + partitioned_input_files.append(partitioned_input_file) + + index = 0 + if args.keep_sequential_samples: line_count = 0 + for in_file_name in in_file_names: + # support for gzip files + if in_file_name.endswith(".gz"): + fin = gzip.open(in_file_name, 'rt') + else: + fin = open(in_file_name, 'r', encoding='utf-8') + + for line in fin: + partitioned_input_files[index].write(line) + if args.keep_sequential_samples: + line_count += 1 + if line_count % partition_size == 0: + index += 1 + else: + index = (index + 1)%args.partitions + + fin.close() + + for idx in range(args.partitions): + partitioned_input_files[idx].close() + + assert args.workers % args.partitions == 0 + partition = Partition(args, args.workers//args.partitions) + + # check to see if paritions with split sentences already created + split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions) + + # split sentences in partition files + if args.split_sentences and not split_sentences_present: + processes = [] + for name in in_ss_out_names: + p = multiprocessing.Process(target=partition.split_sentences, + args=((name['partition'], name['sentence_split']),)) + p.start() + processes.append(p) + + for p in processes: + p.join() + + if args.partitions == 1: + return + + + # encode partition files in parallel + processes = [] + input_key = 'sentence_split' if args.split_sentences else 'partition' + for name in in_ss_out_names: + p = multiprocessing.Process(target=partition.process_json_file, + args=((name[input_key], name['output_prefix']),)) + p.start() + processes.append(p) + + for p in processes: + p.join() + + if args.partitions == 1: + return + + # merge bin/idx partitions + level = "document" + if args.split_sentences: + level = "sentence" + + output_bin_files = {} + output_idx_files = {} + builders = {} + tokenizer = build_tokenizer(args) + + for key in args.json_keys: + output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix, + key, level) + output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix, + key, level) + builders[key] = indexed_dataset.IndexedDatasetBuilder( + output_bin_files[key], + dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size), + ) + + for name in in_ss_out_names: + parition_output_prefix = name['output_prefix'] + full_partition_output_prefix = "{}_{}_{}".format(parition_output_prefix, + key, level) + builders[key].add_index(full_partition_output_prefix) + builders[key].finalize(output_idx_files[key]) + + +if __name__ == '__main__': + + main() + diff --git a/tools/preprocess_data_nmt.py b/tools/preprocess_data_nmt.py new file mode 100644 index 0000000..13a04f6 --- /dev/null +++ b/tools/preprocess_data_nmt.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Processing nmt data for finetuning.""" + +import argparse +import json +import multiprocessing +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir))) +import time +import torch +from megatron.training.tokenizer import build_tokenizer +from megatron.core.datasets import indexed_dataset + + +class Encoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + # Use Encoder class as a container for global data + Encoder.tokenizer = build_tokenizer(self.args) + + def encode(self, text): + ids = {} + ids = Encoder.tokenizer.tokenize(text) + assert len(ids) > 0 + return ids, len(text) + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title='input data') + group.add_argument('--input', type=str, required=True, + help='Path to input JSON') + + group = parser.add_argument_group(title='tokenizer') + group.add_argument('--tokenizer-type', type=str, default='YTTMTokenizer', + choices=['BertWordPieceLowerCase','BertWordPieceCase', + 'GPT2BPETokenizer', 'SentencePieceTokenizer'], + help='What type of tokenizer to use.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file') + group.add_argument('--merge-file', type=str, default=None, + help='Path to the BPE merge file (if necessary).') + + group = parser.add_argument_group(title='output data') + group.add_argument('--output-prefix', type=str, required=True, + help='Path to binary output file without suffix') + + group = parser.add_argument_group(title='runtime') + group.add_argument('--workers', type=int, default=1, + help='Number of worker processes to launch') + group.add_argument('--log-interval', type=int, default=100, + help='Interval between progress updates') + args = parser.parse_args() + args.keep_empty = False + + # some default/dummy values for the tokenizer + args.rank = 0 + args.make_vocab_size_divisible_by = 128 + args.tensor_model_parallel_size = 1 + args.vocab_extra_ids = 0 + + return args + +def main(): + args = get_args() + startup_start = time.time() + + print("Opening", args.input) + fin = open(args.input, 'r', encoding='utf-8') + + encoder = Encoder(args) + tokenizer = build_tokenizer(args) + pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) + encoded_sentences = pool.imap(encoder.encode, fin, 25) + + print(f"Vocab size: {tokenizer.vocab_size}") + print(f"Output prefix: {args.output_prefix}") + output_bin_file = "{}.bin".format(args.output_prefix) + output_idx_file = "{}.idx".format(args.output_prefix) + builder = indexed_dataset.IndexedDatasetBuilder( + output_bin_file, dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size) + ) + + startup_end = time.time() + proc_start = time.time() + total_bytes_processed = 0 + print("Time to startup:", startup_end - startup_start) + + for i, (sentence, bytes_processed) in enumerate(encoded_sentences, start=1): + total_bytes_processed += bytes_processed + builder.add_item(torch.IntTensor(sentence)) + # documents contain only one sentence. + builder.end_document() + if i % args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = total_bytes_processed/elapsed/1024/1024 + print(f"Processed {i} sentences", + f"({i/elapsed} sentences/s, {mbs} MB/s).", + file=sys.stderr) + + builder.finalize(output_idx_file) + +if __name__ == '__main__': + main() + diff --git a/tools/preprocess_mmdata.py b/tools/preprocess_mmdata.py new file mode 100644 index 0000000..247b66b --- /dev/null +++ b/tools/preprocess_mmdata.py @@ -0,0 +1,170 @@ +# coding=utf-8 +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Processing text modality data for MultiModal pretraining.""" + +import argparse +import json +import multiprocessing +import os +import sys +import numpy as np +from torchvision.transforms import ToTensor +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir))) +import time + +import torch +try: + import nltk + nltk_available = True +except ImportError: + nltk_available = False + +from megatron.training.tokenizer import build_tokenizer +from megatron.core.datasets.indexed_dataset import IndexedDatasetBuilder + + +# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer +class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars): + + _period_context_fmt = r""" + \S* # some word material + %(SentEndChars)s # a potential sentence ending + \s* # <-- THIS is what I changed + (?=(?P + %(NonWord)s # either other punctuation + | + (?P\S+) # <-- Normally you would have \s+ here + ))""" + +class IdentitySplitter(object): + def tokenize(self, *text): + return text + +class Encoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + # Use Encoder class as a container for global data + Encoder.tokenizer = build_tokenizer(self.args) + + def encode(self, input_pair): + json_line, img_path = input_pair + data = json.loads(json_line) + key = "text" + text = data[key] + sentence_ids = Encoder.tokenizer.tokenize(text) + pad_len = self.args.pad_length + if len(sentence_ids) > 0 and self.args.append_eod: + sentence_ids = sentence_ids[:pad_len] + current_length = len(sentence_ids) + sentence_ids.extend([Encoder.tokenizer.eod for _ in range(max(0,pad_len-current_length))]) + + with open(img_path, "rb") as tf: + xs = bytearray(tf.read()) + img_pad = (4 - len(xs) % 4) % 4 + xs.extend([0 for _ in range(img_pad)]) + img_raw = np.frombuffer(xs, dtype=np.int32) + img_raw = np.insert(img_raw, 0, img_pad) + + return sentence_ids, img_raw, len(json_line) + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title='input data') + group.add_argument('--input', type=str, required=True, + help='Path to input JSON') + group.add_argument('--input-image', type=str, required=True, + help='Path to input image folder') + + group.add_argument('--pad-length', type=int, required=True, + help='Pad length of preprocessed text') + + group.add_argument('--split-sentences', action='store_true', + help='Split documents into sentences.') + group.add_argument('--keep-newlines', action='store_true', + help='Keep newlines between sentences when splitting.') + + group = parser.add_argument_group(title='tokenizer') + group.add_argument('--tokenizer-type', type=str, required=True, + choices=['BertWordPieceLowerCase','BertWordPieceCase', + 'GPT2BPETokenizer', 'SentencePieceTokenizer', 'GPTSentencePieceTokenizer'], + help='What type of tokenizer to use.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file') + group.add_argument('--merge-file', type=str, default=None, + help='Path to the BPE merge file (if necessary).') + group.add_argument('--append-eod', action='store_true', + help='Append an token to the end of a document.') + group.add_argument('--lang', type=str, default='english', + help='Language to use for NLTK-powered sentence splitting.') + group.add_argument('--tokenizer-model', type=str, default=None, + help='sentencepeice tokenizer model.') + + group = parser.add_argument_group(title='output data') + group.add_argument('--output-prefix', type=str, required=True, + help='Path to binary output file without suffix') + group = parser.add_argument_group(title='runtime') + group.add_argument('--workers', type=int, default=1, + help='Number of worker processes to launch') + group.add_argument('--log-interval', type=int, default=100, + help='Interval between progress updates') + args = parser.parse_args() + args.keep_empty = False + + # some default/dummy values for the tokenizer + args.rank = 0 + args.make_vocab_size_divisible_by = 128 + args.tensor_model_parallel_size = 1 + args.vocab_extra_ids = 0 + + return args + +def main(): + args = get_args() + startup_start = time.time() + + encoder = Encoder(args) + tokenizer = build_tokenizer(args) + pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) + + fin = open(args.input, 'r', encoding='utf-8') + img_paths = [os.path.join(args.input_image, basename) for basename in os.listdir(args.input_image)] + + encoded_docs = pool.imap(encoder.encode, zip(fin, img_paths), 25) + + print(f"Vocab size: {tokenizer.vocab_size}") + print(f"Output prefix: {args.output_prefix}") + + output_bin_files = "{}.bin".format(args.output_prefix) + output_idx_files = "{}.idx".format(args.output_prefix) + + builders = IndexedDatasetBuilder(output_bin_files, dtype=np.int32, multimodal=True) + + startup_end = time.time() + proc_start = time.time() + total_bytes_processed = 0 + + print("Time to startup:", startup_end - startup_start) + + for i, (sentence, img_raw, bytes_processed) in enumerate(encoded_docs, start=1): + total_bytes_processed += bytes_processed + builders.add_item(torch.IntTensor(sentence)) + builders.add_item(torch.from_numpy(img_raw), 1) + builders.end_document() + if i % args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = total_bytes_processed/elapsed/1024/1024 + print(f"Processed {i} documents", + f"({i/elapsed} docs/s, {mbs} MB/s).", + file=sys.stderr) + + builders.finalize(output_idx_files) + + +if __name__ == '__main__': + main() + diff --git a/tools/report_theoretical_memory.py b/tools/report_theoretical_memory.py new file mode 100644 index 0000000..79b483d --- /dev/null +++ b/tools/report_theoretical_memory.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Computes theoretical memory footprint for model training without instantiating +a model and running training iterations on GPU(s).""" + +from megatron.training import get_args +from megatron.training.initialize import initialize_megatron +from megatron.training.theoretical_memory_usage import report_theoretical_memory + +if __name__ == "__main__": + initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True) + args = get_args() + + report_theoretical_memory(args, verbose=True) diff --git a/tools/retro/README.md b/tools/retro/README.md new file mode 100644 index 0000000..395005e --- /dev/null +++ b/tools/retro/README.md @@ -0,0 +1,256 @@ +# Retro and InstructRetro + +Retro [(Borgeaud et al., 2022)](https://arxiv.org/abs/2112.04426) is an autoregressive decoder-only language model (LM) +pretrained with retrieval-augmentation. +Retro features practical scalability to support large-scale pretraining from scratch by retrieving from trillions of +tokens. +Pretraining with retrieval provides a more efficient storage mechanism of factual knowledge, when compared to storing +factual knowledge implicitly within the network's parameters, thus largely reducing model parameters while achieving +lower perplexity than standard GPT. +Retro also provides the flexibility to update the +knowledge stored in LMs [(Wang et al., 2023a)](https://arxiv.org/abs/2304.06762) +by updating the retrieval database without training LMs again. + +InstructRetro [(Wang et al., 2023b)](https://arxiv.org/abs/2310.07713) further scales up the size of Retro to 48B, +featuring the largest LLM pretrained with retrieval (as of December 2023). +The obtained foundation model, Retro 48B, largely outperforms the GPT counterpart in terms of perplexity. +With instruction tuning on Retro, InstructRetro demonstrates significant improvement over the instruction tuned GPT on +downstream tasks in the zero-shot setting. Specifically, the average improvement of InstructRetro is 7% over its GPT +counterpart across 8 short-form QA tasks, 10% over GPT across 4 challenging long-form QA tasks, and 16% over GPT across +3 summarization tasks. We also find that one can ablate the encoder from InstructRetro architecture and directly use the +InstructRetro decoder backbone as GPT, while achieving comparable results. + +This README provides an end-to-end tutorial to reproduce Retro and InstructRetro. + +# Contents + +* [Checkpoints](#checkpoints) +* [End-to-end Reproduction Guide](#end-to-end-reproduction-guide) + * [Step 0: Prepare the environment](#step-0-prepare-the-environment) + * [Docker image](#docker-image) + * [Install dependencies](#install-dependencies) + * [Step 1: Build retrieval database](#step-1-build-retrieval-database) + * [Step 2: Pretraining](#step-2-pretraining) + * [Step 3: Perplexity evaluation](#step-3-perplexity-evaluation) + * [Step 4: Instruction tuning](#step-4-instruction-tuning) + * [Step 5: Downstream task evaluation](#step-5-downstream-task-evaluation) +* [Citations](#citations) + +# Checkpoints + +We provide the pretrained checkpoints of Retro and InstructRetro in the following table. The checkpoints are available +to download through the following links: + +| Model | Size | Instruction Tuning | Download Link 1 | Download Link 2 | Download Link 3 | +|-------------------------|------|--------------------|--------------------------------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| +| `retro-8b-base-4k` | 8b | | [Huggingface](https://huggingface.co/nvidia/retro-8b-base-4k) | [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/retro-8b-base-4k) | [Google Drive](https://drive.google.com/drive/folders/1uSQ5DAsuvx_8XcbtnVfs_MGvEOcx0uK_?usp=sharing) | +| `retro-8b-instruct-4k` | 8b | ✅ | [Huggingface](https://huggingface.co/nvidia/retro-8b-instruct-4k) | [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/retro-8b-instruct-4k) | [Google Drive](https://drive.google.com/drive/folders/1v5dKaSN0cm2lwyAWpFaJtlTrLhtMZXsI?usp=sharing) | +| `retro-48b-base-4k` | 48b | | [Huggingface](https://huggingface.co/nvidia/retro-48b-base-4k) | [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/retro-48b-base-4k) | [Google Drive](https://drive.google.com/drive/folders/1rtNpf0CiLElSHQcr3aLI3zgfI3teGTP5?usp=sharing) | +| `retro-48b-instruct-4k` | 48b | ✅ | [Huggingface](https://huggingface.co/nvidia/retro-48b-instruct-4k) | [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/retro-48b-instruct-4k) | [Google Drive](https://drive.google.com/drive/folders/1qdb0AQjSsAPGlWaIu3wgHPjf_nwLeY5h?usp=sharing) | + +# End-to-end Reproduction Guide + +In this README, we provide an end-to-end reproduction guide for InstructRetro, covering from large-scale retrieval +construction, pretraining, perplexity evaluation, instruction tuning, to downstream task evaluation. + +If you are interested in evaluation only, we also [open-sourced our checkpoints](#checkpoints) and you can directly go +to [Step 5](#step-5-downstream-task-evaluation) to evaluate the checkpoints on downstream tasks. + +## Step 0: Prepare the environment + +We recommend using docker environment to run the code. + +### Docker image + +We provide a docker build file in [tools/retro/examples/Dockerfile](examples/Dockerfile) for the reproduction. The +docker image is based on the [NGC docker](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) `nvcr.io/nvidia/pytorch:23.09-py3`. + +### Install dependencies + +Clone the Megatron repo: + +```bash +git clone --branch InstructRetro https://github.com/NVIDIA/Megatron-LM.git +``` + +If docker is not available, we recommend starting from a clean conda environment with the following runtime +dependencies: + +- Python 3.10 +- NVIDIA CUDA® 12.2.1 +- NVIDIA cuBLAS 12.2.5.6 +- NVIDIA cuDNN 8.9.5 +- NVIDIA NCCL 2.18.5 +- PyTorch 2.1.0a0+32f93b1 + +Then install Retro-specific dependencies, including: + +```bash +pip install -U faiss-gpu +pip install -U transformers +pip install -U sentencepiece +pip install -U h5py +pip install -U nltk +pip install -U einops +``` + +## Step 1: Build retrieval database + +In this step, we build a large-scale retrieval database for InstructRetro +through [Faiss](https://github.com/facebookresearch/faiss) to retrieve from trillions of tokens, and preprocess (and +save) the retrieval neighbors for the pretraining step. + +Please refer to [tools/retro/build_db.md](build_db.md) for more details. + +## Step 2: Pretraining + +*Please strictly follow Step 1 to build the retrieval database before pretraining to make sure the preprocessed +retrieval neighbors match the pretraining corpus.* + +In the pretraining step, we support both pretraining from scratch and continued pretraining from a pretrained GPT model. + +We provide a template pretraining script to pretrain 843M Retro from scratch. Prepare your own arguments and update our +templates in [tools/retro/examples/pretrain_model.sh](examples/pretrain_model.sh). Please note that the data path should +be exactly matching the one used in Step 1 to make sure the preprocessed retrieval neighbors match the pretraining +corpus. + +[//]: # (Take the example of the Wikipedia corpus) + +```bash +bash tools/retro/examples/pretrain_model.sh +``` + +After pretraining, the model checkpoints will be saved in the `--save` directory if you specified the arg +in `pretrain_model.sh`. + +To continue pretraining with retrieval from a pretrained GPT model, please specify `--load` in `pretrain_model.sh` to +load the pretrained GPT model checkpoint (the architecture of GPT, including hidden size, number of layers, and +activation methods, should be exactly the same as the one used for Retro). You should also +specify `--no-load-optim --finetune` to make sure the optimizer state is not loaded from the pretrained GPT model and +the continued pretraining with retrieval is from a clean start. After the first job / the first run, you will continue +pretraining with retrieval from your last checkpoint. In the follow-up jobs, you should launch the pretraining without +the flags `--no-load-optim --finetune` to make sure the optimizer state is correctly loaded from your last job. + +## Step 3: Perplexity evaluation + +During pretraining, we will automatically evaluate the model perplexity on the specified validation corpus +every `--eval-interval` steps. The validation corpus should be exactly the same as the one used in Step 1 to make sure +the preprocessed retrieval neighbors match the pretraining corpus. + +To evaluate the perplexity of a pretrained model, please add `--skip-train` in `pretrain_model.sh` to skip the +pretraining step and only evaluate the perplexity of the model specified in `--load` on the validation corpus. Run the +above command again to evaluate the perplexity of a pretrained model: + +```bash +bash tools/retro/examples/pretrain_model.sh +``` + +## Step 4: Instruction tuning + +In this step, we fine-tune the pretrained model on the downstream task with instructions. We provide a template +instruction tuning script to fine-tune 843M Retro. + +We also provide an open-source blend of instruction tuning datasets. The dataset is available to download +through [here](https://drive.google.com/file/d/1nzKwwYf8lYb9gN3P4YO8pFNU_B2nMYe1/view?usp=sharing). The blendable +dataset consists of the following open-source instruction tuning datasets: + +### Instruction Tuning Dataset Breakdown + +| Dataset | Samples | Epochs | Sampling Prob | +|------------------------------------------------------------|--------:|-------:|--------------:| +| [soda](https://arxiv.org/abs/2212.10465) | 2560 | 0.005 | 0.020 | +| [eli5](https://arxiv.org/abs/1907.09190) | 2561 | 0.055 | 0.020 | +| [self_instruct_short](https://arxiv.org/abs/2212.10560) | 1280 | 0.043 | 0.010 | +| [self_instruct_long](https://arxiv.org/abs/2212.10560) | 2560 | 0.333 | 0.020 | +| [unnatural-instructions](https://arxiv.org/abs/2212.09689) | 2560 | 0.024 | 0.020 | +| [flan_cot](https://arxiv.org/abs/2210.11416) | 1280 | 0.093 | 0.010 | +| [dolly](https://arxiv.org/abs/2305.13735) | 6400 | 0.938 | 0.050 | +| [oasst-skip-noncode](https://open-assistant.io/) | 104558 | 1.839 | 0.817 | +| [oasst-skip-code](https://open-assistant.io/) | 4243 | 1.839 | 0.033 | + +Refer to the paper links above for more details about each instruction tuning dataset. + +*We note that the provided instruction tuning dataset is all from open-source instruction tuning datasets. It is +slightly different from what we use in [InstructRetro](https://arxiv.org/abs/2310.07713), which contains private and +proprietary datasets. Thus a 1-2% accuracy difference in downstream tasks may be expected.* + +### Instruction tuning script + +Download +the [blended instruction tuning dataset](https://drive.google.com/file/d/1nzKwwYf8lYb9gN3P4YO8pFNU_B2nMYe1/view?usp=sharing) +in your data home directory `$DATA_HOME` and update our templates +in [tools/retro/sft/sft_retro_lm.sh](sft/sft_retro_lm.sh). + +An example command to run instruction tuning on 843M Retro is as follows: + +```bash + [blend-dataset-name] [model-size] [batch-size] [lr] [checkpoints] +bash tools/retro/sft/sft_retro_lm.sh open_inst 843m 128 5e-6 +``` + +The `blend_dataset_name` argument will blend all the datasets within the `$DATA_HOME` following the weights and +configurations specified in the `${blend_dataset_name}.sh` ([open_inst.sh](sft/open_inst.sh) in the example above). +The checkpoints will be saved in the `--save` directory. For example, it will be saved to +`/checkpoints/applications/retro-sft_pp1_same_format_ctx1_843m_128_5e-6`. + +## Step 5: Downstream task evaluation + +In this step, we demonstrate how to run InstructRetro for zero-shot evaluation on downstream question answering (QA) +tasks. We provide the pre-processed open-source evaluation datasets with a unified format for different tasks. The +evaluation datasets used in our paper are available to download +through [here](https://drive.google.com/drive/folders/1xw-N0LJR_lIWnH6BKzHIb49quVCS_V72?usp=sharing). Please stick to +the same retro workdir used in Step 0-4 to make sure the preprocessed retrieval neighbors match the pretraining corpus. +If you directly come to Step 5, an example retro workdir with `args.json` for 800M Retro is +provided [here](https://drive.google.com/file/d/121GqAdMvf8bJEBZRt-SD4uhW-SRWgI3s/view?usp=sharing). Note that the args +in the json can be overwritten through the command line. + +We present an example command to run retro generation given the InstructRetro checkpoints and the Natural Question (NQ) +task. The example command is for the 843m InstructRetro obtained in Step 4. Please specify the directory for the NQ +dataset and update the command accordingly for other checkpoints. + +```bash +bash tools/retro/text_generation/retro_generate.sh nq 843m greedy test 0 20000 1000 5 pp1 /checkpoints/applications/retro-sft_pp1_same_format_ctx1_843m_128_5e-6 2 +``` + +The generated responses will be saved in the corresponding checkpoint directory. For example, for the 843m +InstructRetro, it will be saved to +`/checkpoints/applications/retro-sft_pp1_same_format_ctx1_843m_128_5e-6/retro-generate-nq_5_2_843m_test_greedy_0_20000_1000.txt`. + +To evaluate the F1 / Exact Match (EM) scores of the generated responses, we provide an example script to run the +evaluation on the NQ dataset. Please specify the directory for the NQ dataset and update the command accordingly for +other checkpoints and downstream tasks. + +```bash +python3 tools/retro/text_generation/evaluate.py +``` + +# Citations + +See more details from our papers: + +[Shall we Pretrain Autoregressive Language Models with Retrieval? A Comprehensive Study.](https://arxiv.org/abs/2304.06762) + +_Boxin Wang, Wei Ping, Peng Xu, Lawrence McAfee, Zihan Liu, Mohammad Shoeybi, Yi Dong, Oleksii Kuchaiev, Bo Li, Chaowei +Xiao, Anima Anandkumar, Bryan Catanzaro._ (EMNLP 2023) + +[InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining.](https://arxiv.org/abs/2310.07713) + +_Boxin Wang, Wei Ping, Lawrence McAfee, Peng Xu, Bo Li, Mohammad Shoeybi, Bryan Catanzaro._ + +Please cite the papers as follows if you use the data or code from this repo: + +```bibtex +@inproceedings{wang2023shall, + title = {Shall We Pretrain Autoregressive Language Models with Retrieval? A Comprehensive Study}, + author = {Boxin Wang and Wei Ping and Peng Xu and Lawrence McAfee and Zihan Liu and Mohammad Shoeybi and Yi Dong and Oleksii Kuchaiev and Bo Li and Chaowei Xiao and Anima Anandkumar and Bryan Catanzaro}, + journal = {The 2023 Conference on Empirical Methods in Natural Language Processing}, + year = {2023} +} + +@article{wang2023instructretro, + title = {InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining}, + author = {Boxin Wang and Wei Ping and Lawrence McAfee and Peng Xu and Bo Li and Mohammad Shoeybi and Bryan Catanzaro}, + year = {2023}, + journal = {arXiv preprint arXiv: 2310.07713} +} +``` diff --git a/tools/retro/build_db.md b/tools/retro/build_db.md new file mode 100644 index 0000000..c999524 --- /dev/null +++ b/tools/retro/build_db.md @@ -0,0 +1,421 @@ +This directory contains a collection of tools for building the retrieval database and pretraining neighbors for Retro. This preprocessing pipeline is broken into 3 main stages: + +1. **Build retrieval chunk database** : Used for retrieving neighbors and continuation chunks, which are then passed through the retrieval encoder. +2. **Build index for similarity search** : Train and build a search index for querying chunk neighbors. +3. **Query pretraining neighbors** : For matching pretraining samples to database chunks. Neighbors are generated separately for training, validation, and test datasets. + +The following overview goes into more detail on the pipeline, code structure, usage, and pretraining. + + +# Contents + + * [Quick start](#quick-start) + * [Tutorial](#tutorial) + * [Code structure](#code-structure) + * [Arguments](#arguments) + + + + +# Quick Start +Key files: + +- `main.py` : Entry point for processing. +- `examples/preprocess_data.sh` : Example preprocessing launch (calls `main.py`). +- `examples/pretrain_data.sh` : Example pretraining launch (calls `pretrain_retro.py`). + +Use `--retro-tasks` to move through the preprocessing pipeline. + +- Simplest setup (builds everything): `--retro-tasks build` +- Alternatively, for tuning compute resources, run stages independently: + - Build retrieval database: `--retro-tasks db-build` + - Build search index: `--retro-tasks index-build` + - Query neighbors: `--retro-tasks pretraining-query-neighbors` + +Sample code flow: + +- `main.py` : Entry point (e.g., using `--retro-tasks X`). +- `db/build.py` : Build retrieval database. +- `index/build.py` : Build search index. Calls the following two files: + - `index/train.py` : Train index on subset of database. + - `index/add.py` : Add database chunks to index. +- `pretraining/query.py` : Query pretraining samples for database neighbors (saved to disk and used during pretraining). + + + +# Tutorial + +In this tutorial example, we use the Wikipedia corpus to demonstrate how we build a retrieval database and index for this corpus, and then query the pretraining datasets for their neighbors. + +## Step 1: Prepare your retrieval text corpus + +The format of text corpus follows the same format as in Megatron training. See [data precessing](../../README.md#data-preprocessing) for more details on how to convert your json dataset into the mmap format. + +Assume we have the Wikipedia corpus in the following format: + +``` +/Wikipedia_shuf_text_document.bin +/Wikipedia_shuf_text_document.idx +``` + +We note that the retrieval database can also be a blend of multiple text corpus. + +## Step 2: Build retrieval chunk database + +This *database* (stored as a 2-D array, NOT a relational database) consists of a list of chunks (traditionally length 64) extracted from the original GPT token dataset. This is simply a consecutive, non-overlapping chunking of the token dataset. Chunking only takes place within a document, and therefore the final chunk of each document has length: 1 <= chunk_length <= max_chunk_length. + +We discard chunks that would convert to an empty Bert sequence (rare case, happens ~1/100,000 chunks in our case), since we use Bert embeddings for building our index. Thus, the total number of chunks in the database will be slightly less than a naive calculation. + +Take the Wikipedia corpus as an example to build the retrieval chunk database: + +Prepare the following arguments and update our templates in [tools/retro/examples/preprocess_data.sh](examples/preprocess_data.sh): +- `--retro-workdir`: The directory in which the preprocessing pipeline saves its datasets and configuration files. + **This argument should remain consistent for a full pass through the pipeline, and for pretraining.** +- `--data-path`: text corpus path to build retrieval database. In the case of Wikipedia corpus, it could be +```bash +WIK="${DATA_HOME}/Wikipedia_shuf_text_document" + +DATA_BLEND=" \ + 1 ${WIK} \ +" +``` +- `--load`: bert path to load bert embedder +- `--vocab-file` and `--retro-bert-vocab-file`: bert vocab file +- `--retro-gpt-tokenizer-model`: gpt tokenizer model file + +Then launch the script: +```bash +bash tools/retro/examples/preprocess_data.sh db-build +``` + +After the `db-build` is finished, the output includes: +- The launching args will be saved in your `/args.json` for the following steps. +- The retrieval chunk database will be saved in your `/db/` with your dataset information in `/db/indexed_dataset_infos.json`. + +## Step 3: Build index for similarity search + +To match pretraining chunks to database chunks, a search index must be built to perform this querying. We use Faiss (https://github.com/facebookresearch/faiss) for training and building this index. Generally, the index is trained on a subset of all chunks in the database (specified via `--retro-index-ntrain`). After training, all chunks are added into the index, to be available during querying. + +Indexes only accept 1-D floating point vectors for training and adding, so each chunk must first be embedded before passing to the index for either training or adding. We use Bert embeddings for this purpose, and the embeddings are generated automatically within the pipeline. + +Take the Wikipedia corpus as an example to build the retrieval chunk database: + +```bash +bash tools/retro/examples/preprocess_data.sh index-train +``` +The `index-train` step is expected to take less than 4-hour on a single DGX-A100 node given the template index configuration. +To scale up for larger retrieval database, please carefully tune the faiss hyper-parameters specified in `--retro-index-str`. Please refer to [Faiss](https://github.com/facebookresearch/faiss/wiki/The-index-factory) to learn more about the index configuration. + +After the index is trained, the centroids, HNSW graph, and product quantizer is determined. However, the index is still empty, as there is no chunk added. + +Take the example of the Wikipedia corpus, with the default template, the output of `index-train` includes: +- The embedded Bert embeddings of the sampled chunks for `index-train` is saved in `/index/train_emb/`. +- The empty index is saved in `/index/faiss-par-add/OPQ32_64,IVF65536_HNSW8,PQ32/empty_0.970.faissindex`. + +Then we add all chunks in the retrieval database into the index so that we perform fast query over the whole retrieval database: +```bash +bash tools/retro/examples/preprocess_data.sh index-add +``` + +We note that this step can be time-consuming as it will go through the whole retrieval database, embed chunk tokens to BERT embeddings, and add them into the index. Please make sure you successfully add the whole retrieval database before moving on to the next stage. + +*In case your job is interrupted in the middle, you can just run the script again, and it will automatically skip the chunks that have been added into the index and start from the chunk where it is interrupted.* + + +Following the Wikipedia configuration, an example output of the step `index-add` includes: +- The index with retrieval data chunks added is saved in `/index/faiss-par-add/OPQ32_64,IVF65536_HNSW8,PQ32/added_0.970_0.950.faissindex`, which can be used to query the neighbors for pretraining. + +## Step 4: Query pretraining neighbors + +To ensure fast Retro pretraining, the database neighbors for pretraining samples are pre-computed and saved to disk, for efficient access within the Retro dataset. In this stage, the pretraining datasets (training, validation, and test) are iterated, each sample is broken into chunks, and the chunks are used for querying the index. Similar to when building the index, each chunk is embedded (via Bert) before querying the index. + +The saved neighbors are labeled with unique dataset properties (i.e., seed, sequence length, number of samples, etc.) to ensure the neighbors generated during preprocessing match the neighbors requested during pretraining. Please also make sure the pretraining configuration is the same as this step so that the neighbors are aligned. + +There are query-time hyper-parameters that can be tuned to improve the quality of the neighbors. These are specified in `RETRO_QUERY_EF_SEARCH` and `RETRO_QUERY_NPROBE`. The most important parameter is `RETRO_QUERY_NPROBE`, which controls the number of clusters to search during querying. This parameter can be tuned to improve the quality of the neighbors, but will also increase the query time. +We recommend following the tutorial of [faiss](https://github.com/facebookresearch/faiss/wiki/Index-IO,-cloning-and-hyper-parameter-tuning) to tune the hyper-parameters for your own retrieval database. + +Take the Wikipedia corpus as an example to query the neighbors in the retrieval database: + +```bash +bash tools/retro/examples/preprocess_data.sh query-pretraining-neighbors +``` + +The output of `query-pretraining-neighbors` on the Wikipedia corpus includes: +- `/wiki/query/train_855ab50e05151610301e2a74c4030fbc`, which contains the pre-retrieved neighbors for the pretraining dataset. +- `/wiki/query/valid_40bc7330318d64accec28e1e63c59bad`, which contains the pre-retrieved neighbors for the validation set of the pretraining corpus. + +## Step 5: Visualization of retrieval neighbors + +We also provide cli tools to help visualize and inspect the quality of your retrieved neighbors. + +To use the CLI, open a Python terminal via the `python` command, and then load a Retro workdir with the following: + +``` +from tools.retro.cli import retro +retro.init("/path/to/retro/workdir") +``` + +This initializes Megatron, and prepares the Retro data for inspection. We also print out some example commands to help you get familiar with the command lines. + +An example output for the Wikipedia Corpus: + +```text +setting number of micro-batches to constant 32 +> building BertWordPieceLowerCase tokenizer ... +> initializing torch distributed ... +> initialized tensor model parallel with size 1 +> initialized pipeline model parallel with size 1 +> compiling dataset index builder ... +... +... + > sample ratios: + dataset 0, input: 1, achieved: 1 +> size of blendable dataset: 201000 samples +> elapsed time for building blendable dataset indices: 0.00 (sec) +> building indices for blendable datasets ... + > sample ratios: + dataset 0, input: 1, achieved: 1 +> size of blendable dataset: 12864 samples +> finished creating pretrained GPT datasets ... + ++++++++++++++++++++++++++++++++++++++++++++++++++++ +examples ... [ *note*: 'db' = chunk db; 'pt' = pretraining corpus. ] ++++++++++++++++++++++++++++++++++++++++++++++++++++ + +~~~~ indexed datasets ~~~~ +retro.get_db_num_indexed_datasets() : 1 +retro.get_db_indexed_dataset_infos() : + [(1.000000, Wikipedia_shuf_text_document)] + +~~~~ counts ~~~~ +retro.get_db_num_chunks : 68104992. + +retro.get_pt_num_samples('train') : 201000. +retro.get_pt_num_samples('valid') : 12864. +retro.get_pt_num_chunks('train') : 1608000. +retro.get_pt_num_chunks('valid') : 102912. + +~~~~ tokens, text ~~~~ +retro.get_db_chunk_gpt(chunk_id) : [46809, 218340, 716, 647, ... , 251525, 872, 692, 4042] +retro.get_db_chunk_bert(chunk_id) : [10680, 16216, 4313, 1745 ... , 8117, 1007, 1012, 1997] +retro.get_db_chunk_text(chunk_id) : Jonas Geirnaert\n\nJonas ... ort Flatlife (11 min). Of +retro.get_db_chunk_and_continuation_text(chunk_id) : + ['Jonas Geirnaert Jonas Ge ... ort Flatlife (11 min). Of', + 'the copy he sent in for s ... abet, clearly has one. On'] + +retro.get_pt_sample('train', sample_id) : + { + 'dataset_idx' : 0 + 'text' : [ 676 14 40656 184 ... 4\n 276 17361 251542] + 'doc_ids' : [1246422 1596948 2403969] + 'neighbor_chunks' : [[[ 657380 657381]\n ... \n [34108760 34108761]]] + 'neighbor_tokens' : [[[ 276 9596 251511 . ... . 889 646 1723]]] + } + +(e.g., sample = retro.get_pt_sample(...)) + + sample['text'].shape : (513,) + sample['neighbor_tokens'].shape : (8, 20, 128) + sample['text'] : [ 676 14 40656 184 ... 4\n 276 17361 251542] + sample['neighbor_tokens'][17][1] : [ 14 14 30291 1 ... 682 328 379 251527] + retro.gpt_to_text(sample['text']) : also\nLatgalians (modern) ... ission criticised the AVN + retro.gpt_to_text(sample['neighbor_tokens']) : \n\nHis second marriage o ... Augusta Eardley-Wilmot (2 ++++++++++++++++++++++++++++++++++++++++++++++++++++ +``` + +We can also directly call the function `retro.print_neighbor_texts(sample_id, chunk_id)` to inspect the retrieval neighbors for a specific sample and chunk within the pretraining corpus. For example, + +```text +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +PRETRAINING CHUNK: + - also\nLatgalians (modern)\n\nReferences\n\nCategory:Defunct political parti ... e.\n\nAbout \nThe company was established established in 1997. It is listed +NEIGHBOR_CHUNKS: + - the sides.\n\nNotes\n\nReferences\n\nCategory:Obaku Zen\n*\nCategory:Japane ... 2, 2008. It was founded by Anand Jagannathan, CEO of parent company Kriyari + - 2007).\n\nSee also\n Satellite Communications\n Tonga\n\nReferences\n\nExte ... y Procter & Gamble (P&G) in 1985 in order for P&G to compete in the "beauty + - Japan\nCategory:Fish of Russia\nCategory:Fish described in 1845 Mareco Inde ... lic Opinion (WAPOR)\n European Society for Opinion and Marketing Research ( + - The current director of the company is Albert Bosch.\n\nSee also\n Coupon\n ... some articles in Basque. Deia is the main product of the Editorial Iparrag + - A.Ş have been traded on the Istanbul Stock Exchange since 2000.\n\nReferenc ... with stores in California, New York City, and London.\n\nHistory \nSnapette + - \nCategory:Hawaiian mythology\nCategory:Hawaiian religion\nCategory:Religio ... crative state contracts. In 2008 Prokom became a part of the Asseco capital + - , and the Baltic countries, as well as an online store.\n\nReferences\n\nEx ... nd are involved in intracellular trafficking. This protein does not contain + - juice producer\nFood industry of Russia\n\nReferences\n\nExternal links\nWi ... panies formerly listed on the New York Stock Exchange General Grant's March + - is in private ownership.\n\nReferences\n\nExternal links\n\nCategory:Online ... ten and directed by Brent Hodge. The film stars Aubrey Plaza, Molly Hawkey, + - company's display technology to manufacture and sell display-only engines.\ ... for a group of naval vessels (a division in naval usage).\n\nUsage\n Russia + - .\n\nCarrols also operated a chain of outlets in neighbouring Estonia from ... rama film directed by Raajeev Walia. It is produced by Aman Mehta and Bijal + - \n\nExternal links\nHightail website\nThe Next Web on YouSendIt rebrand to ... eptember 2014, sitting mainly in the criminal division of that court.\n\nBe + - American television seasons\nCategory:2014 American television seasons\nCat ... Canada and larger European cities.\n\nIn 2010, advertising in New Zealand, + - .\n\nNotes\n\nCategory:Trade unions\nCategory:Industrial Workers of the Wor ... x people, some of whom may have been working on a part-time basis. Its head + - \n List of podcasting companies\n\nReferences\n\nExternal links\n \n\nCateg ... ct.\n\nCategory:Populated places in the Ashanti Region Nkeirouka Ezekh\n\nN + - \n\nReferences\n\nExternal links\n ADESE official website\n\nCategory:Compa ... State Street, and UBS Warburg. Its first CEO was Ian M. Drachman. The firm + - Hotel\n Sulake Corporation\n Sulake Press Room\n Habbo Hotel - Blog\n\nCate ... l: 김진태; born December 19, 1980), better known by his stage name Verbal Jint + - hockey player\n Ruutu.fi, a Finnish television streaming service operated b ... from the bottom, a BDSM term\n Topping cycle, a cycle used in power plants + - of Surakarta\nCategory:Indonesian names\nCategory:Indonesian families\nCate ... mber 13, 2013 in Izhevsk on Universitetskaya Street (later it was given the + - facilities are also in Ankara and the company HQ is in Istanbul.\n\nReferen ... is currently a World Wide Web Consortium Working Draft.\n\nSee also\n Voice +``` + +The code snippet for the above example is also equivalent to +```python +tokens = retro.get_pt_sample('train', 0) +for token_ids in tokens["neighbor_tokens"][0]: + print("- %s" % (retro.gpt_to_text(token_ids))) + print("-" * 20) +``` + +# Code structure + +### `tools/retro/main.py` + +This is the main entry point for Retro preprocessing. Call `main.py --help` to see arguments. Additionally, some Retro arguments are in Megatron's core arguments, so also see `add_retro_args()` section of `megatron/arguments.py` for additional arguments. Two of the most important arguments to customize are `--retro-workdir` and `--retro-tasks`. + +- **`--retro-workdir`** : Set the directory in which the preprocessing pipeline saves its datasets and configuration files. This argument should remain consistent for a full pass through the pipeline, and for pretraining. + +- **`--retro-tasks`** : Set the stages of preprocessing to perform. As mentioned previously, the three high-level stages are: 1) build retrieval database, 2) build search index, and 3) query pretraining neighbors. `--retro-tasks` can be used to either run the full pipeline, or run each of these stages in isolation. The latter case is useful for tuning compute resources for each stage. For example, index training utilizes GPUs and requires relatively less time, while querying neighbors uses the CPU and is a relatively slow process. Example tasks include: + + - **`--retro-tasks build`** : Run entire preprocessing pipeline. + - **`--retro-tasks db-build`** : Build retrieval database. + - **`--retro-tasks index-build`** : Train and build search index. + - **`--retro-tasks pretraining-query-neighbors`** : Query pretraining neighbors. + +Multiple tasks can be specified by separating with commas (e.g., `--retro-tasks db-build,index-build`). Additionally, various 'miscellaneous' tasks are currently including, primarily for validating data for each stage; these task names can be seen in `main.py`. + +### `tools/retro/examples` + +Example scripts for setting arguments and launch Retro preprocessing. The key files here are: + +- **`preprocess_data.sh`** : Example launch script for preprocessing retro data. +- **`pretrain_model.sh`** : Example launch script for pretraining a retro model. + +### `tools/retro/db` + +Build the retrieval chunk database. The key files here are: + +- **`build.py`** : Entry point for building the database. This code is responsible for iterating the input datasets (i.e., `--data-path`), parsing each dataset into consecutive chunks, checking for empty Bert (Wordpiece) conversions, and storing this information to disk. Two databases are created: 1) the retrieval database, and 2) a sampled database used for training the search index. +- **`dataset.py`** : Defines database class, for iterating or accessing chunks in the database. Each chunk contains its tokens, Bert conversion length, and dataset index. + +Input data: + + +- Token datasets, as loaded by `gpt_dataset.py`. Multiple datasets can be specified by using a blended configuration (see `--data-path` in `megatron/arguments.py`). + +Output data: + +- **`/db/merged/train.hdf5`** : The main retrieval database. (*Database* here is used to denote a list of indexed chunks, rather than a *relational database*.) The chunks in this database are added to the search index, and are used for retrieval during pretraining. This file contains a single dataset `'chunks'`, which contains 5 columns: + + - `dataset_idx` : Dataset index, from list of blended indexed datasets. + - `document_idx` : Document index within dataset. + - `chunk_start_idx` : Chunk's starting token index within document. + - `chunk_end_idx` : Chunk's ending token index (exclusive) within document. + - `bert_chunk_length` : Length of Bert token sequence, after converting from GPT. + +- **`/db/merged/sampled.hdf5`** : Subset of training database that is used for training the search index. This file has the same structure as detailed above. In general, this database is significanly smaller than the `train.hdf5` database, since the search index only needs a relatively small number of samples to understand the data's structure. After training, all chunks in the main database (`train.hdf5`) are *added* to the search index. + +### `tools/retro/index` + +Build the search index. The key files here are: + +- `build.py` : Entry point for building the search index. First, the index is trained on the sampled chunk database (see above) by calling `train.py`, and then all chunks for the full database are added to the index by calling `add.py`. Note that training requires first embedding (using Bert) all chunks (a parallel operation), and then loading these embeddings and training the index (a sequential operation), so it's best to change one's compute setup after all chunks have been embedded and saved to disk. +- `indexes/faiss_base.py` : Wrapper class for building a Faiss index, following the standard `train()` and `add()` operations. +- `indexes/faiss_par_add.py` : Similar to above, except it uses an embarrassingly parallel (multi-node, multi-process) `add()` operation. Vectors are first added to separate index copies, and then merged together. + +Input data: + +- **`/db/merged/sampled.hdf5`** : Chunks used for training the search index. +- **`/db/merged/train.hdf5`** : Chunks used for adding to the *trained* search index. + +Output data: + +- **`/index///added.faissindex`** : The final index, which has been trained and has had all database chunks added to it. This index is ready for querying neighbors. Here, `RETRO_INDEX_TYPE` and `RETRO_INDEX_STR` correspond to the same-name arguments `--retro-index-type` (e.g., `faiss-par-add`) and `--retro-index-str` (e.g., `OPQ32_256,IVF4194304_HNSW32,PQ32`). +- **`/index///empty.faissindex`** : Generally can be discarded once `added.faissindex` has been built, but this file contains the *post-training*, *pre-adding* index. Useful for debugging or building other indexes. + +### `tools/retro/pretraining` + +Query the pretraining datasets (training, validation, test) for their neighbors within the database. Neighbors are queried during preprocessing -- rather than during pretraining -- because querying is a fairly slow operation, so it would be a bottleneck if performed during pretraining. Queried neighbors are tagged with their unique identifying information (e.g., `train_indexmap_27662746ns_2048sl_1234s`), so as to avoid incorrect references during pretraining. The key files here are: + +- **`query.py`** : Entry point for querying. The pretraining datasets are iterated, and each chunk within each sample is queried using the search index. These neighbors are filtered by discarding any database chunks that fall within the same document as any chunk within a pretraining sample. +- **`chunk_dataset.py`** : This creates an iterable 'chunk' dataset form of a pretraining dataset. This is just a light wrapper, but makes it easier to deterministically iterate and assign IDs to each chunk in a sample dataset. +- **`retro_dataset.py`** : The Retro dataset used for pretraining (not used in preprocessing). Each sample returns the sample tokens, along with neighbor tokens for each chunk within the sample. + +Input data: + +- Token datasets, as loaded by `gpt_dataset.py`. +- **`/index///added.faissindex`** : The trained index, with all database chunks added to it (see previous section for details). + +Output data: + +- **`/{train,valid,test}_XXns_YYsl_ZZs/WW.hdf5`** : These directories/files contain the indexes of neighbors for each chunk within each sample of the pretraining datasets. Each directory (e.g., `train_indexmap_2047435ns_2048sl_1234s`) contains a list of HDF5 files (e.g., one file might be called `0075700000-0075800000.hdf5`). Each HDF5 file contains a consecutive subset of neighbor IDs for a given chunk, for indexing into the main retrieval database. All HDF5 files taken together within a given directory, represent the entire set of neighbors for a dataset. The size of these HDF5 files is determined by the argument `--retro-block-size`. The `XX`, `YY`, `ZZ`, `WW` notation above denotes the dataset properties that are used for uniquely tagging the neighbor files, to ensure compatibility during model pretraining. These neighbor files are ultimated used by `retro_dataset.py` during pretraining, for building Retro samples. + +### `tools/retro/cli` + +Inspect preprocessed data. To use the CLI, open a Python terminal via the `python` command, and then load a Retro workdir with the following: + +``` +from tools.retro.cli import retro +retro.init("/path/to/retro/workdir") +``` + +This initializes Megatron, and prepares the Retro data for inspection. See the printed usage for available functions. Several routines are included for viewing data in the retrieval database and viewing pretraining samples and neighbors. For example: + +```python +retro.get_db_num_indexed_datasets() # 15 +retro.get_db_chunk_text(92874113) # 'research project at ... and philosophy' +retro.get_pt_sample('train', 62005) # '[16084, 26158, 25387 ..., 6898, 9568]' +``` + +Most methods within the CLI are prefixed to denote the data being inspected: + +- **'db'** : Retrieval database (i.e., chunk tokens, document IDs, and dataset IDs) +- **'pt'** : Pretraining datasets (i.e., sample tokens and neighbor tokens) + +### `tools/retro/utils.py` + +A collection of utility methods. Most importantly, this contains: + +- **`def get_gpt_tokenizer()`** : Get the GPT tokenizer. +- **`def get_bert_tokenizer()`** : Get the Bert tokenizer. +- **`class GPTToTextDataset`** : Wrapper class that converts GPT (BPE) samples to raw text. + +### `tools/bert_embedding` + +Generate Bert embeddings. The main files here are: + +- **`embed.py`** : Entry point for generating embeddings, and contains the two main embedding classes, `BertEmbedder` and `DiskDataParallelBertEmbedder` (more below). This file contains code for generating Megatron embeddings, while the file below contains code for Huggingface embeddings. +- **`huggingface.py`** : Used by `embed.py` when the embedder is configured (see below) to output Huggingface embeddings. +- **`dataset.py`** : Wrapper class for converting a raw-text dataset to Bert (Wordpiece) tokens. + +The Bert embeddings can be configured along two axes. The first axis is the output type: + +- **`class BertEmbedder`** : This class takes a raw-text dataset as input, generates its embeddings, and returns a Numpy array. The main functions are `embed_text_dataset` (accepts a raw-text dataset) and `embed_text` (accepts a string). +- **`class DiskDataParallelBertEmbedder`** : This class wraps `BertEmbedder`, and rather than returning a Numpy array, it saves the embeddings to disk. Additionally, this class automatically splits data across data parallel ranks (using interleaving), and also processes data in a specified `block_size` (e.g., 1,000,000). + +The second axis is the type of embedding model to use, controlled by the argument `--bert-embedder-type`: + +- **`--bert-embedder-type megatron`** : Use Megatron's Bert model. The specific model used is dependent on the loaded checkpoint, vocab file, and tokenizer. +- **`--bert-embedder-type huggingface`** : Use Huggingface's `bert-large-cased`. (*Note*: Huggingface's inclusion is likely to be deprecated; and there is no ability to configure cased/uncased.) + +### Pretraining + +- **`pretrain_retro.py`** : Launch script for pretraining Retro. Similar to `pretrain_gpt.py`, except this script handles loading neighbor tokens and setting up the neighbor attention mask. + +- **`megatron/model/retro_transformer.py`** : Implementation of Retro model, including the main transformer, the retrieval encoder, and chunked cross-attention layers. Note that currently, `retro_transformer.py` contains several classes that are nearly identical to `transformer.py`, except for 1 or 2 lines, due to code changes that are yet to be integrated. +- **`tools/retro/pretraining/retro_dataset.py`** : The Retro dataset used for pretraining (not used in preprocessing). Each sample returns the sample tokens, along with neighbor tokens for each chunk within the sample. + + + +# Arguments + +See `tools/retro/main.py`'s `add_retro_args()` and `megatron/arguments.py`'s `_add_retro_args()` for details and descriptions. Here we list some particularly important arguments: + +- `--retro-workdir` : Mentioned previously, this argument determines the directory in which a set of Retro data is stored (during preprocessing) and loaded (during pretraining). Any change in this directory during preprocessing may result in preprocessing starting over from scratch, and any change before pretraining will result in pretraining throwing an error. +- Preprocessing + - `--retro-gpt-chunk-length` : Retro chunk length (e.g., 64 in original paper). + - `--retro-tasks` : Comma-separated list of preprocessing tasks. Generally, the `build` task is the simplest way to run the preprocessing pipeline. For finer control, individual stages can be run by using tasks (in order): `db-build`, `index-build`, and `pretraining-query-neighbors`. + - `--retro-index-str` : Faiss index string that defines the index configuration. This will vary based on data size, compute/disk setup, and user needs. For example, this string looks something like `IVF262144_HNSW32,Flat` or `OPQ32_256,IVF4194304_HNSW32,PQ32`. +- Pretraining + - `--retro-add-retriever` : Must be used to select Retro model. + - `--retro-num-neighbors` : Number of neighbors to retrieve from the retrieval database (defaults to 2). + - `--retro-num-retrieved-chunks` : For each neighbor, the number consecutive chunks to retrieve, including the initial neighbor (defaults to 2). + - `--retro-attention-gate` : Gated mechanism to incorporate information of cross attention from retrieved neighbor (defaults to 1 during pretraining). + + + + + diff --git a/tools/retro/cli/__init__.py b/tools/retro/cli/__init__.py new file mode 100644 index 0000000..2531017 --- /dev/null +++ b/tools/retro/cli/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from .cli import retro diff --git a/tools/retro/cli/__main__.py b/tools/retro/cli/__main__.py new file mode 100644 index 0000000..37d096a --- /dev/null +++ b/tools/retro/cli/__main__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import os + +from . import retro + + +if __name__ == "__main__": + retro.init(os.environ["RETRO_PROJECT_DIR"]) diff --git a/tools/retro/cli/cli.py b/tools/retro/cli/cli.py new file mode 100644 index 0000000..2a75679 --- /dev/null +++ b/tools/retro/cli/cli.py @@ -0,0 +1,301 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import json +import numpy as np +import os +import typing as T +from types import SimpleNamespace + +from megatron.training.arguments import load_retro_config, parse_args, validate_args +from megatron.core.datasets.retro.db.dataset import DBDataset +from megatron.core.datasets.retro.db.utils import ( + get_indexed_dataset_infos as get_db_indexed_dataset_infos, + get_merged_train_dataset as get_db_dataset, +) +from megatron.core.datasets.retro.query.retro_dataset import get_retro_datasets, RetroDataset +from megatron.training.global_vars import set_global_variables +from megatron.training.training import build_train_valid_test_datasets, update_train_iters +from pretrain_retro import train_valid_test_datasets_provider +from tools.retro.preprocess_data import get_tokenizers + + +def shorten_str(s: str, n: int) -> str: + s = "\\n".join(s.splitlines()) + return s if len(s) <= n else "%s ... %s" % (s[: n // 2], s[-n // 2 :]) + + +class retro: + + config = None + + ############################################## + # initialize. + ############################################## + + @classmethod + def init(cls, project_dir: str) -> None: + '''Initialize Megatron, tokenizers, and datasets.''' + + # Megatron args. + args = parse_args(extra_args_provider=None, ignore_unknown_args=False) + args.retro_project_dir = project_dir + args.micro_batch_size = 1 + args.num_layers = 1 + args.hidden_size = 1 + args.num_attention_heads = 1 + args.async_tensor_model_parallel_allreduce = False + args.retro_add_retriever = True # for building RetroDataset + validate_args(args) + set_global_variables(args) + update_train_iters(args) + + # Retro config. + cls.config = load_retro_config(project_dir) + cls.config.retro_project_dir = project_dir + cls.config.retro_tokenizers = get_tokenizers(cls.config) + + # Chunk database dataset. + cls.db_indexed_dataset_infos = get_db_indexed_dataset_infos(project_dir) + cls.db_dataset = get_db_dataset(project_dir, + cls.config.retro_gpt_chunk_length, + cls.config.retro_tokenizers.gpt.eod) + + # Pretraining datasets. + pt_train_ds, pt_valid_ds, pt_test_ds = build_train_valid_test_datasets( + train_valid_test_datasets_provider) + cls.pt_datasets = SimpleNamespace( + train=pt_train_ds, + valid=pt_valid_ds, + test=pt_test_ds, + ) + + # Print usage. + cls.print_usage() + + ############################################## + # utils. + ############################################## + + @classmethod + def gpt_to_text(cls, token_ids: np.ndarray) -> str: + '''GPT tokens to text.''' + return cls.config.retro_tokenizers.gpt.detokenize( + token_ids.tolist() if isinstance(token_ids, np.ndarray) else token_ids + ) + + @classmethod + def text_to_bert(cls, text: str) -> np.ndarray: + '''Text to Bert tokens.''' + return cls.config.retro_tokenizers.bert.tokenize(text) + + ############################################## + # chunk db. + ############################################## + + @classmethod + def get_db_num_indexed_datasets(cls) -> int: + '''Number of indexed datasets within blended dataset.''' + return len(cls.db_indexed_dataset_infos) + + @classmethod + def get_db_indexed_dataset_infos(cls) -> T.List[T.Tuple[float, str]]: + '''Dataset infos, including number of training & sampled sets.''' + return [(info["ratio"], info["prefix"]) for info in cls.db_indexed_dataset_infos] + + @classmethod + def get_db_dataset(cls) -> DBDataset: + return cls.db_dataset + + @classmethod + def get_db_num_chunks(cls) -> int: + '''Number of DB chunks.''' + return len(cls.get_db_dataset()) + + @classmethod + def get_db_chunk_gpt(cls, idx: int) -> T.List[int]: + '''Get DB chunk as GPT token ids.''' + return cls.get_db_dataset()[idx]["text"].tolist() + + @classmethod + def get_db_chunk_bert(cls, idx: int) -> T.List[int]: + '''Get DB chunk as Bert token ids.''' + return cls.text_to_bert(cls.get_db_chunk_text(idx)) + + @classmethod + def get_db_chunk_text(cls, idx: int) -> str: + '''Get DB chunk as text.''' + return cls.gpt_to_text(cls.get_db_chunk_gpt(idx)) + + @classmethod + def get_db_chunk_and_continuation_text(cls, idx: int) -> T.List[str]: + '''Get DB chunk along with continuation, as text.''' + + # Modulus used here to match original implementation (i.e., last + # chunks continuation wraps around to first chunk). + return [ + cls.get_db_chunk_text(idx), + cls.get_db_chunk_text((idx + 1) % len(cls.get_db_dataset())), + ] + + ############################################## + # pretraining corpus. + ############################################## + + @classmethod + def get_pt_num_samples_and_chunks(cls, data_key: str) -> T.Tuple[int, int]: + '''Number of samples & chunks (e.g., 32*n_samples) in corpus.''' + assert hasattr(cls.pt_datasets, data_key), ( + "pretraining set '%s' not found (choices: %s)." + % (data_key, ", ".join(vars(cls.pt_datasets).keys())) + ) + chunk_dataset = getattr(cls.pt_datasets, data_key).chunk_dataset + return ( + len(chunk_dataset.sample_dataset), + len(chunk_dataset), + ) + + @classmethod + def get_pt_num_samples(cls, data_key: str) -> int: + '''Number of pretraining samples.''' + return cls.get_pt_num_samples_and_chunks(data_key)[0] + + @classmethod + def get_pt_num_chunks(cls, data_key: str) -> int: + '''Number of pretraining chunks (e.g., 32*n_samples).''' + return cls.get_pt_num_samples_and_chunks(data_key)[1] + + @classmethod + def get_pt_dataset(cls, data_key: str) -> RetroDataset: + return getattr(cls.pt_datasets, data_key) + + @classmethod + def get_pt_sample(cls, data_key: str, idx: int) -> dict: + return getattr(cls.pt_datasets, data_key)[idx] + + @classmethod + def get_neighbor_tokens(cls, sample_id: int, chunk_id: int, data_key: str="train") -> T.Optional[dict]: + try: + sample = cls.get_pt_sample(data_key, sample_id) + sample_token_ids = sample["text"] + chunk_length = cls.args.retro_gpt_chunk_length + chunk_start_idx = chunk_id * chunk_length + chunk_end_idx = min(sample_token_ids.shape[0], chunk_start_idx + chunk_length) + chunk_token_ids = sample_token_ids[chunk_start_idx:chunk_end_idx] + neighbor_token_ids = sample["neighbor_tokens"][chunk_id] + return { + "chunk_tokens": chunk_token_ids, + "neighbor_tokens": neighbor_token_ids, + } + except: + return None + + @classmethod + def print_neighbor_texts(cls, sample_id: int, chunk_id: int, data_key: str="train") -> None: + tokens: dict = cls.get_neighbor_tokens(sample_id, chunk_id, data_key) + print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") + try: + print("PRETRAINING CHUNK:") + print(" - %s" % shorten_str(cls.gpt_to_text(tokens["chunk_tokens"]), 150)) + print("NEIGHBOR_CHUNKS:") + for token_ids in tokens["neighbor_tokens"]: + print(" - %s" % shorten_str(cls.gpt_to_text(token_ids), 150)) + except: + print("" % sample_id) + + ############################################## + # usage. + ############################################## + + @classmethod + def print_usage(cls) -> None: + '''Print usage.''' + + print() + print("+++++++++++++++++++++++++++++++++++++++++++++++++++") + print("examples ... [ *note*: 'db' = chunk db; 'pt' = pretraining corpus. ]") + print("+++++++++++++++++++++++++++++++++++++++++++++++++++") + + print() + print("~~~~ indexed datasets ~~~~") + print("retro.get_db_num_indexed_datasets() : %s" % cls.get_db_num_indexed_datasets()) + print("retro.get_db_indexed_dataset_infos() :") + for i, (ratio, prefix) in enumerate(cls.get_db_indexed_dataset_infos()): + print( + " %s(%f, %s)%s" + % ( + "[" if i == 0 else " ", + ratio, + prefix, + "]" if i == len(cls.db_indexed_dataset_infos) - 1 else ",", + ) + ) + + print() + print("~~~~ counts ~~~~") + print("retro.get_db_num_chunks : %d." % cls.get_db_num_chunks()) + + print() + for sq_key in ("sample", "chunk"): + for data_key in ("train", "valid"): # test? + print( + "retro.get_pt_num_%ss('%s') : %d." + % (sq_key, data_key, getattr(cls, f"get_pt_num_{sq_key}s")(data_key)) + ) + + print() + print("~~~~ tokens, text ~~~~") + print( + "retro.get_db_chunk_gpt(chunk_id) : %s" + % shorten_str(str(retro.get_db_chunk_gpt(0)), 50) + ) + print( + "retro.get_db_chunk_bert(chunk_id) : %s" + % shorten_str(str(retro.get_db_chunk_bert(0)), 50) + ) + print( + "retro.get_db_chunk_text(chunk_id) : %s" + % shorten_str(retro.get_db_chunk_text(0).strip(), 50) + ) + print("retro.get_db_chunk_and_continuation_text(chunk_id) :") + for i, t in enumerate(retro.get_db_chunk_and_continuation_text(0)): + print( + " %s'%s'%s" + % ( + "[" if i == 0 else " ", + shorten_str(t.strip().replace("\n", " "), 50), + "]" if i == 1 else ",", + ) + ) + + sample = cls.get_pt_sample("train", 0) + sample_chunk_id = sample["neighbor_tokens"].shape[0] // 2 + sample_neighbor_id = 0 + print() + print("retro.get_pt_sample('train', sample_id) :") + print(" {") + for k, v in sample.items(): + print(" '%s' : %s" % (k, shorten_str(str(v), 50))) + print(" }") + + print() + print("(e.g., sample = retro.get_pt_sample(...))") + print() + print(" sample['text'].shape : %s" % str(sample["text"].shape)) + print(" sample['neighbor_tokens'].shape : %s" % str(sample["neighbor_tokens"].shape)) + print(" sample['text'] : %s" % shorten_str(str(sample["text"]), 50)) + print( + " sample['neighbor_tokens'][17][1] : %s" + % shorten_str(str(sample["neighbor_tokens"][sample_chunk_id][sample_neighbor_id]), 50) + ) + print( + " retro.gpt_to_text(sample['text']) : %s" + % shorten_str(cls.gpt_to_text(sample["text"]), 50) + ) + print( + " retro.gpt_to_text(sample['neighbor_tokens']) : %s" + % shorten_str( + cls.gpt_to_text(sample["neighbor_tokens"][sample_chunk_id][sample_neighbor_id]), 50 + ) + ) + + print("+++++++++++++++++++++++++++++++++++++++++++++++++++") diff --git a/tools/retro/config_utils.py b/tools/retro/config_utils.py new file mode 100644 index 0000000..00676c6 --- /dev/null +++ b/tools/retro/config_utils.py @@ -0,0 +1,632 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Config utils.""" + +import argparse +from collections import namedtuple, OrderedDict +import dataclasses +import enum +import inspect +import os +import re +import types +import typing as T + + +PARAM_KEYWORDS = { + "param", + "parameter", + "arg", + "argument", + "attribute", + "key", + "keyword", +} +RAISES_KEYWORDS = {"raises", "raise", "except", "exception"} +DEPRECATION_KEYWORDS = {"deprecation", "deprecated"} +RETURNS_KEYWORDS = {"return", "returns"} +YIELDS_KEYWORDS = {"yield", "yields"} +EXAMPLES_KEYWORDS = {"example", "examples"} + + +class ParseError(RuntimeError): + """Base class for all parsing related errors.""" + + +class DocstringStyle(enum.Enum): + """Docstring style.""" + + REST = 1 + GOOGLE = 2 + NUMPYDOC = 3 + EPYDOC = 4 + AUTO = 255 + + +class RenderingStyle(enum.Enum): + """Rendering style when unparsing parsed docstrings.""" + + COMPACT = 1 + CLEAN = 2 + EXPANDED = 3 + + +class DocstringMeta: + """Docstring meta information. + + Symbolizes lines in form of + + :param arg: description + :raises ValueError: if something happens + """ + + def __init__( + self, args: T.List[str], description: T.Optional[str] + ) -> None: + """Initialize self. + + :param args: list of arguments. The exact content of this variable is + dependent on the kind of docstring; it's used to distinguish + between custom docstring meta information items. + :param description: associated docstring description. + """ + self.args = args + self.description = description + + +class DocstringParam(DocstringMeta): + """DocstringMeta symbolizing :param metadata.""" + + def __init__( + self, + args: T.List[str], + description: T.Optional[str], + arg_name: str, + type_name: T.Optional[str], + is_optional: T.Optional[bool], + default: T.Optional[str], + ) -> None: + """Initialize self.""" + super().__init__(args, description) + self.arg_name = arg_name + self.type_name = type_name + self.is_optional = is_optional + self.default = default + + +class DocstringReturns(DocstringMeta): + """DocstringMeta symbolizing :returns or :yields metadata.""" + + def __init__( + self, + args: T.List[str], + description: T.Optional[str], + type_name: T.Optional[str], + is_generator: bool, + return_name: T.Optional[str] = None, + ) -> None: + """Initialize self.""" + super().__init__(args, description) + self.type_name = type_name + self.is_generator = is_generator + self.return_name = return_name + + +class DocstringRaises(DocstringMeta): + """DocstringMeta symbolizing :raises metadata.""" + + def __init__( + self, + args: T.List[str], + description: T.Optional[str], + type_name: T.Optional[str], + ) -> None: + """Initialize self.""" + super().__init__(args, description) + self.type_name = type_name + self.description = description + + +class DocstringDeprecated(DocstringMeta): + """DocstringMeta symbolizing deprecation metadata.""" + + def __init__( + self, + args: T.List[str], + description: T.Optional[str], + version: T.Optional[str], + ) -> None: + """Initialize self.""" + super().__init__(args, description) + self.version = version + self.description = description + + +class DocstringExample(DocstringMeta): + """DocstringMeta symbolizing example metadata.""" + + def __init__( + self, + args: T.List[str], + snippet: T.Optional[str], + description: T.Optional[str], + ) -> None: + """Initialize self.""" + super().__init__(args, description) + self.snippet = snippet + self.description = description + + +class Docstring: + """Docstring object representation.""" + + def __init__( + self, + style=None, # type: T.Optional[DocstringStyle] + ) -> None: + """Initialize self.""" + self.short_description = None # type: T.Optional[str] + self.long_description = None # type: T.Optional[str] + self.blank_after_short_description = False + self.blank_after_long_description = False + self.meta = [] # type: T.List[DocstringMeta] + self.style = style # type: T.Optional[DocstringStyle] + + @property + def params(self) -> T.List[DocstringParam]: + """Return a list of information on function params.""" + return {m.arg_name:m for m in self.meta if isinstance(m, DocstringParam)} + + @property + def raises(self) -> T.List[DocstringRaises]: + """Return a list of information on the exceptions that the function + may raise. + """ + return [ + item for item in self.meta if isinstance(item, DocstringRaises) + ] + + @property + def returns(self) -> T.Optional[DocstringReturns]: + """Return a single information on function return. + + Takes the first return information. + """ + for item in self.meta: + if isinstance(item, DocstringReturns): + return item + return None + + @property + def many_returns(self) -> T.List[DocstringReturns]: + """Return a list of information on function return.""" + return [ + item for item in self.meta if isinstance(item, DocstringReturns) + ] + + @property + def deprecation(self) -> T.Optional[DocstringDeprecated]: + """Return a single information on function deprecation notes.""" + for item in self.meta: + if isinstance(item, DocstringDeprecated): + return item + return None + + @property + def examples(self) -> T.List[DocstringExample]: + """Return a list of information on function examples.""" + return [ + item for item in self.meta if isinstance(item, DocstringExample) + ] + + +class SectionType(enum.IntEnum): + """Types of sections.""" + + SINGULAR = 0 + """For sections like examples.""" + + MULTIPLE = 1 + """For sections like params.""" + + SINGULAR_OR_MULTIPLE = 2 + """For sections like returns or yields.""" + + +class Section(namedtuple("SectionBase", "title key type")): + """A docstring section.""" + + +GOOGLE_TYPED_ARG_REGEX = re.compile(r"\s*(.+?)\s*\(\s*(.*[^\s]+)\s*\)") +GOOGLE_ARG_DESC_REGEX = re.compile(r".*\. Defaults to (.+)\.") +MULTIPLE_PATTERN = re.compile(r"(\s*[^:\s]+:)|([^:]*\]:.*)") + +DEFAULT_SECTIONS = [ + Section("Arguments", "param", SectionType.MULTIPLE), + Section("Args", "param", SectionType.MULTIPLE), + Section("Parameters", "param", SectionType.MULTIPLE), + Section("Params", "param", SectionType.MULTIPLE), + Section("Raises", "raises", SectionType.MULTIPLE), + Section("Exceptions", "raises", SectionType.MULTIPLE), + Section("Except", "raises", SectionType.MULTIPLE), + Section("Attributes", "attribute", SectionType.MULTIPLE), + Section("Example", "examples", SectionType.SINGULAR), + Section("Examples", "examples", SectionType.SINGULAR), + Section("Returns", "returns", SectionType.SINGULAR_OR_MULTIPLE), + Section("Yields", "yields", SectionType.SINGULAR_OR_MULTIPLE), +] + + +class GoogleDocstringParser: + """Parser for Google-style docstrings.""" + + def __init__( + self, sections: T.Optional[T.List[Section]] = None, title_colon=True + ): + """Setup sections. + + :param sections: Recognized sections or None to defaults. + :param title_colon: require colon after section title. + """ + if not sections: + sections = DEFAULT_SECTIONS + self.sections = {s.title: s for s in sections} + self.title_colon = title_colon + self._setup() + + def _setup(self): + if self.title_colon: + colon = ":" + else: + colon = "" + self.titles_re = re.compile( + "^(" + + "|".join(f"({t})" for t in self.sections) + + ")" + + colon + + "[ \t\r\f\v]*$", + flags=re.M, + ) + + def _build_meta(self, text: str, title: str) -> DocstringMeta: + """Build docstring element. + + :param text: docstring element text + :param title: title of section containing element + :return: + """ + + section = self.sections[title] + + if ( + section.type == SectionType.SINGULAR_OR_MULTIPLE + and not MULTIPLE_PATTERN.match(text) + ) or section.type == SectionType.SINGULAR: + return self._build_single_meta(section, text) + + if ":" not in text: + # raise ParseError(f"Expected a colon in {text!r}.") + return None + + # Split spec and description + before, desc = text.split(":", 1) + if desc: + desc = desc[1:] if desc[0] == " " else desc + if "\n" in desc: + first_line, rest = desc.split("\n", 1) + desc = first_line + "\n" + inspect.cleandoc(rest) + desc = desc.strip("\n") + + return self._build_multi_meta(section, before, desc) + + @staticmethod + def _build_single_meta(section: Section, desc: str) -> DocstringMeta: + if section.key in RETURNS_KEYWORDS | YIELDS_KEYWORDS: + return DocstringReturns( + args=[section.key], + description=desc, + type_name=None, + is_generator=section.key in YIELDS_KEYWORDS, + ) + if section.key in RAISES_KEYWORDS: + return DocstringRaises( + args=[section.key], description=desc, type_name=None + ) + if section.key in EXAMPLES_KEYWORDS: + return DocstringExample( + args=[section.key], snippet=None, description=desc + ) + if section.key in PARAM_KEYWORDS: + raise ParseError("Expected paramenter name.") + return DocstringMeta(args=[section.key], description=desc) + + @staticmethod + def _build_multi_meta( + section: Section, before: str, desc: str + ) -> DocstringMeta: + if section.key in PARAM_KEYWORDS: + match = GOOGLE_TYPED_ARG_REGEX.match(before) + if match: + arg_name, type_name = match.group(1, 2) + if type_name.endswith(", optional"): + is_optional = True + type_name = type_name[:-10] + elif type_name.endswith("?"): + is_optional = True + type_name = type_name[:-1] + else: + is_optional = False + else: + arg_name, type_name = before, None + is_optional = None + + match = GOOGLE_ARG_DESC_REGEX.match(desc) + default = match.group(1) if match else None + + return DocstringParam( + args=[section.key, before], + description=desc, + arg_name=arg_name, + type_name=type_name, + is_optional=is_optional, + default=default, + ) + if section.key in RETURNS_KEYWORDS | YIELDS_KEYWORDS: + return DocstringReturns( + args=[section.key, before], + description=desc, + type_name=before, + is_generator=section.key in YIELDS_KEYWORDS, + ) + if section.key in RAISES_KEYWORDS: + return DocstringRaises( + args=[section.key, before], description=desc, type_name=before + ) + return DocstringMeta(args=[section.key, before], description=desc) + + def add_section(self, section: Section): + """Add or replace a section. + + :param section: The new section. + """ + + self.sections[section.title] = section + self._setup() + + def parse(self, text: str) -> Docstring: + """Parse the Google-style docstring into its components. + + :returns: parsed docstring + """ + ret = Docstring(style=DocstringStyle.GOOGLE) + if not text: + return ret + + # Clean according to PEP-0257 + text = inspect.cleandoc(text) + + # Find first title and split on its position + match = self.titles_re.search(text) + if match: + desc_chunk = text[: match.start()] + meta_chunk = text[match.start() :] + else: + desc_chunk = text + meta_chunk = "" + + # Break description into short and long parts + parts = desc_chunk.split("\n", 1) + ret.short_description = parts[0] or None + if len(parts) > 1: + long_desc_chunk = parts[1] or "" + ret.blank_after_short_description = long_desc_chunk.startswith( + "\n" + ) + ret.blank_after_long_description = long_desc_chunk.endswith("\n\n") + ret.long_description = long_desc_chunk.strip() or None + + # Split by sections determined by titles + matches = list(self.titles_re.finditer(meta_chunk)) + if not matches: + return ret + splits = [] + for j in range(len(matches) - 1): + splits.append((matches[j].end(), matches[j + 1].start())) + splits.append((matches[-1].end(), len(meta_chunk))) + + chunks = OrderedDict() # type: T.Mapping[str,str] + for j, (start, end) in enumerate(splits): + title = matches[j].group(1) + if title not in self.sections: + continue + + # Clear Any Unknown Meta + # Ref: https://github.com/rr-/docstring_parser/issues/29 + meta_details = meta_chunk[start:end] + unknown_meta = re.search(r"\n\S", meta_details) + if unknown_meta is not None: + meta_details = meta_details[: unknown_meta.start()] + + chunks[title] = meta_details.strip("\n") + if not chunks: + return ret + + # Add elements from each chunk + for title, chunk in chunks.items(): + # Determine indent + indent_match = re.search(r"^\s*", chunk) + if not indent_match: + raise ParseError(f'Can\'t infer indent from "{chunk}"') + indent = indent_match.group() + + # Check for singular elements + if self.sections[title].type in [ + SectionType.SINGULAR, + SectionType.SINGULAR_OR_MULTIPLE, + ]: + part = inspect.cleandoc(chunk) + ret.meta.append(self._build_meta(part, title)) + continue + + # Split based on lines which have exactly that indent + _re = "^" + indent + r"(?=\S)" + c_matches = list(re.finditer(_re, chunk, flags=re.M)) + if not c_matches: + raise ParseError(f'No specification for "{title}": "{chunk}"') + c_splits = [] + for j in range(len(c_matches) - 1): + c_splits.append((c_matches[j].end(), c_matches[j + 1].start())) + c_splits.append((c_matches[-1].end(), len(chunk))) + for j, (start, end) in enumerate(c_splits): + part = chunk[start:end].strip("\n") + ret.meta.append(self._build_meta(part, title)) + + return ret + + +def verify_and_get_config_attr_descs(config_cls, strict_docstring_match=True): + + assert dataclasses.is_dataclass(config_cls), f"uh oh <{config_cls.__name__}>." + + # Parse docstring. + try: + docstring = GoogleDocstringParser().parse(config_cls.__doc__) + except Exception as e: + raise Exception(f"error parsing {config_cls.__name__} docstring.") + + # Get attributes and types. + config_attrs = docstring.params + config_types = config_cls.__annotations__ + + # Verify attribute names. + config_attr_keys = set(config_attrs.keys()) + config_type_keys = set(config_types.keys()) + missing_attr_keys = config_type_keys - config_attr_keys + extra_attr_keys = config_attr_keys - config_type_keys + if strict_docstring_match: + assert not missing_attr_keys and not extra_attr_keys, f"{config_cls.__name__} docstring is either missing attributes ({', '.join(missing_attr_keys) if missing_attr_keys else '--'}) or contains extra attributes ({', '.join(extra_attr_keys) if extra_attr_keys else '--'})." + + # @todo + # Verify attribute type names. + # for key in config_attr_keys: + # ... todo ... + + # Verify base class attributes. + attrs = {k:v for base_cls in config_cls.__bases__ if dataclasses.is_dataclass(base_cls) for k,v in verify_and_get_config_attr_descs(base_cls, strict_docstring_match=strict_docstring_match).items()} + for key in config_attr_keys: + if key in config_types: + attrs[key] = { + "desc" : config_attrs[key].description, + "type" : config_types[key], + } + + return attrs + + +def add_config_args(parser, config_cls): + attrs = verify_and_get_config_attr_descs(config_cls, strict_docstring_match=False) + for key, attr in attrs.items(): + _type = attr["type"] + if dataclasses.is_dataclass(_type): + group = parser.add_argument_group(title=attr["desc"]) + add_config_args(group, _type) + else: + + default_value = getattr(config_cls, key) + args = { + "help" : attr["desc"], + "default" : default_value, + } + + if _type == bool: + assert isinstance(args["default"], (bool, type(None))), \ + f"boolean attribute '{key}' of {config_cls.__name__} " \ + "has non-boolean default value." + + # When default=True, add 'no-{key}' arg. + if default_value: + args["action"] = "store_false" + args["dest"] = key + key = "no-" + key + else: + args["action"] = "store_true" + + elif _type in (int, float): + args["type"] = _type + + elif _type == list: + args["nargs"] = "*" + + # else: ....... treat as string arg + # raise Exception(f"specialize action for '{key}', type <{_type}>.") + + try: + parser.add_argument(f"--{key.replace('_', '-')}", **args) + except argparse.ArgumentError as e: + pass + + +def get_config_leaf_field_names(config_cls): + names = set() + for field in dataclasses.fields(config_cls): + if dataclasses.is_dataclass(field.type): + names.update(get_config_leaf_field_names(field.type)) + else: + names.add(field.name) + return names + + +def config_from_args(args, config_cls, add_custom_args=False): + + # Collect config data in a dict. + data = {} + for field in dataclasses.fields(config_cls): + if dataclasses.is_dataclass(field.type): + data[field.name] = config_from_args(args, field.type) + else: + data[field.name] = getattr(args, field.name) + + # Add custom args. (e.g., for tools, tasks) + if add_custom_args: + + config_keys = get_config_leaf_field_names(config_cls) + arg_keys = set(vars(args).keys()) + custom_keys = arg_keys - config_keys + + custom_data = {k:v for k, v in vars(args).items() if k in custom_keys} + custom_config_cls = dataclasses.make_dataclass( + "CustomConfig", + [(k, type(v)) for k, v in custom_data.items()]) + custom_config = custom_config_cls(**custom_data) + data["custom"] = custom_config + + # Create config. [ todo: programmatically create dataclass that inherits + # TransformerConfig. ] + config = config_cls(**data) + + return config + + +def flatten_config(config, base_config_cls=None): + + # Lift sub-config data. + flat_config = {} + for field in dataclasses.fields(config): + value = getattr(config, field.name) + if dataclasses.is_dataclass(value): + flat_config = { **flat_config, **flatten_config(value) } + else: + flat_config[field.name] = value + + # Convert to dataclass. + if base_config_cls: + base_keys = set(field.name for field in dataclasses.fields(base_config_cls)) + flat_config_cls = dataclasses.make_dataclass( + cls_name="FlatMegatronConfig", + fields=[(k, T.Any, dataclasses.field(default=None)) + for k, v in flat_config.items() + if k not in base_keys], + bases=(base_config_cls,)) + flat_config = flat_config_cls(**flat_config) + + return flat_config diff --git a/tools/retro/docker/Dockerfile b/tools/retro/docker/Dockerfile new file mode 100644 index 0000000..e8945b3 --- /dev/null +++ b/tools/retro/docker/Dockerfile @@ -0,0 +1,19 @@ +FROM nvcr.io/nvidia/pytorch:23.09-py3 + +RUN pip install -U faiss-gpu + +RUN apt update + +RUN apt install -qy htop + +RUN pip install -U transformers + +RUN pip install --upgrade google-api-python-client + +RUN pip install sentencepiece + +RUN pip install h5py + +RUN pip install nltk + +RUN pip install einops diff --git a/tools/retro/preprocess_data.py b/tools/retro/preprocess_data.py new file mode 100644 index 0000000..dd36eb0 --- /dev/null +++ b/tools/retro/preprocess_data.py @@ -0,0 +1,295 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Preprocess data for Retro. + +Stages (see argument '--retro-tasks'): +- Build chunk database (DB). +- Build index (train, add). +- Query pretraining neighbors. +""" + +import json +import os +import sys +import torch + +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core.datasets.retro.db import build_db +from megatron.core.datasets.retro.index import add_to_index, train_index +from megatron.core.datasets.retro.config import ( + RetroBertEmbedders, + RetroGPTChunkDatasets, + RetroPreprocessingConfig, + RetroTokenizers, +) +from megatron.core.datasets.retro.query.gpt_chunk_dataset import build_gpt_chunk_datasets_from_gpt_datasets +from megatron.core.datasets.retro.query.multi_split_gpt_dataset import ( + MultiSplitGPTDataset, + MultiSplitGPTDatasetConfig, +) +from megatron.core.datasets.retro.query.query import query_neighbors +from megatron.core.datasets.retro.query.utils import get_query_dir +from megatron.core.datasets.retro.utils import retro_makedir +from megatron.core.models.retro.utils import ( + get_config_path, + get_gpt_data_dir, +) +from megatron.training import get_args, initialize_megatron, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args +from megatron.training.tokenizer.tokenizer import ( + _BertWordPieceTokenizer, + _GPT2BPETokenizer, + _GPTSentencePieceTokenizer, +) +from megatron.training import get_train_valid_test_num_samples +from pretrain_gpt import is_dataset_built_on_rank +from tools.bert_embedding import BertEmbedder, DiskDataParallelBertEmbedder +from tools.retro.config_utils import add_config_args + + +def add_retro_args(parser): + group = parser.add_argument_group(title="Retro preprocessing") + add_config_args(group, RetroPreprocessingConfig) + return parser + + +def initialize_megatron_retro(): + '''Initialize megatron & save Retro config.''' + + # Prevent arguments.py from overriding preprocessing args. + project_dir_idx = sys.argv.index("--retro-project-dir") + retro_project_dir = sys.argv[project_dir_idx + 1] + del sys.argv[project_dir_idx] # delete key + del sys.argv[project_dir_idx] # delete value + + # Initialize. + initialize_megatron(extra_args_provider=add_retro_args) + + args = get_args() + args.retro_project_dir = retro_project_dir + + # Retro config. + config = get_retro_preprocessing_config() + + # Save retro config. + if config.retro_task_validate is None: + retro_makedir(config, config.retro_project_dir) + save_config(config) + + return config + + +def get_bert_embedders(config): + mem_embedder = BertEmbedder( + batch_size = config.retro_bert_batch_size, + max_bert_seq_length = config.retro_bert_max_chunk_length, + embedder_type = "megatron", + ) + return RetroBertEmbedders( + mem = mem_embedder, + disk = DiskDataParallelBertEmbedder(mem_embedder, config.retro_block_size), + ) + + +def get_gpt_chunk_datasets(config): + + args = get_args() + + # Dataset config. + data_dir = get_gpt_data_dir(config.retro_project_dir) + blend = list(config.retro_gpt_data_path) + for i in range(len(blend) - 1, -1, -2): + blend[i] = os.path.join(data_dir, blend[i]) + data_config = MultiSplitGPTDatasetConfig( + random_seed=config.retro_gpt_seed, + sequence_length=config.retro_gpt_seq_length, + blend=get_blend_from_list(blend), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + split=config.retro_gpt_split, + split_preprocessing=config.retro_gpt_split, + path_to_cache=config.retro_gpt_data_cache_path, + return_document_ids=True, + tokenizer=config.retro_tokenizers.gpt, + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + ) + + # GPT datasets. + print_rank_0(" > multi-split gpt datasets.") + train_valid_test_num_samples = get_train_valid_test_num_samples() + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + MultiSplitGPTDataset, + train_valid_test_num_samples, + is_dataset_built_on_rank, + data_config, + ).build() + + gpt_datasets = { + "train" : (train_ds, train_valid_test_num_samples[0]), + "valid" : (valid_ds, train_valid_test_num_samples[1]), + "test" : (test_ds, train_valid_test_num_samples[2]), + } + + # Chunk datasets. + chunk_datasets = build_gpt_chunk_datasets_from_gpt_datasets( + project_dir=config.retro_project_dir, + gpt_datasets=gpt_datasets, + sample_length=config.retro_gpt_seq_length, + chunk_length=config.retro_gpt_chunk_length, + ) + chunk_datasets = RetroGPTChunkDatasets(**chunk_datasets) + + return chunk_datasets + + +def get_gpt_tokenizer(config): + '''GPT (BPE) tokenizer.''' + tokenizer_type = config.retro_gpt_tokenizer_type + if tokenizer_type == "GPT2BPETokenizer": + assert config.retro_gpt_vocab_file and config.retro_gpt_merge_file + return _GPT2BPETokenizer( + vocab_file=os.path.join( + config.retro_project_dir, + config.retro_gpt_vocab_file, + ), + merge_file=os.path.join( + config.retro_project_dir, + config.retro_gpt_merge_file, + ), + ) + elif tokenizer_type == 'GPTSentencePieceTokenizer': + assert config.retro_gpt_tokenizer_model is not None + return _GPTSentencePieceTokenizer(os.path.join( + config.retro_project_dir, + config.retro_gpt_tokenizer_model, + )) + else: + raise Exception("unrecognized gpt tokenizer, '%s'." % tokenizer_type) + + +def get_bert_tokenizer(config): + '''Bert (Wordpiece) tokenizer.''' + lower_case = { + "BertWordPieceLowerCase" : True, + "BertWordPieceCase" : False, + }[config.retro_bert_tokenizer_type] + return _BertWordPieceTokenizer( + vocab_file=os.path.join( + config.retro_project_dir, + config.retro_bert_vocab_file, + ), + lower_case=lower_case, + ) + + +def get_tokenizers(config): + return RetroTokenizers( + gpt = get_gpt_tokenizer(config), + bert = get_bert_tokenizer(config), + ) + + +def get_retro_preprocessing_config(): + + # Arguments. + args = get_args() + + # Retro config. + config = core_transformer_config_from_args( + args, config_class=RetroPreprocessingConfig) + + # Add tools. + config.retro_tokenizers = get_tokenizers(config) + config.retro_bert_embedders = get_bert_embedders(config) + config.retro_gpt_chunk_datasets = get_gpt_chunk_datasets(config) + + return config + + +def save_config(config): + '''Save copy of config within retro project dir.''' + + if torch.distributed.get_rank() == 0: + + # GPT config + block size. + config_subset = { + k:v for k,v in vars(config).items() + if k.startswith("retro_gpt") and k != "retro_gpt_chunk_datasets" + } + config_subset["retro_block_size"] = config.retro_block_size + + # Bert config. + config_subset["retro_bert_tokenizer_type"] = config.retro_bert_tokenizer_type + config_subset["retro_bert_vocab_file"] = config.retro_bert_vocab_file + + # Neighbor directories. + query_dir = get_query_dir(config.retro_project_dir) + config_subset["retro_neighbor_dirs"] = { + k : (os.path.relpath(v["neighbor_dir"], query_dir) if v is not None else None) + for k, v in vars(config.retro_gpt_chunk_datasets).items() + } + + # Save. + config_path = get_config_path(config.retro_project_dir) + with open(config_path, "w") as f: + json.dump(config_subset, f, indent=4, sort_keys=True) + + torch.distributed.barrier() + + +if __name__ == "__main__": + + # Initalize Megatron. + config = initialize_megatron_retro() + + # Expand tasks. + task_remap = { + "build" : [ "db-build", "index-train", "index-add", "query-neighbors" ], + "index-build" : [ "index-train", "index-add" ], + "db-build" : [ "db-build" ], + "index-train" : [ "index-train" ], + "index-add" : [ "index-add" ], + "query-neighbors" : [ "query-neighbors" ], + } + tasks = [] + for task in config.retro_tasks: + tasks.extend(task_remap[task]) + config.retro_tasks = tasks + + # Select task to run. + for task in tasks: + + print_rank_0("start '%s%s'." % ( + "" if config.retro_task_validate is None else "[validate] ", + task, + )) + + # DB (i.e., chunk db). + if task == "db-build": + build_db(config) + + # Index. + elif task == "index-train": + train_index(config) + elif task == "index-add": + add_to_index(config) + + # Query. + elif task == "query-neighbors": + query_neighbors(config) + + else: + raise Exception("specialize for task '%s'." % task) + + torch.distributed.barrier() + + print_rank_0("end '%s%s'." % ( + "" if config.retro_task_validate is None else "[validate] ", + task, + )) diff --git a/tools/retro/sft/README.md b/tools/retro/sft/README.md new file mode 100644 index 0000000..e589879 --- /dev/null +++ b/tools/retro/sft/README.md @@ -0,0 +1,3 @@ +## Note + +The content within this `sft` directory is still under active development and will be updated soon. \ No newline at end of file diff --git a/tools/retro/sft/dataset_conv.py b/tools/retro/sft/dataset_conv.py new file mode 100644 index 0000000..3dd8fa9 --- /dev/null +++ b/tools/retro/sft/dataset_conv.py @@ -0,0 +1,446 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import re +import json +import os +from typing import Any, Iterable, Dict, Optional + +from numpy import ndarray +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.utils import Split +import torch +import numpy +import glob +from collections import OrderedDict + +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset +from megatron.core.datasets.utils import Split +from dataclasses import dataclass + + +_DATASET_NAME_PATTERNS = { + Split.train: r"(?P[^\0]+)\/(?P=name)\_QA\_train.json", + Split.valid: r"(?P[^\0]+)\/(?P=name)\_QA\_dev.json", +} + + +@dataclass +class JsonQADatasetConfig(BlendedMegatronDatasetConfig): + """Configuration object for the QA finetuning pipeline + """ + ft_neighbours: int = 1 + + bert_retriever_neighbours: bool = False + + longform_answer: bool = False + + inference_only: bool = False + + retrieved_neighbours: bool = False + + fix_newsqa: bool = True + + def __post_init__(self) -> None: + super().__post_init__() + assert self.blend_per_split is not None + + +@dataclass +class RetroJsonQADatasetConfig(JsonQADatasetConfig): + """Configuration object for the Retro QA finetuning pipeline + """ + retro_num_neighbors: int = None + + retro_gpt_retrieved_length: int = None + + def __post_init__(self) -> None: + super().__post_init__() + assert self.retro_num_neighbors is not None + assert self.retro_gpt_retrieved_length is not None + + +class JsonQADataset(MegatronDataset): + + def __init__(self, dataset: Any, dataset_path: str, indices: ndarray, num_samples: Optional[int], index_split: Split, config: BlendedMegatronDatasetConfig) -> None: + super().__init__(dataset, dataset_path, indices, num_samples, index_split, config) + matches = re.findall(_DATASET_NAME_PATTERNS[index_split], dataset_path) + assert len(matches) == 1 + assert len(matches[0]) > 0 + self.dataset_name = matches[0] + + @staticmethod + def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int: + return len(low_level_dataset) + + @staticmethod + def build_low_level_dataset(dataset_path: str, config: JsonQADatasetConfig) -> Iterable: + assert os.path.isfile(dataset_path), f"{dataset_path} does not exist on disk" + return preprocess(dataset_path, config) + + def __len__(self) -> int: + return len(self.dataset) + + def __getitem__(self, idx: int) -> Dict[str, ndarray]: + sample = self.dataset[idx % len(self.dataset)] + + # unpack tokens + query, answer, neighbours = sample + + # tokenization + output_tokens = self.config.tokenizer.tokenize(answer) + + input_tokens = reformat_prompt( + query, + neighbours, + self.dataset_name, + self.config.ft_neighbours, + len(output_tokens), + self.config.tokenizer, + self.config.sequence_length + ) + + # padding + tokens, answer_mask = pad_and_convert_to_numpy( + input_tokens, output_tokens, self.config.tokenizer.pad, self.config.sequence_length, self.config.tokenizer.eos + ) + + train_sample = { + 'text': tokens, + 'answer_mask': answer_mask, + } + + return train_sample + + +class RetroJsonQADataset(JsonQADataset): + + def __getitem__(self, idx: int) -> Dict[str, ndarray]: + + sample = self.dataset[idx % len(self.dataset)] + + # unpack tokens + query, answer, neighbours = sample + + # tokenization + output_tokens = self.config.tokenizer.tokenize(answer) + + input_tokens = reformat_prompt_retro( + query, + neighbours, + self.dataset_name, + self.config.ft_neighbours, + len(output_tokens), + self.config.tokenizer, + self.config.sequence_length + ) + + # padding + tokens, answer_mask = pad_and_convert_to_numpy( + input_tokens, + output_tokens, + self.config.tokenizer.pad, + self.config.sequence_length, + self.config.tokenizer.eos + ) + + # get retro neighbors + # context chunk and answer chunk + n_chunks_per_sample = 2 + num_neighbors = self.config.retro_num_neighbors + # disable retro encoder + neighbor_tokens = numpy.zeros( + [n_chunks_per_sample, num_neighbors, self.config.retro_gpt_retrieved_length], + dtype=numpy.int64 + ) + + train_sample = { + 'text': tokens, + 'answer_mask': answer_mask, + 'neighbor_tokens': neighbor_tokens, + 'context_len': len(input_tokens) + } + + return train_sample + + +def format_multichoice(multichoice_options): + options_text = ["({}) {}".format(chr(ord('A') + i), option) for i, option in + zip(range(len(multichoice_options)), multichoice_options)] + return "Choose one based on the following options: {}".format(" ".join(options_text)) + + +def format_multichoice_question(question, multichoice_options): + return "{}\n{}".format(question, format_multichoice(multichoice_options)) + + +def format_answer(answer): + return " {}".format(answer) + + +def preprocess(dataset_path: str, config: JsonQADatasetConfig): + assert config.ft_neighbours > 0 + if config.longform_answer: + nq_examples = [] + with open(dataset_path, "r") as f: + for fn in f: + nq_examples.append(json.loads(fn)) + else: + nq_examples = [] + for my_data_file in sorted(glob.glob(dataset_path)): + with open(my_data_file, "r", encoding='utf-8') as f: + nq_examples.extend(json.load(f)) + + data = [] + for instance in nq_examples: + question = instance["question"] + if 'qa_type' in instance and instance['qa_type'] == "multi_choice_qa": + question = format_multichoice_question(question, instance["multichoice_options"]) + if config.bert_retriever_neighbours: + contexts = instance["bert_pretrain_corpus_neighbours"] + neighbours = ["source: " + ctx for ctx in contexts] + else: + if config.retrieved_neighbours: + contexts = instance["ctxs"] + neighbours = ["title: " + ctx["title"] + ", source: " + ctx["text"] for ctx in contexts] + else: + if "sub-paragraphs" in instance: + if type(instance["sub-paragraphs"]) == list: # doc2dial: + neighbours = [ + "title: " + instance["sub-paragraphs"][0] + ", source: " + instance["sub-paragraphs"][1]] + else: + neighbours = ["title: , source: " + instance["sub-paragraphs"]] + elif config.fix_newsqa and "sub_paragraph" in instance: + neighbours = ["title: , source: " + instance["sub_paragraph"]] + else: + neighbours = ["title: , source: "] + + if config.inference_only: + data.append((question, None, neighbours)) + else: + if config.longform_answer: + if "longform_answer" in instance: + answers = [instance["longform_answer"]] + else: + continue + else: + if "answers" in instance: + answers = instance["answers"] + elif "answer" in instance: + if type(instance["answer"]) is str: + answers = [instance["answer"]] + elif type(instance["answer"]) is list: + answers = instance["answer"] + else: + answers = [str(instance["answer"])] + else: + raise ValueError("need to have answer or answers") + if len(answers) < 1: + continue + else: + if type(answers[0]) is dict: + answers = [answers[0]["text"].strip()] + elif type(answers[0]) is str: + answers = [answers[0]] + else: + raise ValueError("unsupported type for answer(s)") + + for answer in answers: + answer = format_answer(answer) + data.append((question, answer, neighbours)) + + return data + + +def count_stat(dataset, tokenizer, k): + nb_lens = [] + for i, d in enumerate(dataset): + query, answer, neighbours = d + nb_lens.extend([len(tokenizer.tokenize(neighbour)) for neighbour in neighbours[:k]]) + + print("len of nb", len(nb_lens)) + print("max of len nb", max(nb_lens)) + print("num of cut ", sum([l > 128 for l in nb_lens]), sum([l > 128 for l in nb_lens]) // len(nb_lens)) + print("last max", sorted(nb_lens)[-10:]) + + +def reformat_prompt_retro(query, neighbours, dataset_name, ft_neighbours, \ + max_output_len, tokenizer, max_seq_length): + system = ("System: This is a chat between a user and an artificial intelligence assistant. The assistant gives " + "helpful, detailed, and polite answers to the user's questions.\n\n") + + if dataset_name in ["oasst", "quiet_cockatoo", "open_inst", "quiet-cockatoo_commercial"]: + input_tokens = tokenizer.tokenize(system + query) + return input_tokens + + short_span_with_context = ["drop", "NarrativeQA", "QASC", "Quoref", "ROPES", "squad1.1", "squad2.0", "newsqa", "nq", + "tqa", "quac"] + yes_no_without_context = ["BoolQ"] + multichoices = [""] + formatted_dataset_name = ["doc2dial", "quac", "qrecc", "sharc"] + + if dataset_name in formatted_dataset_name: + dialogue_turn = query + else: + if dataset_name in short_span_with_context: + user = "{} Answer the above question with a short phrase.".format(query) + elif dataset_name in yes_no_without_context: + user = "{} Answer the above question with True or False.".format(query) + else: + user = "{} Answer the above question with a long complete answer.".format(query) + + if dataset_name in short_span_with_context: + dialogue_format = "User: {}\n\nAssistant: The answer is" + dialogue_turn = dialogue_format.format(user) + else: + dialogue_format = "User: {}\n\nAssistant:" + dialogue_turn = dialogue_format.format(user) + + if ft_neighbours > 0: + context = "\n\n".join(neighbours[0:ft_neighbours]) + "\n\n" + context_tokens = tokenizer.tokenize(context) + dialogue_tokens = tokenizer.tokenize(dialogue_turn) + system_tokens = tokenizer.tokenize(system) + context_tokens = context_tokens[:max_seq_length - max_output_len - len(dialogue_tokens) - len(system_tokens)] + context = tokenizer.detokenize(context_tokens) + + all_input = system + context + dialogue_turn + print(all_input) + input_tokens = tokenizer.tokenize(all_input) + else: + all_input = system + dialogue_turn + input_tokens = tokenizer.tokenize(all_input) + + return input_tokens + + +def flan_format(system, context, dialogue_turn, template_id=0): + templates = [ + "{}User: Answer based on context:\n\n{}{}", + "{}User: {}Answer this question based on the article: {}", + "{}User: {}{}", + "{}User: {}Answer this question: {}", + "{}User: Read this article and answer this question {}{}", + "{}User: {}Based on the above article, answer a question. {}", + "{}User: Context: {}Question: {}" + ] + template = templates[template_id - 1].format(system, context, dialogue_turn) + return template + + +def reformat_prompt(query, neighbours, dataset_name, ft_neighbours, \ + max_output_len, tokenizer, max_seq_length, template_id=0): + system = ("System: This is a chat between a user and an artificial intelligence assistant. The assistant gives " + "helpful, detailed, and polite answers to the user's questions based on the context. The assistant " + "should also indicate when the answer cannot be found in the context.\n\n") + + if dataset_name in ["oasst", "quiet_cockatoo", "open_inst", "quiet-cockatoo_commercial"]: + input_tokens = tokenizer.tokenize(system + query) + return input_tokens + + short_span_with_context = ["drop", "NarrativeQA", "QASC", "Quoref", "ROPES", "squad1.1", "squad2.0", "newsqa", "nq", + "BioASQ", "DuoRC_ParaphraseRC", "TextbookQA", "tqa"] + yes_no_without_context = ["boolq", "multirc"] + multichoices = ["race"] + # multi-turn qa datasets + formatted_dataset_name = ["convqa", "chatgptgen", "doc2dial", "quac", "qrecc", "sharc"] + + if dataset_name in formatted_dataset_name: + dialogue_turn = query + else: + if dataset_name in short_span_with_context: + if template_id == 0: + user = "Answer the following question with a short span. {}".format(query) + else: + user = query + elif dataset_name in yes_no_without_context: + user = "Answer the following question with True or False. {}".format(query) + elif dataset_name in multichoices: + user = "Answer the following question by selecting one of the provided options. {}".format(query) + else: + if template_id == 0: + user = "Please give a full and complete answer for the question. {}".format(query) + else: + user = query + + if dataset_name in short_span_with_context: + if template_id == 0: + dialogue_format = "User: {}\n\nAssistant: The answer is" + else: + dialogue_format = "{}\n\nAssistant: The answer is" + dialogue_turn = dialogue_format.format(user) + else: + if template_id == 0: + dialogue_format = "User: {}\n\nAssistant:" + else: + dialogue_format = "{}\n\nAssistant:" + dialogue_turn = dialogue_format.format(user) + + if ft_neighbours > 0: + context = "\n\n".join(neighbours[0:ft_neighbours]) + "\n\n" + context_tokens = tokenizer.tokenize(context) + dialogue_tokens = tokenizer.tokenize(dialogue_turn) + system_tokens = tokenizer.tokenize(system) + context_tokens = context_tokens[:max_seq_length - max_output_len - len(dialogue_tokens) - len(system_tokens)] + context = tokenizer.detokenize(context_tokens) + + if template_id == 0: + all_input = system + context + dialogue_turn + else: + all_input = flan_format(system, context, dialogue_turn, template_id=template_id) + input_tokens = tokenizer.tokenize(all_input) + else: + all_input = system + dialogue_turn + input_tokens = tokenizer.tokenize(all_input) + + return input_tokens + + +def reformat_prompt_short(query, neighbours, dataset_name, ft_neighbours, \ + max_output_len, tokenizer, max_seq_length): + if not query.endswith("?"): + query = query + "?" + query = "Question: {} Answer: The answer is".format(query) + + if ft_neighbours > 0: + context = "\n\n".join(neighbours[0:ft_neighbours]) + "\n\n" + context_tokens = tokenizer.tokenize(context) + dialogue_tokens = tokenizer.tokenize(query) + context_tokens = context_tokens[:max_seq_length - max_output_len - len(dialogue_tokens)] + context = tokenizer.detokenize(context_tokens) + all_input = context + query + input_tokens = tokenizer.tokenize(all_input) + else: + all_input = query + input_tokens = tokenizer.tokenize(all_input) + + return input_tokens + + +def pad_and_convert_to_numpy(input_ids, output_ids, + pad_id, max_seq_length, + eos_id): + """Pad sequences and convert them to numpy.""" + if len(input_ids) > max_seq_length: + input_ids = input_ids[:max_seq_length - 1] + + if len(input_ids + output_ids) > max_seq_length: + output_ids = output_ids[:max_seq_length - len(input_ids)] + + tokens = input_ids + output_ids + answer_mask = [0] * len(input_ids) + [1] * len(output_ids) + + # padding + num_tokens = len(tokens) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + + # Tokens. + filler = [pad_id] * padding_length + tokens = numpy.array(tokens + [eos_id] + filler, dtype=numpy.int64) + + # answer mask + answer_mask = answer_mask + [1] + [0] * padding_length + answer_mask = numpy.array(answer_mask, dtype=numpy.int64) + + return tokens, answer_mask diff --git a/tools/retro/sft/open_inst.sh b/tools/retro/sft/open_inst.sh new file mode 100644 index 0000000..9ebe063 --- /dev/null +++ b/tools/retro/sft/open_inst.sh @@ -0,0 +1 @@ +DATA_BLEND="1.0 open_inst" diff --git a/tools/retro/sft/sft_retro.py b/tools/retro/sft/sft_retro.py new file mode 100644 index 0000000..fd7e8d8 --- /dev/null +++ b/tools/retro/sft/sft_retro.py @@ -0,0 +1,275 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain GPT""" + +import torch +from functools import partial, reduce +import sys, os + +sys.path.append(os.path.abspath(os.path.join( + os.path.join(os.path.dirname(__file__), "../../../")))) +from megatron.training import get_args, get_retro_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.core import tensor_parallel +from megatron.core.enums import ModelType +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import get_blend_from_list +from megatron.training import pretrain +from megatron.training.utils import get_ltor_masks_and_position_ids +from megatron.training.utils import average_losses_across_data_parallel_group +from pretrain_gpt import model_provider, is_dataset_built_on_rank +from tools.retro.sft.dataset_conv import JsonQADataset, JsonQADatasetConfig, RetroJsonQADataset, RetroJsonQADatasetConfig + + +def get_tasks_args(parser): + """Provide extra arguments required for tasks.""" + group = parser.add_argument_group(title='tasks') + + # parameters for the knowledgeable dialogue generation + group.add_argument('--task', type=str, default=None, + help='Task name.') + group.add_argument('--epochs', type=int, default=None, + help='Number of finetunning epochs. Zero results in ' + 'evaluation only.') + group.add_argument('--keep-last', action='store_true', + help='Keep the last batch (maybe incomplete) in' + 'the data loader') + group.add_argument('--pretrained-checkpoint', type=str, default=None, + help='Pretrained checkpoint used for finetunning.') + group.add_argument('--data-folder', type=str, default=None, + help='dataset folder') + group.add_argument('--answer-loss-only', action='store_true', default=False, + help='take the loss from answer part, ignore the context') + group.add_argument('--weight', type=float, default=1) + group.add_argument('--adaptor', action='store_true', default=False) + group.add_argument('--project-size', type=int, default=256) + group.add_argument('--cyclic-train-iters', type=int, default=None) + group.add_argument('--stored_params', type=dict, default=dict()) + group.add_argument('--eval_ppl', action='store_true', default=False) + group.add_argument('--debug', action='store_true', default=False) + group.add_argument('--add_retriever', action='store_true', default=False) + group.add_argument('--return_doc_ids', action='store_true', default=False) + group.add_argument('--return_neighbor_ids', action='store_true', default=False) + group.add_argument('--add_offset_doc_ids', action='store_true', default=False) + group.add_argument('--offset_dict_path', type=str, default='') + group.add_argument('--neighbors_path', type=str, default='') + group.add_argument('--valid_neighbors_path', type=str, default='') + group.add_argument('--database_path', type=str, default='') + group.add_argument('--valid_database_path', type=str, default='') + group.add_argument('--encoder-layers', type=int, default=12) + group.add_argument('--encoder-hidden-dropout', type=float, default=0.1) + group.add_argument('--encoder-attention-dropout', type=float, default=0.1) + group.add_argument('--k', type=int, default=2) + group.add_argument('--r', type=int, default=128) + group.add_argument('--m', type=int, default=64) + group.add_argument('--dpr-mode', type=str, default="multi") + group.add_argument('--faiss-ckpt', type=str, default='') + group.add_argument('--original-db-file', type=str, default="") + group.add_argument('--ft_neighbours', type=int, default=1) + group.add_argument('--reuse-top', action='store_true', default=False) + group.add_argument('--shuffle_topn', action='store_true', default=False) + group.add_argument('--chunk0', action='store_true', default=False) + group.add_argument('--disable-encoder', action='store_true', default=False) + group.add_argument('--qa-space-pad', action='store_true', default=False) + group.add_argument('--retro-mask-encoder', action='store_true', default=False) + group.add_argument('--without-title', action='store_true', default=False) + group.add_argument('--longform-answer', action='store_true', default=False) + group.add_argument('--bert-retriever-neighbours', action='store_true', default=False) + group.add_argument('--prefix', action='store_true', default=False) + group.add_argument('--question-in-encoder', action='store_true', default=False) + group.add_argument('--reset_eval', type=bool, default=True) ## by default reset eval for each eval + return parser + + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text', 'answer_mask'] + datatype = torch.int64 + + if args.retro_add_retriever: + keys += 'neighbor_tokens', 'context_len' + + # Broadcast data. + if data_iterator is not None: + try: + data = next(data_iterator) + + except BaseException: + data = data_iterator + raise ValueError("error with data_iterator") + else: + data = None + + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + chunk_size = torch.min(data_b['context_len']) + retro_args = get_retro_args() + # two chunk retro has at least seq_len / 2 of chunk size + retro_args.retro_gpt_chunk_length = max(args.seq_length // 2, args.seq_length - chunk_size.item()) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + answer_mask = data_b["answer_mask"].float()[:, 1:].contiguous() + + if args.retro_add_retriever: + neighbor_tokens = data_b['neighbor_tokens'].view(-1, + retro_args.retro_gpt_retrieved_length).long() # [bs * l * k, r] + + # Get the masks and postition ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + + if args.answer_loss_only: + loss_mask = loss_mask * answer_mask + + if args.retro_add_retriever: + _, _, neighbor_position_ids = get_ltor_masks_and_position_ids( + neighbor_tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + neighbor_attention_mask = None + return tokens, labels, loss_mask, attention_mask, position_ids, \ + neighbor_tokens, neighbor_attention_mask, neighbor_position_ids + else: + return tokens, labels, loss_mask, attention_mask, position_ids + + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + if args.retro_add_retriever: + timers('batch-generator', log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids, \ + neighbor_tokens, neighbor_attention_mask, neighbor_position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + output_tensor = model(tokens, position_ids, attention_mask, + retriever_input_ids=neighbor_tokens, + retriever_position_ids=neighbor_position_ids, + retriever_attn_mask=neighbor_attention_mask, + labels=labels) + else: + timers('batch-generator', log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + retro_args = get_retro_args() + + tokenizer = get_tokenizer() + + def fix_and_split_blend_pair(pair): + weight, name = pair + return [ + [weight, os.path.join(args.data_folder, name, f"{name}_QA_train.json")], + [weight, os.path.join(args.data_folder, name, f"{name}_QA_dev.json")], + None, + ] + + blend = [args.data_path[i:i+2] for i in range(0, len(args.data_path), 2)] + + if len(blend) == 1: + blend_per_split = [ + os.path.join(args.data_folder, blend[0], f"{blend[0]}_QA_train.json"), + os.path.join(args.data_folder, blend[0], f"{blend[0]}_QA_dev.json"), + None, + ] + else: + blend_per_split = [ + list( + reduce( + lambda x, y: x + y, + list(zip(*map(fix_and_split_blend_pair, blend)))[0] + ) + ), + None, + None, + ] + + blend_per_split = [get_blend_from_list(blend) for blend in blend_per_split] + + extra_kwargs = {} + + if args.retro_add_retriever: + dataset_cls = RetroJsonQADataset + config_cls = RetroJsonQADatasetConfig + extra_kwargs["retro_num_neighbors"] = args.retro_num_neighbors + extra_kwargs["retro_gpt_retrieved_length"] = retro_args.retro_gpt_retrieved_length + else: + dataset_cls = JsonQADataset + config_cls = JsonQADatasetConfig + + config = config_cls( + random_seed=args.seed, + sequence_length=args.seq_length, + blend_per_split=blend_per_split, + split=args.split, + path_to_cache=args.data_cache_path, + tokenizer=tokenizer, + ft_neighbours=args.ft_neighbours, + bert_retriever_neighbours=args.bert_retriever_neighbours, + longform_answer=args.longform_answer, + inference_only=False, + retrieved_neighbours=False, + fix_newsqa=True, + **extra_kwargs + ) + + print_rank_0('> building train, validation, and test datasets ' + 'for GPT ...') + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + dataset_cls, + train_val_test_num_samples, + is_dataset_built_on_rank, + config + ).build() + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + # Temporary for transition to core datasets + train_valid_test_datasets_provider.is_distributed = True + + pretrain(train_valid_test_datasets_provider, model_provider, + ModelType.retro_decoder, # ModelType.encoder_or_decoder, + forward_step, + extra_args_provider=get_tasks_args + ) diff --git a/tools/retro/sft/sft_retro_lm.sh b/tools/retro/sft/sft_retro_lm.sh new file mode 100644 index 0000000..8c13f10 --- /dev/null +++ b/tools/retro/sft/sft_retro_lm.sh @@ -0,0 +1,150 @@ +#!/bin/bash +# bash examples/qa/finetune_normal_lm.sh landrover_tasb_retrieved 843m 1 3e-6 1 + +blend_name=$1 +model_size=$2 +global_bsz=$3 +lr=$4 +ft_neighbours=1 +model_card=pp1 +ckpt=$5 +TASK=none + +train_iters=1000 + + +DATA_HOME="" +data_folder="$DATA_HOME" + +SFT_HOME="" + +TOKENIZER_MODEL="" + +RETRO_WORKDIR="" + +K=2 + +PRETRAINED_CHECKPOINT=${ckpt} + +SAVENAME="retro-${blend_name}_${model_card}_same_format_ctx${ft_neighbours}_${model_size}_${global_bsz}_${lr}" +CHECKPOINT_PATH="${SFT_HOME}/checkpoints/applications/${SAVENAME}" +TENSORBOARD_DIR="${SFT_HOME}/tensorboard/${SAVENAME}" +mkdir -p ${TENSORBOARD_DIR} + +. ./tools/retro/sft/"${blend_name}".sh + + +if [[ $model_size == "843m" ]]; then + # model param + mod_par=1 + layers=24 + hid_dim=1024 + heads=16 + pip_par=1 + + # node param + num_nodes=1 + lr=5e-6 + min_lr=5e-6 +fi + + +GPT_ARGS="--apply-layernorm-1p \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --no-position-embedding \ + --use-rotary-position-embeddings \ + --rotary-percent 0.5 \ + --swiglu \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --pipeline-model-parallel-size $pip_par \ + --tensor-model-parallel-size $mod_par \ + --num-layers $layers \ + --hidden-size $hid_dim \ + --num-attention-heads $heads \ + --seq-length 4096 \ + --max-position-embeddings 4096 \ + --lr-decay-style cosine \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --clip-grad 1.0 \ + --weight-decay 0.01 \ + --adam-beta1 0.9 \ + --adam-beta2 0.98 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --bf16 \ + --use-distributed-optimizer \ +" + +FT_ARGS="--eod-mask-loss \ + --answer-loss-only \ + --ft_neighbours ${ft_neighbours} \ + --task $TASK" + + +OUTPUT_ARGS="--log-interval 10 \ + --save-interval 500 \ + --eval-interval 200 \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --log-validation-ppl-to-tensorboard \ + --eval-iters 100" + +options=" \ + $GPT_ARGS \ + --retro-workdir ${RETRO_WORKDIR} \ + --retro-add-retriever \ + --retro-num-neighbors ${K} \ + --retro-attention-gate 0 \ + --data-path ${DATA_BLEND} \ + --data-folder ${data_folder} \ + --recompute-activations \ + --lr $lr \ + --micro-batch-size 1 \ + --global-batch-size ${global_bsz} \ + --min-lr ${min_lr} \ + --retro-cyclic-train-iters ${train_iters} \ + --train-iters ${train_iters} \ + --dataloader-type cyclic \ + --save $CHECKPOINT_PATH \ + $OUTPUT_ARGS \ + $FT_ARGS" + +if [[ -d "$CHECKPOINT_PATH" ]]; then + options="$options \ + --load $CHECKPOINT_PATH " +else + echo $PRETRAINED_CHECKPOINT + options="$options \ + --load $PRETRAINED_CHECKPOINT \ + --finetune \ + --no-load-rng \ + --no-load-optim " +fi + +######## Command. ######## + +run_cmd="python -u ${SFT_HOME}/tools/retro/sft/sft_retro.py ${options}" + +export NCCL_DEBUG=INFO +export NCCL_IB_TIMEOUT=19 +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +NPROCS=8 +CMD="\ + pwd && cd ${SFT_HOME} && pwd && \ + export PYTHONPATH=$PYTHONPATH:${SFT_HOME} && \ + python -m torch.distributed.run \ + --nproc_per_node ${NPROCS} \ + --nnodes 1 \ + --node_rank 0 \ + --master_port 6000 \ + ${run_cmd} \ +" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "CMD = '$CMD'." +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" +eval $CMD + diff --git a/tools/retro/text_generation/evaluate.py b/tools/retro/text_generation/evaluate.py new file mode 100644 index 0000000..2031118 --- /dev/null +++ b/tools/retro/text_generation/evaluate.py @@ -0,0 +1,200 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + + +import sys +import os +from tqdm import tqdm +import string +import json +import regex +import numpy as np + +sys.path.append(os.path.abspath(os.path.join( + os.path.join(os.path.dirname(__file__), "../../../")))) +from tools.retro.text_generation.metrics import F1Metric + + +def normalize_answer(s): + def remove_articles(text): + return regex.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def compute_f1_score(predicted_answers, groundtruth_answer, exp_name="default"): + """Evaluating F1 Score""" + print(len(predicted_answers), len(groundtruth_answer)) + if len(predicted_answers) != len(groundtruth_answer): + groundtruth_answer = groundtruth_answer[:len(predicted_answers)] + + guess_list = [] + answer_list = [] + + assert len(guess_list) == len(answer_list), \ + "lengths of guess and answer are different!" + + for pred, ans in zip(predicted_answers, groundtruth_answer): + pred = pred.strip() + if type(ans) == str: + ans = ans.strip() + elif type(ans) == dict: + ans = ans['text'].strip() + elif ans == None: + continue + if "<|endoftext|>" in pred: + pred = pred.replace("<|endoftext|>", "") + if ans == "no_passages_used": + ans = "" + guess_list.append(pred) + answer_list.append(ans) + + precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list) + print('Method: %s; Precision: %.4f; recall: %.4f; f1: %.4f' % ( \ + exp_name, precision, recall, f1)) + + +def load_groundtruth_file(data_file): + with open(data_file, "r") as f: + nq_examples = json.load(f) + + data = [] + for instance in nq_examples: + if "answers" in instance: + answers = instance["answers"] + if len(answers) < 1: + answers = [None] + elif "answer" in instance: + if type(instance["answer"]) is str: + answers = [instance["answer"]] + elif type(instance["answer"]) is list: + answers = instance["answer"] + else: + answers = [str(instance["answer"])] + else: + raise ValueError("need to have answer or answers") + data.append(answers[0]) + + return data + + +def read_prediction(prediction_file): + prediction_list = [] + print('reading %s' % prediction_file) + with open(prediction_file, "r") as f: + for i, line in enumerate(tqdm(f)): + if prediction_file.endswith("jsonl"): + line = json.loads(line)["pred"] + # print(line) + line = line.replace("Answer:", "") + line = line.replace("Answer: ", "") + line = line.replace('???? ', "") + line = line.replace('A: ', "") + line = line.replace("A:", "") + + line = line.strip() + + if "<|endoftext|>" in line: + line = line.replace("<|endoftext|>", "") + line = normalize_answer(line) # normalize the answer + prediction_list.append(line) + + return prediction_list + + +def exact_match_score(prediction, ground_truth): + return normalize_answer(prediction) == normalize_answer(ground_truth) + + +def ems(prediction, ground_truths): + return max([exact_match_score(prediction, gt) for gt in ground_truths]) + + +def evaluate_ems(prediction_file, ground_truth_file, dev_num=3000): + prediction_list = read_prediction(prediction_file) + ground_truths_list = [] + + if ground_truth_file.endswith(('txt', 'lst')): + raw_data = open(ground_truth_file, 'r') + else: + with open(ground_truth_file, 'r') as f: + raw_data = json.load(f) + if "dev" in ground_truth_file: + raw_data = raw_data[:dev_num] + prediction_list = prediction_list[:dev_num] + + for each in raw_data: + if ground_truth_file.endswith('txt'): + each = json.loads(each) + + if 'answers' in each: + ground_truths_list.append(each['answers']) + elif 'answer' in each: + ground_truths_list.append(each['answer']) + else: + ground_truths_list.append([each]) + + exactmatch = [] + + good_example_list = [] + for i, each in enumerate(prediction_list): + score = ems(each, ground_truths_list[i]) + exactmatch.append(score) + if score: + good_example_list.append(i) + + final_em_score = np.mean(exactmatch) + + print('Exact Match: %.4f;' % final_em_score) + + print('done :-)') + + return final_em_score, exactmatch + + +def load_prediction(data_file): + data = [] + with open(data_file, "r") as f: + for line in f.readlines(): + data.append(line.strip()) + + return data + + +def evaluate_f1(ground_truth_file, prediction_file, reduced_test_only=False): + groundtruth_answer = load_groundtruth_file(ground_truth_file) + predicted_answers = load_prediction(prediction_file) + if not reduced_test_only: + compute_f1_score(predicted_answers, groundtruth_answer) + + +if __name__ == "__main__": + model_names = [] + model_names += "retro-open_inst_pp1_same_format_ctx1_843m_128_5e-6", + + for model_name in model_names: + ckpt_path = "/path/to/checkpoints/{}/".format(model_name) + + n_ctx = 5 + n_enc = 2 + iter = 1000 + model_param = "843m" + + prediction_file = ckpt_path + "/retro-generate-nq_{}_{}_{}_test_greedy_0_20000_{}.txt".format( + n_ctx, n_enc, model_param, iter) + ground_truth_file = "/path/to/NQ/test.json" + print(prediction_file) + print(ground_truth_file) + evaluate_f1(ground_truth_file, prediction_file) + evaluate_ems(prediction_file, ground_truth_file) + + print("=====================================") diff --git a/tools/retro/text_generation/metrics.py b/tools/retro/text_generation/metrics.py new file mode 100644 index 0000000..bd0b5fe --- /dev/null +++ b/tools/retro/text_generation/metrics.py @@ -0,0 +1,80 @@ + +# The following code is adapted from +# https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, +# which is licensed under the MIT license. More details on the license can be +# found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE. + +"""Provides standard metric evaluations for dialog.""" + +from collections import Counter +from typing import List +import numpy as np +import re +from nltk import ngrams + +re_art = re.compile(r'\b(a|an|the)\b') +re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') + + +def normalize_answer(s): + """ + Lower text and remove punctuation, articles and extra whitespace. + """ + s = s.lower() + s = re_punc.sub(' ', s) + s = re_art.sub(' ', s) + s = ' '.join(s.split()) + return s + + +class F1Metric: + """ + Helper class which computes token-level F1. + """ + + @staticmethod + def _prec_recall_f1_score(pred_items, gold_items): + """ + Compute precision, recall and f1 given a set of gold and prediction items. + :param pred_items: iterable of predicted values + :param gold_items: iterable of gold values + :return: tuple (p, r, f1) for precision, recall, f1 + """ + common = Counter(gold_items) & Counter(pred_items) + num_same = sum(common.values()) + if num_same == 0: + return 0, 0, 0 + precision = 1.0 * num_same / len(pred_items) + recall = 1.0 * num_same / len(gold_items) + f1 = (2 * precision * recall) / (precision + recall) + return precision, recall, f1 + + @staticmethod + def compute_each_pair(guess: str, answer: str, n=1): + if answer == "": + return None, None, None + if guess == "": + return 0, 0, 0 + g_tokens = normalize_answer(guess).split() + a_tokens = normalize_answer(answer).split() + g_tokens = list(ngrams(g_tokens, n)) + a_tokens = list(ngrams(a_tokens, n)) + precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens) + return precision, recall, f1 + + @staticmethod + def compute_all_pairs(guesses: List[str], answers: List[str], n=1): + # additional augment: + print("guess:", len(guesses), ", answers:", len(answers)) + assert len(guesses) == len(answers) + + precision_list, recall_list, f1_list = [], [], [] + for guess, answer in zip(guesses, answers): + precision, recall, f1 = F1Metric.compute_each_pair(guess, answer, n) + if precision is None or recall is None or f1 is None: + continue + precision_list.append(precision) + recall_list.append(recall) + f1_list.append(f1) + + return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list) diff --git a/tools/retro/text_generation/retro_api.py b/tools/retro/text_generation/retro_api.py new file mode 100644 index 0000000..b706774 --- /dev/null +++ b/tools/retro/text_generation/retro_api.py @@ -0,0 +1,221 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + + +"""Inference API.""" +import numpy as np +import torch +from megatron.core import mpu +from megatron.training import print_rank_0, get_retro_args, get_args, get_tokenizer +from megatron.inference.text_generation.communication import broadcast_float_list, broadcast_tensor, broadcast_int_list +from megatron.inference.text_generation.generation import ( + score_and_return_on_first_stage) +from tools.retro.text_generation.retro_generation import ( + retro_generate_tokens_probs_and_return_on_first_stage) +from megatron.inference.text_generation.tokenization import ( + detokenize_generations) + + +def tokenize_prompts(prompts=None, tokens_to_generate=None, + add_BOS=None, rank=0): + """Tokenize prompts and make them avaiable on all ranks.""" + + # On all ranks set to None so we can pass them to functions + sizes_list = None + prompts_tokens_cuda_long_tensor = None + prompts_length_cuda_long_tensor = None + + # On the specified rank, build the above. + if torch.distributed.get_rank() == rank: + assert prompts is not None + assert tokens_to_generate is not None + # Tensor of tokens padded and their unpadded length. + prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \ + _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS) + # We need the sizes of these tensors for the boradcast + sizes_list = [prompts_tokens_cuda_long_tensor.size(0), # Batch size + prompts_tokens_cuda_long_tensor.size(1)] # Sequence lenght + + # First, broadcast the sizes. + sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank) + + # Now that we have the sizes, we can boradcast the tokens + # and length tensors. + sizes = sizes_tensor.tolist() + prompts_tokens_cuda_long_tensor = broadcast_tensor( + sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank) + prompts_length_cuda_long_tensor = broadcast_tensor( + sizes[0], torch.int64, tensor=prompts_length_cuda_long_tensor, + rank=rank) + + return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor + + +def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS): + """Given a set of prompts and number of tokens to generate: + - tokenize prompts + - set the sequence length to be the max of length of prompts + plus the number of tokens we would like to generate + - pad all the sequences to this length so we can convert them + into a 2D tensor. + """ + + # Tokenize all the prompts. + tokenizer = get_tokenizer() + if add_BOS: + prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt) + for prompt in prompts] + else: + prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts] + + # Now we have a list of list of tokens which each list has a different + # size. We want to extend this list to: + # - incorporate the tokens that need to be generated + # - make all the sequences equal length. + # Get the prompts length. + prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens] + # Get the max prompts length. + max_prompt_len = max(prompts_length) + # Set the tokens to generate to the max prompts length for Retro + args = get_args() + if args.retro_add_retriever: + tokens_to_generate = max_prompt_len + # Number of tokens in the each sample of the batch. + samples_length = max_prompt_len + tokens_to_generate + # Now update the list of list to be of the same size: samples_length. + for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length): + padding_size = samples_length - prompt_length + prompt_tokens.extend([tokenizer.eod] * padding_size) + + # Now we are in a structured format, we can convert to tensors. + prompts_tokens_tensor = torch.cuda.LongTensor(prompts_tokens) + prompts_length_tensor = torch.cuda.LongTensor(prompts_length) + + return prompts_tokens_tensor, prompts_length_tensor + + +def retro_generate_and_post_process(model, + prompts=None, + neighbours_array=None, + tokens_to_generate=0, + return_output_log_probs=False, + top_k_sampling=0, + top_p_sampling=0.0, + temperature=1.0, + add_BOS=False, + use_eod_token_for_early_termination=True, + random_seed=-1, + logits_mask=None): + """Run inference and post-process outputs, i.e., detokenize, + move to cpu and convert to list.""" + + # Main inference. + tokens, lengths, output_log_probs = retro_generate( + model, + prompts=prompts, + neighbours_array=neighbours_array, + tokens_to_generate=tokens_to_generate, + return_output_log_probs=return_output_log_probs, + top_k_sampling=top_k_sampling, + top_p_sampling=top_p_sampling, + temperature=temperature, + add_BOS=add_BOS, + use_eod_token_for_early_termination=use_eod_token_for_early_termination, + random_seed=random_seed, + logits_mask=logits_mask) + + # Only post-process on first stage. + if mpu.is_pipeline_first_stage(): + tokens, prompts_plus_generations, prompts_plus_generations_segments = \ + detokenize_generations(tokens, lengths, True) + + if return_output_log_probs: + output_log_probs = output_log_probs.cpu().numpy().tolist() + for i, (prob, seg) in enumerate(zip(output_log_probs, prompts_plus_generations_segments)): + output_log_probs[i] = prob[:len(seg) - 1] + + return prompts_plus_generations, prompts_plus_generations_segments, \ + output_log_probs, tokens + + return None + + +def retro_generate(model, + prompts=None, + neighbours_array=None, + tokens_to_generate=0, + return_output_log_probs=False, + top_k_sampling=0, + top_p_sampling=0.0, + temperature=1.0, + add_BOS=False, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False, + random_seed=-1, + logits_mask=None): + """Given prompts and input parameters, run inference and return: + tokens: prompts plus the generated tokens. + lengths: length of the prompt + generations. Note that we can + discard tokens in the tokens tensor that are after the + corresponding length. + output_log_probs: log probs of the tokens. + """ + + # Make sure input params are avaialble to all ranks. + values = [tokens_to_generate, + return_output_log_probs, + top_k_sampling, top_p_sampling, + temperature, add_BOS, use_eod_token_for_early_termination, + stop_on_double_eol, + stop_on_eol, + random_seed] + values_float_tensor = broadcast_float_list(10, float_list=values) + tokens_to_generate = int(values_float_tensor[0].item()) + return_output_log_probs = bool(values_float_tensor[1].item()) + top_k_sampling = int(values_float_tensor[2].item()) + top_p_sampling = values_float_tensor[3].item() + temperature = values_float_tensor[4].item() + add_BOS = bool(values_float_tensor[5].item()) + use_eod_token_for_early_termination = bool(values_float_tensor[6].item()) + stop_on_double_eol = bool(values_float_tensor[7].item()) + stop_on_eol = bool(values_float_tensor[8].item()) + random_seed = int(values_float_tensor[9].item()) + + if random_seed != -1: + torch.random.manual_seed(random_seed) + + # Tokenize prompts and get the batch. + # Note that these tensors are broadcaseted to all ranks. + if torch.distributed.get_rank() == 0: + assert prompts is not None + + context_tokens_tensor, context_length_tensor = tokenize_prompts( + prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS) + + retro_args = get_retro_args() + retro_args.retro_gpt_chunk_length = context_length_tensor.item() + + retro_args = get_retro_args() + args = get_args() + r = retro_args.retro_gpt_retrieved_length + l = int(np.ceil(min(args.max_position_embeddings, context_tokens_tensor.size(1)) / retro_args.retro_gpt_chunk_length)) + if torch.distributed.get_rank() == 0: + neighbours_array = neighbours_array.reshape(1, args.retro_num_neighbors, r).repeat(l, axis=0) ## dim (l, k, r) + + if tokens_to_generate == 0: + return score_and_return_on_first_stage( + model, context_tokens_tensor, context_length_tensor) + + # Main inference function. + # Note that the outputs are available on the first stage. + return retro_generate_tokens_probs_and_return_on_first_stage( + model, context_tokens_tensor, context_length_tensor, + neighbours_array=neighbours_array, + return_output_log_probs=return_output_log_probs, + top_k=top_k_sampling, + top_p=top_p_sampling, + temperature=temperature, + use_eod_token_for_early_termination=use_eod_token_for_early_termination, + stop_on_double_eol=stop_on_double_eol, + stop_on_eol=stop_on_eol, + logits_mask=logits_mask) \ No newline at end of file diff --git a/tools/retro/text_generation/retro_generate.sh b/tools/retro/text_generation/retro_generate.sh new file mode 100644 index 0000000..53f7d76 --- /dev/null +++ b/tools/retro/text_generation/retro_generate.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +TASK=$1 +model_size=$2 +sampling=$3 +split=$4 +gen_start=$5 +num_gen=$6 +ckpt_step=${7} +ft_neighbours=${8} +model_card=${9} +ckpt=${10} +K=${11} +retrieve=${12} + +QA_HOME="" + +TOKENIZER_MODEL="" + +RETRO_WORKDIR="" + + +if [[ $model_size == "843m" ]]; then + mod_par=1 + layers=24 + hid_dim=1024 + heads=16 + pip_par=1 +fi + +GPT_ARGS="--apply-layernorm-1p \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --no-position-embedding \ + --use-rotary-position-embeddings \ + --rotary-percent 0.5 \ + --swiglu \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --pipeline-model-parallel-size $pip_par \ + --tensor-model-parallel-size $mod_par \ + --num-layers $layers \ + --hidden-size $hid_dim \ + --num-attention-heads $heads \ + --seq-length 4096 \ + --max-position-embeddings 4096 \ + --lr-decay-style cosine \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --clip-grad 1.0 \ + --weight-decay 0.01 \ + --adam-beta1 0.9 \ + --adam-beta2 0.98 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --bf16 \ +" + + +sample_input_file="/path/to/instruct_tuning/data/$TASK/${split}.json" + +top_k=1 +micro_bsz=1 +SAMPLE_ARGS="--top_k $top_k" + +CHECKPOINT_PATH=${ckpt} +sample_output_file="${CHECKPOINT_PATH}/retro-generate-${TASK}_${ft_neighbours}_${K}_${model_size}_${split}_${sampling}_${gen_start}_${num_gen}_${ckpt_step}.txt" + +DIR=`pwd` + +echo $sample_input_file +echo $sample_output_file + + +GEN_ARGS="$SAMPLE_ARGS \ + --gen-start-idx $gen_start \ + --num-gen $num_gen \ + --ckpt-step ${ckpt_step} \ + --sample-input-file $sample_input_file \ + --sample-output-file $sample_output_file \ + --retro-workdir ${RETRO_WORKDIR} \ + --retro-add-retriever \ + --retro-num-neighbors ${K} \ + --reuse-top \ + --retro-attention-gate 0 \ + " + +if [[ $retrieve == 1 ]]; then + GEN_ARGS="$GEN_ARGS \ + --use-retrieved-neighbours \ + " +fi + +FT_ARGS="--eod-mask-loss \ + --answer-loss-only \ + --ft_neighbours ${ft_neighbours} \ + --task $TASK" + +DISTRIBUTED_ARGS="--nproc_per_node ${mod_par} \ + --nnodes ${pip_par} \ + --node_rank 0 \ + --master_port 8889" + +######## Command. ######## + +COMMAND="python -m torch.distributed.run $DISTRIBUTED_ARGS ${DIR}/tools/retro/text_generation/retro_text_generation.py" + +COMMAND="$COMMAND \ + $GPT_ARGS \ + $GEN_ARGS \ + --load $CHECKPOINT_PATH \ + --micro-batch-size $micro_bsz \ + $FT_ARGS" + +export NCCL_DEBUG=INFO +export NCCL_IB_TIMEOUT=19 +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 + + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "CMD = '$CMD'." +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" +eval $COMMAND + diff --git a/tools/retro/text_generation/retro_generation.py b/tools/retro/text_generation/retro_generation.py new file mode 100644 index 0000000..f69103d --- /dev/null +++ b/tools/retro/text_generation/retro_generation.py @@ -0,0 +1,250 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + + +"""Generation utilities.""" +import torch +import torch.nn.functional as F +from megatron.training import get_args, get_tokenizer +from megatron.training import get_retro_args +from megatron.core import mpu +from megatron.training.utils import get_ltor_masks_and_position_ids, unwrap_model +from megatron.inference.text_generation.communication import ( + copy_from_last_to_first_pipeline_stage, + broadcast_from_last_pipeline_stage, + broadcast_from_last_to_first_pipeline_stage, broadcast_int_list, broadcast_tensor) +from megatron.inference.text_generation.generation import _build_attention_mask_and_position_ids +from megatron.inference.text_generation.sampling import sample + + + +def retro_generate_tokens_probs_and_return_on_first_stage( + model, tokens, lengths, neighbours_array=None, + return_output_log_probs=False, + top_k=0, top_p=0.0, + temperature=1.0, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False, + logits_mask=None): + """Main token generation function. + + Args: + model: no interleaving is supported. + tokens: prompt tokens extended to be of size [b, max-sequence-length] + lengths: original prompt length, size: [b] + neighbours_array: neighbours array of size [b, l, k, r] + return_output_log_probs: flag to calculate the log probability of + the generated tokens. Note that the log probability is the one + from the original logit. + top_k, top_p: top-k and top-p sampling parameters. + Note that top-k = 1 is gready. Also, these paramters are + exclusive meaning that: + if top-k > 0 then we expect top-p=0. + if top-p > 0 then we check for top-k=0. + temperature: sampling temperature. + use_eod_token_for_early_termination: if True, do early termination if + all the sequences have reached this token. + Note: Outside of model, other parameters only need to be available on + rank 0. + + Returns: Note that is size is adjusted to a lower value than + max-sequence-length if generation is terminated early. + tokens: prompt and generated tokens. size: [b, :] + generated_sequence_lengths: total length (including prompt) of + the generated sequence. size: [b] + output_log_probs: log probability of the selected tokens. size: [b, s] + """ + + args = get_args() + retro_args = get_retro_args() + + tokenizer = get_tokenizer() + + batch_size = tokens.size(0) + min_prompt_length = lengths.min().item() + max_sequence_length = tokens.size(1) + print("max_sequence_length", max_sequence_length) + print("min_prompt_length", min_prompt_length) + max_sequence_length = min(max_sequence_length, args.max_position_embeddings) + + # If the context is too big, this happens + if min_prompt_length >= max_sequence_length: + raise ValueError("context length + tokens_to_generate too large") + + # forward step. + unwrapped_model = unwrap_model( + model) + unwrapped_model.language_model.seq_length = max_sequence_length + + # Added termination_id to support the case that we want to terminate the + # generation once that id is generated. + if hasattr(args, 'eos_id'): + termination_id = args.eos_id + else: + termination_id = tokenizer.eod + + # =================== + # Pre-allocate memory + # =================== + + # Log probability of the sequence (prompt + generated tokens). + output_log_probs = None + output_log_probs_size = (batch_size, max_sequence_length - 1) + # Lengths of generated seuquence including including prompts. + generated_sequence_lengths = None + if mpu.is_pipeline_last_stage(): + if return_output_log_probs: + output_log_probs = torch.empty(output_log_probs_size, + dtype=torch.float32, + device=torch.cuda.current_device()) + generated_sequence_lengths = torch.ones( + batch_size, dtype=torch.int64, + device=torch.cuda.current_device()) * max_sequence_length + + # Whether we have reached a termination id. + is_generation_done = torch.zeros(batch_size, dtype=torch.uint8, + device=torch.cuda.current_device()) + + # ============= + # Run infernece + # ============= + + with torch.no_grad(): + attention_mask, position_ids = _build_attention_mask_and_position_ids( + tokens) + for context_length in range(min_prompt_length, max_sequence_length): + prev_context_length = 0 + sizes_list = None + neighbor_tokens_cuda_long_tensor = None + + # get the chunks for retrieval + if torch.distributed.get_rank() == 0: + neighbor_tokens = neighbours_array + neighbor_tokens_cuda_long_tensor = torch.cuda.LongTensor( + neighbor_tokens.reshape((-1, retro_args.retro_gpt_retrieved_length))) + sizes_list = [neighbor_tokens_cuda_long_tensor.size(0), # Batch size + neighbor_tokens_cuda_long_tensor.size(1)] # Sequence lenght + sizes_tensor = broadcast_int_list(2, int_list=sizes_list) + sizes = sizes_tensor.tolist() + neighbor_tokens_cuda_long_tensor = broadcast_tensor( + sizes, torch.int64, tensor=neighbor_tokens_cuda_long_tensor) + + _, _, neighbor_position_ids = get_ltor_masks_and_position_ids( + neighbor_tokens_cuda_long_tensor, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + neighbor_attention_mask = None + + # Pick the slice that we need to pass through the network. + tokens2use = tokens[:, prev_context_length:4096] + positions2use = position_ids[:, prev_context_length:4096] + attention_mask2use = attention_mask[ + ..., prev_context_length:4096, :4096] + + logits = model(tokens2use, positions2use, attention_mask2use, + retriever_input_ids=neighbor_tokens_cuda_long_tensor, + retriever_position_ids=neighbor_position_ids, retriever_attn_mask=neighbor_attention_mask, + ) + + if mpu.is_pipeline_last_stage(): + # Always the last stage should have an output. + assert logits is not None + + # Sample. + last_token_logits = logits[:, context_length - 1, :] + # last_token_logits = logits[:, -1, :] + + # word banning + if logits_mask is not None: + last_token_logits[:, logits_mask] = float('-Inf') + + new_sample = sample(last_token_logits, + top_k=top_k, + top_p=top_p, + temperature=temperature, + vocab_size=tokenizer.vocab_size) + + # If a prompt length is smaller or equal th current context + # length, it means we have started generating tokens + started = lengths <= context_length + # Update the tokens. + tokens[started, context_length] = new_sample[started] + + # Calculate the log probabilities. + if return_output_log_probs: + log_probs = F.log_softmax(logits, dim=2) + if return_output_log_probs: + # Pick the tokens that we need to get the log + # probabilities for. Note that next input token is + # the token which we selected in the current logits, + # so shift by 1. + indices = torch.unsqueeze( + tokens[ + :, + (prev_context_length + 1):(context_length + 1)], + 2) + output_log_probs[:, + prev_context_length:context_length] = \ + torch.gather(log_probs, 2, indices).squeeze(2) + + # Update the tokens on the first stage so the next input to + # the network is correct. + copy_from_last_to_first_pipeline_stage(batch_size, torch.int64, + tokens[:, context_length]) + + # Update the context length for the next token generation. + prev_context_length = context_length + + # Check if all the sequences have hit the termination_id. + done = None + if mpu.is_pipeline_last_stage(): + # TODO(rprenger) These stopping methods are tokenizer dependent + # instead tokenization should be in the inference loop so stop sequences can be used + if stop_on_double_eol: + hit_double_eol = (new_sample == 628).byte() & started.byte() + hit_two_eols = (new_sample == 198).byte() & ( + tokens[:, context_length - 1] == 198).byte() & started.byte() + done_token = hit_double_eol | hit_two_eols + elif stop_on_eol: + hit_double_eol = (new_sample == 628).byte() & started.byte() + hit_eol = (new_sample == 198).byte() & started.byte() + done_token = hit_double_eol | hit_eol + elif context_length > min_prompt_length + 64: # previous retrov1 limitations + done_token = 1 + else: + done_token = (new_sample == termination_id).byte() & \ + started.byte() + + just_finished = (done_token & ~is_generation_done).bool() + generated_sequence_lengths[just_finished.view(-1)] = \ + context_length + 1 + is_generation_done = is_generation_done | done_token + done = torch.all(is_generation_done) + done = broadcast_from_last_pipeline_stage(1, torch.uint8, + tensor=done) + if use_eod_token_for_early_termination and done: + break + + # =================================================== + # Update the length of based on max generated length. + # =================================================== + + tokens = tokens[:, :(context_length + 1)] + if mpu.is_pipeline_last_stage(): + if return_output_log_probs: + output_log_probs = output_log_probs[:, :context_length] + + # ====================================== + # Broadcast to the first pipeline stage. + # ====================================== + + generated_sequence_lengths = broadcast_from_last_to_first_pipeline_stage( + batch_size, torch.int64, generated_sequence_lengths) + if return_output_log_probs: + output_log_probs_size = (batch_size, context_length) + output_log_probs = broadcast_from_last_to_first_pipeline_stage( + output_log_probs_size, torch.float32, output_log_probs) + + return tokens, generated_sequence_lengths, output_log_probs diff --git a/tools/retro/text_generation/retro_text_generation.py b/tools/retro/text_generation/retro_text_generation.py new file mode 100644 index 0000000..2705009 --- /dev/null +++ b/tools/retro/text_generation/retro_text_generation.py @@ -0,0 +1,263 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Sample Generate GPT""" +import torch +import os +import sys +from typing import Union + +sys.path.append(os.path.abspath(os.path.join( + os.path.join(os.path.dirname(__file__), "../../../")))) +from megatron.training import get_args, get_retro_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from megatron.core.models.gpt import GPTModel +from megatron.training import get_model +from tools.retro.text_generation.retro_api import retro_generate_and_post_process +from tools.retro.sft.sft_retro import get_tasks_args +from tools.retro.sft.dataset_conv import reformat_prompt, preprocess, reformat_prompt_short +import numpy as np +import time +import megatron.legacy.model +from megatron.training.arguments import core_transformer_config_from_args + + + +def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: + """Builds the model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + + + Returns: + Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model + """ + print_rank_0('building GPT model ...') + args = get_args() + config = core_transformer_config_from_args(args) + + assert args.use_legacy_models, 'retro text generation only implemented for legacy models' + + # not support core model yet + model = megatron.legacy.model.GPTModel( + config, + num_tokentypes=0, + parallel_output=False, + pre_process=pre_process, + post_process=post_process + ) + + return model + + +def pad_neighbours_for_query_only(args, nb_tokens, pad_id, ft_neighbours): + # take top k neighbours and padding + neighbours_tokens = [] + retro_args = get_retro_args() + r = retro_args.retro_gpt_retrieved_length + + if args.reuse_top: + valid_nb_tokens = nb_tokens[:args.retro_num_neighbors] + else: + valid_nb_tokens = nb_tokens[ft_neighbours:args.retro_num_neighbors + ft_neighbours] + + for nb_token in valid_nb_tokens: + if len(nb_token) >= r: + nb_token = nb_token[:r] + else: + nb_token = nb_token + [pad_id] * (r - len(nb_token)) + neighbours_tokens.append(nb_token) + print("len(nb_tokens)", len(nb_tokens)) + print("len(neighbours_tokens)", len(neighbours_tokens)) + print("args.retro_num_neighbors", args.retro_num_neighbors) + + if len(neighbours_tokens) < args.retro_num_neighbors: + assert ValueError("neighbours are not enough, add empty ones and create mask for those empty ones") + neighbours_tokens = np.array(neighbours_tokens) + return neighbours_tokens + + +def add_text_generate_args(parser): + """Text generation arguments.""" + + parser = get_tasks_args(parser) + group = parser.add_argument_group(title='text generation') + + group.add_argument("--temperature", type=float, default=1.0, + help='Sampling temperature.') + group.add_argument("--greedy", action='store_true', default=False, + help='Use greedy sampling.') + group.add_argument("--top_p", type=float, default=0.0, + help='Top p sampling.') + group.add_argument("--top_k", type=int, default=0, + help='Top k sampling.') + group.add_argument("--out-seq-length", type=int, default=256, + help='Size of the output generated text.') + group.add_argument("--sample-input-file", type=str, default=None, + help='Get input from file instead of interactive mode, ' + 'each line is an input.') + group.add_argument("--sample-output-file", type=str, default=None, + help='Output file got from --sample-input-file') + group.add_argument("--num-samples", type=int, default=0, + help='Number of samples to generate unconditionally, ' + 'defaults to 0 and interactive conditional sampling') + group.add_argument("--genfile", type=str, + help='Output file when generating unconditionally') + group.add_argument("--recompute", action='store_true', + help='During generation recompute all attention ' + 'instead of using previously computed keys/values.') + group.add_argument("--epsilon", type=float, default=0.01, + help="Minimum factor by which each probability is multiplied") + group.add_argument("--debug-gen", action='store_true', + help="If set, additional debugging output is printed to stdout") + group.add_argument('--length-penalty', type=float, default=1.0, + help='length penalty') + group.add_argument('--gen-start-idx', type=int, default=0, + help='project size for adapters') + group.add_argument('--num-gen', type=int, default=-1, + help='project size for adapters') + group.add_argument('--ckpt-step', type=int, default=None, + help='setting ckpt step manually') + group.add_argument("--short-format", action='store_true', + help='Use short format QA') + group.add_argument("--use-retrieved-neighbours", action='store_true', default=False, + help='Use retrieved neighbours') + group.add_argument('--template-id', type=int, default=0, + help='template id for generation,') + return parser + + +def generate_samples_conditional(model): + args = get_args() + start = time.time() + avg_time = [] + tokenizer = get_tokenizer() + model.eval() + if torch.distributed.get_rank() == 0: + + data = preprocess(args.sample_input_file, inference_only=True, + retrieved_neighbours=args.use_retrieved_neighbours) + print("total rows {}".format(len(data))) + all_data = data[args.gen_start_idx:] # start from gen_start_idx + if args.num_gen > 0: + all_data = all_data[:args.num_gen] + input_count = len(all_data) + input_pos = 0 + + terminate_runs = 0 + while True: + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + sentences = [] + n_arrays = [] + print("global batch size", args.global_batch_size) + for _ in range(args.global_batch_size): + print(input_pos) + if input_pos >= input_count: + print("reach the last row") + break + else: + sample = all_data[input_pos] + input_pos += 1 + + if True: + max_target_len = args.out_seq_length + query, _, neighbours = sample + + neighbours_array = pad_neighbours_for_query_only(args, + [tokenizer.tokenize(neighbour) for neighbour in + neighbours], tokenizer.eod, args.ft_neighbours) + print("neighbours_array.shape", neighbours_array.shape) + tokenizer = get_tokenizer() + + if args.short_format: + input_tokens = reformat_prompt_short(query, neighbours, args.task, args.ft_neighbours, + max_target_len, + tokenizer, args.seq_length) + else: + input_tokens = reformat_prompt(query, neighbours, args.task, args.ft_neighbours, max_target_len, + tokenizer, args.seq_length, template_id=args.template_id) + raw_text = tokenizer.detokenize(input_tokens) + print(raw_text) + else: + raise ValueError("invalid arg for task") + sentences.append(raw_text) + retro_args = get_retro_args() + + resp_sentences, resp_sentences_seg, scores, \ + tokens = retro_generate_and_post_process(model, prompts=sentences, + neighbours_array=neighbours_array, + tokens_to_generate=args.seq_length - retro_args.retro_gpt_chunk_length, + return_output_log_probs=False, + top_k_sampling=args.top_k, + top_p_sampling=args.top_p, + add_BOS=False, + temperature=1.0) + print("len of resp_sentences", len(resp_sentences)) + for prompt, generation in zip(sentences, resp_sentences): + datum = generation[len(prompt):] + print("prompt:", generation[:len(prompt)]) + if "<|endoftext|>" in datum: + datum = datum[:datum.find("<|endoftext|>")].strip() + datum = datum.replace("\n", " ") + print("cont:", datum) + yield datum + avg_time.append((time.time() - start) / args.global_batch_size) + print("avg time for each sample: ", sum(avg_time) / len(avg_time)) + start = time.time() + if input_pos >= input_count: + print("finish all lines") + terminate_runs = 1 + else: + retro_generate_and_post_process(model) + + terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs]) + torch.distributed.broadcast(terminate_runs_tensor, 0) + terminate_runs = terminate_runs_tensor[0].item() + + if terminate_runs == 1: + return + + +def generate_and_write_samples_conditional(model): + args = get_args() + if args.sample_output_file is None: + sample_output_file = args.sample_input_file + ".out" + print('`sample-output-file` not specified, setting ' + 'it to {}'.format(sample_output_file)) + else: + sample_output_file = args.sample_output_file + with open(sample_output_file, 'w') as f: + for datum in generate_samples_conditional(model): + if torch.distributed.get_rank() == 0: + f.write(datum + '\n') + + +def main(): + """Main program.""" + + initialize_megatron(extra_args_provider=add_text_generate_args, + args_defaults={'no_load_rng': True, + 'no_load_optim': True}) + + # Set up model and load checkpoint + model = get_model(model_provider, wrap_with_ddp=False) + print(model) + args = get_args() + + if args.load is not None: + _ = load_checkpoint(model, None, None) + model = model[0] + + # Generate samples. + if args.sample_input_file is not None: + print(f"{args.sample_input_file}") + generate_and_write_samples_conditional(model) + + +if __name__ == "__main__": + main() diff --git a/tools/run_mamba_text_generation_server.py b/tools/run_mamba_text_generation_server.py new file mode 100644 index 0000000..844d018 --- /dev/null +++ b/tools/run_mamba_text_generation_server.py @@ -0,0 +1,121 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Sample Generate Mamba""" +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir))) +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.core import mpu +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from megatron.core.models.mamba.mamba_model import MambaModel +from megatron.core.transformer.spec_utils import import_module +from megatron.training import get_model +from megatron.training.arguments import core_transformer_config_from_args +from megatron.inference.text_generation_server import MegatronServer +from megatron.inference.text_generation import generate_and_post_process +from megatron.inference.text_generation import beam_search_and_post_process + +import torch + +def count_parameters_in_layer(model, layer_name): + num_params = 0 + for name, param in model.named_parameters(): + if layer_name in name: + num_params += param.numel() + print_rank_0(f" - {name}: {param.numel()}") + return num_params + +# Taken from pretrain_mamba.py +def model_provider(pre_process=True, post_process=True) -> MambaModel: + """Builds the model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + + + Returns: + MambaModel: The returned model + """ + args = get_args() + + print_rank_0('building Mamba model ...') + config = core_transformer_config_from_args(get_args()) + + assert args.use_legacy_models == False, "Mamba only supported in Mcore!" + + if args.spec is not None: + mamba_stack_spec = import_module(args.spec) + else: + raise("You must provide a valid Mamba layer spec!") + + model = MambaModel( + config=config, + mamba_stack_spec=mamba_stack_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + hybrid_attention_ratio=args.hybrid_attention_ratio, + hybrid_mlp_ratio=args.hybrid_mlp_ratio, + hybrid_override_pattern=args.hybrid_override_pattern, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type + ) + + for l in range(model.decoder.num_layers_per_pipeline_rank): + layer_params = count_parameters_in_layer(model, f'decoder.layers.{l}.') + print_rank_0(f" == params layer {l}: {layer_params}") + + return model + +def add_text_generate_args(parser): + group = parser.add_argument_group(title='text generation') + group.add_argument("--port", type=int, default=5000, + help='port for text generation server to run on') + return parser + + +if __name__ == "__main__": + initialize_megatron(extra_args_provider=add_text_generate_args, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer', + 'no_load_rng': True, + 'no_load_optim': True}) + + args = get_args() + if args.num_layers_per_virtual_pipeline_stage is not None: + print("Interleaved pipeline schedule is not yet supported for text generation.") + exit() + print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text " + "generation.") + args.exit_on_missing_checkpoint = True + # Set up model and load checkpoint + model = get_model(model_provider, wrap_with_ddp=False) + + if args.load is not None: + _ = load_checkpoint(model, None, None) + + assert len(model) == 1, "Above condition should have caught this" + model = model[0] + if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: + server = MegatronServer(model) + server.run("0.0.0.0",port=args.port) + + while True: + choice = torch.tensor(1, dtype=torch.long, device='cuda') + torch.distributed.broadcast(choice, 0) + if choice.item() == 0: + try: + generate_and_post_process(model) + except ValueError as ve: + pass + elif choice.item() == 1: + try: + beam_search_and_post_process(model) + except ValueError as ve: + pass diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py new file mode 100644 index 0000000..3dad098 --- /dev/null +++ b/tools/run_text_generation_server.py @@ -0,0 +1,133 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Sample Generate GPT""" +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir))) +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.core import mpu +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from megatron.core.models.gpt import GPTModel +from megatron.training import get_model +from megatron.training.arguments import core_transformer_config_from_args +from megatron.training.yaml_arguments import core_transformer_config_from_yaml +from megatron.inference.text_generation_server import MegatronServer +from megatron.inference.text_generation import generate_and_post_process +from megatron.inference.text_generation import beam_search_and_post_process +from megatron.core.transformer.spec_utils import import_module +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) + +import torch +from typing import Union +import megatron + + +def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: + """Builds the model. + + If you set the use_legacy_models to True, it will return the legacy GPT model and if not the core GPT model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + + + Returns: + Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model + """ + + args = get_args() + use_te = args.transformer_impl == "transformer_engine" + + print_rank_0('building GPT model ...') + + # Experimental loading arguments from yaml + if args.yaml_cfg is not None: + config = core_transformer_config_from_yaml(args, "language_model") + else: + config = core_transformer_config_from_args(args) + + if args.use_legacy_models: + model = megatron.legacy.model.GPTModel( + config, + num_tokentypes=0, + parallel_output=False, + pre_process=pre_process, + post_process=post_process + ) + else: + if args.spec is not None: + transformer_layer_spec = import_module(args.spec) + else: + if use_te: + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm) + else: + transformer_layer_spec = get_gpt_layer_local_spec(args.num_experts, args.moe_grouped_gemm) + + model = GPTModel( + config=config, + transformer_layer_spec=transformer_layer_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=False, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent + ) + + return model + +def add_text_generate_args(parser): + group = parser.add_argument_group(title='text generation') + group.add_argument("--port", type=int, default=5000, + help='port for text generation server to run on') + return parser + + +if __name__ == "__main__": + initialize_megatron(extra_args_provider=add_text_generate_args, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer', + 'no_load_rng': True, + 'no_load_optim': True}) + + args = get_args() + if args.num_layers_per_virtual_pipeline_stage is not None: + print("Interleaved pipeline schedule is not yet supported for text generation.") + exit() + print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text " + "generation.") + args.exit_on_missing_checkpoint = True + # Set up model and load checkpoint + model = get_model(model_provider, wrap_with_ddp=False) + + if args.load is not None: + _ = load_checkpoint(model, None, None) + + assert len(model) == 1, "Above condition should have caught this" + model = model[0] + if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: + server = MegatronServer(model) + server.run("0.0.0.0",port=args.port) + + while True: + choice = torch.tensor(1, dtype=torch.long, device='cuda') + torch.distributed.broadcast(choice, 0) + if choice.item() == 0: + try: + generate_and_post_process(model) + except ValueError as ve: + pass + elif choice.item() == 1: + try: + beam_search_and_post_process(model) + except ValueError as ve: + pass diff --git a/tools/run_vlm_text_generation.py b/tools/run_vlm_text_generation.py new file mode 100644 index 0000000..b42196f --- /dev/null +++ b/tools/run_vlm_text_generation.py @@ -0,0 +1,218 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Generate text using a vision language model.""" +import glob +import json +import logging +import os +import sys +from collections import defaultdict +from functools import partial + +# Add megatron to the path. +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) + +import numpy as np +import torch +from PIL import Image +from torchvision.transforms import Compose, Resize, ToPILImage + +from megatron.inference.text_generation.api import generate_and_post_process +from megatron.inference.text_generation.forward_step import ForwardStep +from megatron.training import get_args, get_model, print_rank_0 +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from pretrain_vlm import model_provider + + +def add_text_generation_args(parser): + """Text generation arguments.""" + group = parser.add_argument_group(title='Vision language model text generation') + + group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') + group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') + group.add_argument("--top_k", type=int, default=0, help='Top k sampling.') + group.add_argument( + "--out-seq-length", type=int, default=1024, help='Size of the output generated text.' + ) + group.add_argument("--output-path", type=str, required=True, help='Output file path') + group.add_argument('--input-path', type=str, required=True, help="Input directory") + group.add_argument( + '--num-partitions', type=int, default=0, help="Number of partitions for inputs." + ) + group.add_argument('--partition-id', type=int, default=0, help="Partition index") + group.add_argument("--drop-vision-class-token", action="store_true", default=False) + group.add_argument("--gt-path", type=str, help="Optional ground truth file") + + return parser + + +def preprocess_image(target_h, target_w, img): + """Example image preprocessing. Resizes input image to target size. + + Args: + target_h (int): Target height in pixels. + target_w (int): Target width in pixels + img (np.array [h, w, c]): Input image in a numpy array. + + Returns: + output_img (torch.Tensor [c, h, w]): Input image resized to target size. + """ + # Imagenet's mean and std for normalization. + pixel_mean = [123.675, 116.28, 103.53] + pixel_std = [58.395, 57.12, 57.375] + pixel_mean = torch.Tensor(pixel_mean).view(-1, 1, 1) + pixel_std = torch.Tensor(pixel_std).view(-1, 1, 1) + + # Resize image considering ratio between input and target image sizes. + img_h, img_w = img.shape[0], img.shape[1] + ratio = float(max(target_h, target_w)) / max(img_h, img_w) + + scaled_h, scaled_w = int(img_h * ratio + 0.5), int(img_w * ratio + 0.5) + + image_transform = Compose( + [ToPILImage(), Resize((scaled_h, scaled_w)), lambda x: x.convert("RGB")] + ) + img = image_transform(img) + + # Normalize pixel values. + img = (torch.Tensor(np.array(img)).permute(2, 0, 1) - pixel_mean) / pixel_std + + # Pad to target size. + delta_h, delta_w = target_h - scaled_h, target_w - scaled_w + output_img = torch.nn.functional.pad(img, (0, delta_w, 0, delta_h)) + + return output_img + + +def generate_samples(model): + """Text generation using a trained vision language model. This is an example for the COCO dataset.""" + args = get_args() + + image_files = sorted(glob.glob(args.input_path + "/*")) + # Optionally, process only a subset of the input files. + if args.num_partitions > 0: + per_part = len(image_files) // args.num_partitions + image_files = image_files[per_part * args.partition_id : per_part * (args.partition_id + 1)] + + num_samples = len(image_files) + images = [] + + # Run image preprocessing. + for image_file in image_files: + img = np.array(Image.open(image_file)) + img = preprocess_image(args.img_h, args.img_w, img) + + images.append(img.reshape(-1, 3, args.img_h, args.img_w)) + + # Load optional ground truth. + gt_image_id_to_captions = defaultdict(list) + if args.gt_path: + gts = json.load(open(args.gt_path)) + for gt in gts["annotations"]: + gt_image_id_to_captions[gt["image_id"]].append(gt['caption']) + + idx = 0 + while True: + image = images[idx].cuda() + image_id = int(image_files[idx].split("_")[-1].split(".")[0]) + + forward_step = partial(VLMForwardStep, image) + + if torch.distributed.get_rank() == 0: + prompt = "Give a short and clear explanation of the subsequent image.\n" + + resp_sentences, _, _, _ = generate_and_post_process( + model, + forward_step=forward_step, + prompts=[prompt], + tokens_to_generate=args.out_seq_length, + return_output_log_probs=False, + top_k_sampling=args.top_k, + top_p_sampling=args.top_p, + add_BOS=False, + temperature=args.temperature, + random_seed=123, + ) + + for prompt, generation in zip([prompt], resp_sentences): + output = { + "question_id": image_id, + "prompt": prompt, + "caption": generation[len(prompt) :], + } + + output["ground_truth"] = gt_image_id_to_captions[image_id] + + print_rank_0(output) + + yield output + idx += 1 + if idx >= num_samples: + break + else: + generate_and_post_process(model, forward_step=forward_step) + + idx += 1 + if idx >= num_samples: + break + + +def generate_and_write_samples(model): + args = get_args() + + for output in generate_samples(model): + if torch.distributed.get_rank() == 0: + with open(args.output_path, 'a') as f: + f.write(json.dumps(output) + "\n") + + +class VLMForwardStep(ForwardStep): + def __init__(self, images, model, max_batch_size, max_sequence_length): + super().__init__(model, max_batch_size, max_sequence_length) + self._images = images + + def _forward(self, tokens, position_ids, attention_mask): + return self.model( + self._images, + tokens, + position_ids, + attention_mask, + inference_params=self.inference_params, + ) + + def __call__(self, tokens, position_ids, attention_mask): + logits = super().__call__(tokens, position_ids, attention_mask) + + # On the first inference iteration, we compute image tokens. + # Update the sequence length offset by the number of image tokens. + num_tokens = tokens.size(1) + if num_tokens > 1: + self.inference_params.sequence_len_offset += self.inference_params.key_value_memory_dict[ + "image_tokens_count" + ] + + return logits + + +def main(): + """Vision language model text generation.""" + + logging.getLogger(__name__).warning("Models using pipeline parallelism are not supported yet.") + + initialize_megatron(extra_args_provider=add_text_generation_args) + + # Set up model and load checkpoint. + model = get_model(model_provider, wrap_with_ddp=False) + + args = get_args() + if args.load is not None: + _ = load_checkpoint(model, None, None) + + model = model[0] + model.eval() + + generate_and_write_samples(model) + + +if __name__ == "__main__": + main() diff --git a/tools/text_generation_cli.py b/tools/text_generation_cli.py new file mode 100644 index 0000000..223928c --- /dev/null +++ b/tools/text_generation_cli.py @@ -0,0 +1,23 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +import sys +import json +import requests + + +if __name__ == "__main__": + url = sys.argv[1] + url = 'http://' + url + '/api' + headers = {'Content-Type': 'application/json'} + + while True: + sentence = input("Enter prompt: ") + tokens_to_generate = int(eval(input("Enter number of tokens to generate: "))) + + data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate} + response = requests.put(url, data=json.dumps(data), headers=headers) + + if response.status_code != 200: + print(f"Error {response.status_code}: {response.json()['message']}") + else: + print("Megatron Response: ") + print(response.json()['text'][0])

Dc^k?l6xK!bO9r3r>RYkXso0LxOEhy_dEYJoY_?3A2KdFuOVl|vFEL>a~wAt*d z0$}Fg{lgM@E2$pKfVPmfisub185_P&kl;;r5QQwnYHyz(fj6-kLC?*^Dx_XJ%wRuc zM-2cgo1pvLbS~)B-YQ$PUxOZ$nfpq76gl`>$GYb=2vlE~|AK$hWU4*ZYjQ}5Cu}i^ z*^k}nP=01$+>HwW1r4=Tnr6jL{UNXt%h-l5fI1FW@*aEyD(0l#sZ;a>&hb&yZ=s%@ zPuiGp3(!L5eR(2bmAj*4U}PD%ZQ-xJ{+Q@v(ch~zaLFgA9GKlm?}0?f?hJIVG9GMj zV{{2w;kLu{R5HDwq=wTO4fk|0EjzzCTkFK5PXI`Dh-~y0v+Y%NckU7p=sLO3&d=BF zSV3F+^8@=iyI}f}xHqr@uSZJE8V`Lg>`N)D%I_|+&#U2uPGU6>CamY{q+B(X#;bO& zzk(Dpw(9|_PnRuXfe8KjkKw{UGHPlnmqqh4Xj``g@6VD$pf1_W^eC1Xn1gFil|cjD z$RsG=Kt;ScSag=#aVK%xqKGXZkbgcF@H}D&4mcH<>lpIgHJsRwbmhBQJfX4 zE3iTE}e*Iqh-DEkQUg2!!igWTsua6WqqP-=&ljBH9D6p|wH`2^OH=XOw$8F5qs~+p;z}tzI^hPLxqsT_L07ndndXwGCoVAsI?Psh@Z*c zyG~373b*@m!`-|p1?b;%iih4xOk&)ZI-os^)?-vQOWU4mtjh7i?A7J-jw1)0GbynO z#)Jez8oQ*kva)pTA}+2F<2-bttf3ilipgr#DFKxgGn=(UJmvf$S$^G)FJ`MO!x;9h zEJbX%#$9Pna?y*R`8&?ndcyq84*3aIMw^%)R*L8Ml_-|(q$3Zhy;HvtaFZNp%P$6b zceZRd_77?rq?z4npDqm)G3VsvM^I#NhqV_aYbo&5@BmG1WnlJoA8*7S?8q9E7;~IG zwEeCQq(^*rCQc;N!w~m+^K|@wCuE{7)R++qXr1}2`l+ahPts+=zG=~a6P8+=|McBu#+(i-2B+x)T=7UlrzRm+TU-51$p;|bB1p(VXUhQjyh|`e6Vo)c^2l@0E6#+ z30X1Er4d?2z_pODx4C(l21QtZtB!!>EqcZzN8mj`c2pS&eK^MbQp$)H~5+RbVa8s6Z!?^!G@ z*s$>qQ|cC)mPFwveSMM2&lCEetcuQdSk48&^*F1y%JESrROx&1 z%uCDV9~u1{85%}v&-{8$4PhQ1&8ljtv376x+`yfE|adgU&ZM8J7< zc2)(Cz@V_cB2Woz`Mm^&x`Mo%wwPuWJ3L!znen`DCn6-$$;rtWQzOvxWeyoCP^y2O zcMt2xe}b|;xiiXpKSE5?@qs0me3G@{K=b3*{7Rg0k~{((cl0l?`3`Ngw0H70yJJr@ zcc^y5N9xR)<}oys5PB+ZFP9GNm(4laiX%i@>qunw|8W8RW3F345ip)mBNi@C_M5~~ zMum}{c;T{txSV#K5F_dC`Y>wiRYkuqI=ubT0=;NwC*3`}=Pq!{k<% zCE7B&BXDyg=`Rt{JUa2>8RL#)hn>vx3k%JIbSjObGWMbCa&@c)_x+tUzI{?N-&$%z zgmku}q9jv=HrB-gK1g*v-g@=&PbcZFCMg*IkL2eszX2Z`QHjECWOak9`3;5qI^zUX z)zq9O`2V_zGNAQ}17@a4*bl$YoOX$xzD1f!Vy78ML4Sv@7?c7`TEb6?MeU1hop@m^7ttv^joe4OXrM|dP+tx4HA|_+~zWIKbt@-_h zd}MHX*ZRBfVk~=+0numo^~#J4b75O1g-+88rjNB7e6>tQH7=w>IJ1HEy#p1+m)^URQeq)1x|3CxeRU;A(9DLRHxUpw&dwD7&23yjI!G;PM`dVb(uB38NPyksyItz8Ctla%; zYLMJz8@6z5suDRB)KsQTv9EpTg-q3VC6DE$&U`?T|LxBEXFdGCb|41nfaK|9ixm$6 zmgo;>mD{?RtA}HfryD>4hVh!38azUn|SujPqn|xkkM6>*~GPT z@HS_xE6N(a+ZbM~ZQN2fIQ;IM-zdsQlsfpGDr##JW#7TU$+%m}Exb$Gj7FSR z?^bIe-V2C-s^8pnFsOFp#^G=&85!)pdAh|5ziRt#E(brl#$&MlW^h}{ajPSsqTd;t ze`%d<4W4gpZsJe@k&vXz)PRaY&QgSISRE&xhKGeU{@9dqIto}?MpDDr78-m zB<&@MJz#I2{$B}(e^*wo+`UP5`t<3P%1V*zUK`4R9EsxIcjLsEW*rVWdxVC!>s=|& z?Y6|W=TX&kS@6D%iKo{rj*;Wou~h0T1Ler5q?}=Q_jnb>@}{7sf|m#Cx9MCtG*h-= zd6C4mz@izY0qB>Kt~e1-YjB)RsCMFD!S(kSiZTA;dL`$#wvcuc3|>H0d*lLVj*#3l zKg{P9A$E52ha{iW9>aq~jg1_H&r)pNii$Pn2caR2g~Z^Hza5MJ71Uy24+u!EvygBX zR!dePA9HIgty7pS*>1Gq_a96lV5QR#+py6PIvVxH+v`=XzDjr$Y*JD_XJU-+=#(^g z2h!c|RdL?&7WV^~0dbl5Wrm2sptVaCz0UU-as3RPt*cAR5&$K2jRuBNq&KC6XsT+L z-qRWqr|t`BD@q?aaBeeh{kkRWm~H*g%gb`_$uxBH2B%2enKp?&qg zP6g)IgyR=mIfEKET;RnKSA8h>v?NXgHgG{NTrPr3vxTFCJeXesV+GhrWCF zCLuwHZjLj+8|30$Ba8z7A;I{s#p!QK+S>yXZExPDVEzIbXLFOwBBh+C19AMwrNi92 zkb*1Oi~WQ}?+O;Zrs$recAQUNS$+7JMJPVmwgQ?SqL)E*SZrMd2u=V z_k|WZ;w};TkJa~&lBl{2nKS(#&zhgifQLE>bA3##hq}T^hCA(*GC$q`Fj6u$v8T^p zfb&pMASyDFCTd=xFIgqWx~!=tc0hM*wf26rX|U}oebZ2#_YyQ6-ZuOaKfbii#uQb` z#peq8qS8e1kjOuV$$v9_|2d8UPuLwWqp=p~;lvuI+5Om~cMMMyt60%xQb{!Ne7{&J&CSi3K2K)U z7{EnoF5gX`JZ`v`m7Q%YYcuvV{=A=;#lg$(ovyb`NF-$F=|}`*sGws+Ykz#2pZqJ> zyWH3mpe?hC-I&Lh+1mQm*edeB;!`qXtNB2w1G6iu`{dwMfylt6+uyt4O4gT}_n1|} zM5hGW0Cx5EY67gR_`SoRcO3rIW$ zpILeUy`P&MBFB8G<+`w$qZLjyGcB!B*zoK8n?uqRGq*Cro2lRQsh3{zjUZO_uZ%U_mX0Jmkn z)RdjJeNBjcv23>X#@#_sP@DCJaYUPl^h^S7E=dx*tkn7=Lwa3o*2-R6OQ8dy%M1!? zDqvDpoLb5u(1>za*t)l#@!{m4g`J51SR6q4n3Vvm$6@--auy*~UiE2Zp+Kp$he=x8 z@iFIpyk5a!xMMQakEr3&k9{6SOo`jn4q)D1@!PAGj&(%8a zaNG`VqRRoigR(Gt;oWWSHvEg^WYf2O#|wSn1x+{mk_LsnF{2W?8DCTQ1W2v)`AAn( zj{CZ9^Rz`M<?c9Wq1^>sX;?X6}THVPUBCBnI&EK6>pkt}toDLqsg zDAr)h-fw=VUTtiY$!lHR)y@58rE(X@dIJLbCR&zFpOFrUG;~iaGTPa#Jvse^ztr&5XtBf$5UEwGFC|x@C+vT=$vS zdv8-KfENU8pOKi{08bcZ#H~QIC*-5pp~OCFA9)3a?b|x9ocl>s>v)%~b$4rPaGS0I zXc$1D3_d8fvKNz7hn_!w(|BV#e7J5S=qH~JQ?L|!f9SjRDk7{MDtPtnIss-K{k}`I zBkfZ>WL5czN`aCBvd&wge$V}H$WptDIB!vM-br9pSjc-q4#~@%q`vt&EixrIXX2>P z*{^t?@=}P5=KO43IpuGr@u20|h^ZbCjjmNF09M^#A%>#u zOoRjMY;Le3=^n$+zWAlmOcRVME;~@?zP;<*!klu5tXi^y7`C(-E0Kp8?dbzeLeXO) zOAmmp{d>2sPynuB)^OD-fLH=m{lr-Vbmay1u9{0qEQA7eI*1(Yw(^--`>i%@DIMo^ zI$Q~;O&?YHyw!H@zVGl5^_j6*8;(EcClBRjF=W)oO@n*Na>!n;k4-uzFb{4WsqcjbU>ZPE9C z^djXX35M&fun}A=RGdG$!~Zr5zQlMs{Wiz#*!y{9h)P01QX+@2i24aqXWdVGf*YwT z2ejmdgqWFmIC!uQ5sE0{?GbB0$d3txoCW8Sc{E?q>y61@RBjH~HXoD)LO@`2Pc&T}9ChjAw>b~DrA(gBpCag!_CqYZp zY0Cg_e}dL`oYzAMuM7K)DI&BO;)gVVjt9mMV^Xeu3H0o?uhIk5|6)@AlUPZ)5qSD6 zfv5k-Xcq3rIwc@S-?6|aQbKjp=s*^->ib4KopT0v_;|wj&i7V^U!pi_ypB7TmRTxf z$|E;0FDuOoT@96N#=hJh9OQmeQ^S_jJ-7O(iIh){Ai^ZPmM7Y_0h&v@<4lKjPd}i? z8Bq9f@?@@N>mMzeAPW)N%GUCBN_n}^`OCs@=_F#JX`4Yy#;i1`M;0Lx65dqFI0N)l zim~`Mqf_+7M-7{7K@!UaK4cWBB4Y4 zZMG0VSX5z^_kf;VT!pe_wqn;jqw&Dc&PGR@e*#9u(^&e=KfPxumP9@8+v-}ije-r*9)|5azPH`H1&lV`DDbNV$Npaawyg-@j zQHlpHbj%*hVUpB&n>!?EXt1#i0E`8J0kCJHZ<(BzyQ<4OrJ_H`R(P6N=$G%or`6FB z3;96Fw3mo*geN!k^>49H2S4^dRTGxD)U8jm3zlirfc`aCh$e(?KCQ6EP5H8I?ORn+ z&F$xnWxJ(?0-h2jJ`VuEG}Z78xe9zL&N>#DrRt+}|qhX!hDVwNDJkWL`E?=jgteo?zo}Fv43$|!f-|@Z~{(iu81wX z{VPc-FM!uON_oT**bLx*?S8gF1=WNa-u@87R%+0&uJ=daYSZ5u{M zAta~zpfv)Db3>7g3HFEUozbwfQzlR-7W|G5pFy_OWY}ST`y&$4TZF?Sjkqyl9Ui%I zitKdIX`mM-6ZAs$g2t#M3PzpzrKtP%&y|3E$-{`(5ObFSqF31D%RFNRri`N#d=t(D6GjJ`$!lpv*^dP)Qt4Wd6tvIcpwlV!2(?d)P(CyK7BmD zwI1+31TCP?WpcW?QQoYXF&l}xA=tYbCoJH`hyTrR|F`#+ivA4@cyntpIS?p=RBEY*a6IO^ zK@cl;cBho0Kb(8u+(?)oL_n2O?6s>7XXlS>3w^=j_sWyVciWqE-<58?kYz|dI@?ml zRl6b3>@;J4jE98i^2=X?@VaxA7YhDL5(z%V*ndOhwi&Us_Hn(RpaDJpbKBx~jvt4e zLwRl9@K;yUVG3Q8pxp%?6dgr*V@b+#WKn9kvo;_!c|gRmu5sIOxj}t>SyEQkZ-4ti zoYx)3TTxk8pE|cIuzePA{tYNvgn*XYr_sM+DF2ebD3bm3%aK1moc9$SKXwA9*sYwg zr2hsTmT^iqz9ohy2m`qh2&%Qg3(OVn)Xd7q=ac@5&fB?4y*c@3WLF?c>kPp%MR z3BMJpQ}P=itmEY`x}Q~P;(qS@9!mIzHTnJ4&sjU$E9br;MHp3pX9ai;M}7xnYLD{@>5AWc2eB^k0wAaH@ik@lM0^l|vbbV~ zKcay@%R znaXbgUQcDVf4jh7O{ins;xdB}|6`x~LP9OzYT?<&V49xZUTQ-_L&y2Hs0y!LE1did z>US|Q^fzzbv^U^?-th+9)w`pqu1=#6%@r(QJ@`mfwTJdy-~ZpoT>smi`ya{X=JSBb z0Q67$2yGE|9nhVmfE;_jL`{^lvy;hm!n~?>0d#8s7M50b8OkeT|HoWdYW(4t3YGnK3#8EiKSKuV<91$n58 z?|5wuioG#cUXm}QuM_Oe`U=7oE%Xbm$kwJ%`q>=VkrloZg;(-f)klTo=jY$Ewz9Ce zjom;Y>W*jq@e#|uR#sLs6B~Z+;s4^=8{j>kjoT zE}N%4%mx|f>lJxm_RPT=-BT2Fy>;%~xyB*n@vaAU#JmrpzDKibX>A>PvdMNVjf<1; zcJtYM#TFLfGj|;`Y!acQ@dQg8a)Ev@rM6co?B)d%-Cx#OoMezcH~DR?RiNZ9QWy|3 z7BEAa1Nz2ZnB3NZ`uOZxhb(rtTc;3e$tu-Uu8s9moAHy+kB2lSRE{*H7wEGel8~n7 zf(3cUd#BtEKVIpK+38bhS%jP%R5)DSe$U4XtzqaE-&Vwpet9O1U$8%ZE=tP*LD?_r zBeMTW7XI_i^fzjlkq=oQ%{L$q>Ht()nR-tdX2pmiY4$7!vIjv-AoPqhia`QQ>U<#V znaxRdwhIdA5$v~o!bu{lj(NG3fR(Wm(qgAkySC;Ivz9LIhVUbMY6uW zjwv|VMTSF8wjmLdlao)Ti_E)|JXW$YX3G6m%V!WL;Px@(jslYQ>!({yJ$K&!kHz3` zp8tQ4>x-U#s@{;l3@L_ykymQXvH!wOQH_75ko7hru9EOh@WN~_g$^G5yO!Q8tjdkR zIw~}QS@YquxJA1SV@qnsbv!ugt-SdS-?uJ7)$6Vg5Hf3ymV zD77AHW;?;KwFF7-B)6)Si4tQ9q-4*!7>dQ82Ue3s7>zl$#lcwk`s!tPN(hC#Nh=|a z1=2-4w2&9*1+1`uFv5kG z?^^iEWCR2Sn}ZjY-w6u>rJv~2s_+a2pM^B%@%*@If4(<(jOr~1vXISec zfIB>tE5(3=|g%Rd0T`AspAX`&a zU46S*!2cM3h->cvUrI<#OS`F@Bp!Pldp65!K1$p8;A_TWqRB%H>!)o*R@-4a>rM6>b`?w)w()s+VK93EetoSkt z;0QWat(pR|?0)~|D}=J4c-#Q@U-(jz5nqz)8T6S6i>|_@fp%BB9_ATKUS8yU)hk@Ha;<&M5Cu!c55bRzv%YvB+<$baY zXZq_dfabwG{f*}#_lU3rq^{(UdXZPnHpUKteG7N|JUajat#O)1wUx2J0?3T32>YQ4;18)4h!|{d?j~a9>#XRPF zVrDxtgs>55)!)HH75LorJ1qAt2h4@ z4EhsBe6%5X_3c!T7urE|XBmWo9V-{{rqs3lD_RuW22)pSs6(o_I}&($6Ia+ zH+lbkX_GQ4IsO>a&-djp5$6sSI2bU29Dcxxo(?!kh+!Ipq@h&O8Q$*kHuT z0U`p8Ml-`|9bpCrmk|dY=HOQ1{YP^P&*iQebDZO#L+>Q*)v!w*w5l}2j^|-@JsWUX z-O;*z3xBsb4HDmu^qJ+{Bg#HbC8G|0lu+eypOB*NVaVW3>{#+*E2;Re05|`87`Z_0 z2wvl9A6~@FapCJgd~IEZIS|hZ%j_O}ewkKgo2gXbtwO<|2wL}@Yf~r-U9TUo{jc;h zNCfji>Hlz?XZWBi6b7-pJMr0w3v5-5$*yioj(d2(+`j zB+3gLH+ zHRNmvyKl~aigcnsSb^caoCMN%d_tsF-%`B6SB!4+f+!4kNZzWo$x%q#_wM$@oyBxQ zNeRmUD;v|#no$eT&a46qrzXBP?!1fuJ;>8!QI5^`KGX-(beogYdses1|H&>s&)%1F z@}SGTdA>acW#JCKvxIpU;d4V0KTU&}@-CpEbDBV*1d`1AZ+qOm2P4T^M&tpeH=C1A%$F3&wr8E@(SSo3%=H zJViBQe}c829Wh71Pm!N*q~NW&n?~{p3aQbU>eH^ zr<@Q9qK&-`D%^=N=!(TwbwfuSe&DAHZqni{1%=uX09)PmR-sT?mx`o5GKCLhGnD|$ z4GG!m6`DNw`8^=2TL8W`;Q9RApin5jQ3d}9*S2rmQc?dw+Sz%VVZ+#y7_;l(O^6Sc zB-)2)0M!I|N3U}d%Mo{5QC)wcZy*hYF6IS;GXqa1Re`4Phx~XXz1f$RQ@c0bN60|c zx#w3ddd}XvO1HDKbzJ7c&|gMbTIi|d`XQP&yk!sz!Vka(2dSg^aD#8en3Wgx9ux-a zwX~qIfMElnvc)6@h85-w>SkLczUF0SVl^KDFagrdk;hP8+v1d}@uRQH*A|=r!ta1& z!t#|&@)7%%x(>a@!!a9I`0|E~>f6`X~WQ?cR=2c;(_W+*1$I(t7 z(Og7RCwuK4HF-oL$N(N0sVKN4uvGJ31}X{2q|UUk=?YH!a{dHNx0DI4>4*w0acpdC z*4=N>eTD+I{}8~vx*io@-y_4F#Swb> z&!RRlF|nsvz*9H~+G~hTUsQ3dP z&%I!}9QScEAJ|bQ7Qz<~JV&N+SImuhKyypa2E8rIYm4f~UWVju)_x-DAn_2ITv{qH zi7hLa@%BEH$Rnu!9)M)f!*IPZGv@wS^9{@V24YlaT=Wtw{KU_ixV-HSv-FVpGxnGJry{LALoJ&fUL0EeNHgqH5_r8t-m;e^bF2wB-T_ zmMEJmEKUks9RMO6=^*GN$~3!^7MWV+c8lkJPRX?u4(62n?5ojTFnTcX7v=7WC%if8 z%pja^7zPHjG6ESFj9(rFvf^CNZGu8Fd+jaD$sh|oM|=%_s~uyF#9C94Vn(|C1l`t0 zccrX%c6Xc9RV3)V_t5Z}nHh4uc2i>XoVV0~-7>YPY(%7YHJk3Mn-HRy+=@^;s77~I z1TGt1NT}R*aysIm)r7Em0QP4=oJ3t3@;c2|R}Ri>%+Q>CD+?=74 z);#GMqyUMkuFu)}g+E_L6ugXV%e_@?Phd8f0P%^cf001Nm(Z-WpZf^q3zLJy<*4fE z;iOz+0X_KFTvT1!2qf=8()8nbKP%B)D?%V+0InU)`ve5GANlg_TY{?nE;2Gw;E2__ z`U3B|<&!{2Y5J)Tx4ccVBtg~-tFA$a%b^MP#vfHGRq7%R#}`>M7lCH99|<{#B-fj5 zZjIqF3|UO_=8?n}MojFtN&M@_2BY6SqKpZ0A&W-Xz4yQzW4kSnO#n6 z;s-}C!9o2sYc%1+Nu;(y!s<#_+@u=3rex97XRb9ccjXB`Vdx#)3Ei~IQa)m|*j!}| zg**v3_md`8Ixn(Xz~hyUw_=c7q$|(wCjI;48%7~#KR-+w?Glm!vB2+HhK|Qpa^RfI zAE?V^voFOn=Q^(tFXy2zHV2NGM~=k1)<$*T4kknbQ<*QB8QL=u{O5pbfU8{HmDRoa z0VH-9I*R%5;gPQHh<-B=v`%*QlP1<4fSaHRt?zLGX3zg*4|BBPfAaj>qzATk!V7Kl;ILatQ}`xY9(=1(P* zV&35kWCyS9y~G6S@(uJZ^0$Y~ZVl8u8FKhD8pepzV2%Ls)9et&$b51#!lB#xVIbr% ziH+szdUhvCl&{a9V%wMUSyq6B;PLS%ZlZ2$Iwj@hC^z`~_wV6 zVPg|!p5(a@y}h?S#!k8tEeictQ~7fxToEJ1o%Wsq(qu|j>b*E+0a^Ex`8A8RTZSM{ z*7$XVaIg-v&(tEHRwY@bJh{pzk#d8Bjpfj|%K@;PGGv98onMEm@<9aa8v>Bv^ud7T zpB4|OoC{eJ?h+IAHuMz^(+>@5JTTF=T}cvR$N26OeDtqRvw{h7qn*gfE`8RQ4B-0@ z*%urhFe5R{<&*(dc1f50A5`bxA%-#OAQwQPEe2i$q&H{5)d}wy80%&eT17v7cJLyA zxD`$z*`u%UcU60p(5arlCPo$#0q=#9;plTd@~W9qx4i9 z_fASqt|h|GZfjkd4fj+L;dh7>Mvqr+-{LRu-=RQmQVc;n`X+$fnhL(dI668yxw9h& z6B85jcbR1-LUmNs`+1^8#Qip~CFPXYL-NtZk!3N}cYG#0z^9a~9+N``G;-gJUrv5d z?mSkJ%>j;j85`Yo{#qZc(H;J0b}JL60kUW1tMz|;Om4nRaH9Wb`zAv`A9>+E4ZaUp*3cAIV*_}LkUo%t{8W|8t#gQr-Psx z@Weq(#$3%aRZrmoRF!Za2e2jy=Oay;OGZ+w{K|N>fHOqddL=XQaP(I0KuyiK4p1J- z@=l&@5;Qj&tx!0@x&Z4N0HPL=a9w(!mz5hi1!JJe|viEzbTP_RM`3(bdPUPJFd+1faBCGor%?{#jA}s*5}Y|Ng}p(fI9sF z<(NUJ#$BizTDgo~Sk2a0YkBo_NAL~-{ zudJI`1Y+TXbCi@5c8~GH5l6$es0^r^yF1EB(XS5GT;7QJ>u#ZcQSceB{OI%N4o6H9 z3jXqD#$A_u6gu_Wr)6pNrQmt$R<5F@Yi}D4XoUX~rHeWKDTw5mO z>u`E77~{b~rAc-20AWgu+Ri@ijxCb&cZY*+HtuEwxPdLof0h|lv3}U!US7Ly>wej} z+BaPV;n#9-`rp^M7xnvR6Kco7=_&{~P9xzj%s9&5oFy?kHtAZe#GAy}ekm8kAOl_Y z)m0nTPxOygZ&g>a2W&CkEOctE5q80^Npx1nH?&(VGy>fNvjoHh9Qvn%N!kp)Vb||* zTV+OKp&Q{pwu0PF-1zA+#D_BitXkkR&ne`)hx-O@Ldnd`^fxt{E14ay2Frd>)Z4m;3#yr!*@A3OjdDhry-r(yooR4e%>|~5PS*fIX&!erw_HYwdMT$ z{0Pgeq?aHv#Kpy3qg!y9T_o^x7dN-b{r#%La^%ta&;)920VsI2wkZUc$zHINp9r0p z!6X3O4>N1Am_!gLY0ROa$&FJC8^Vt5^nV~hfUvUsrqe${#hqtD7*i7u$pekMr@K5X z_gZhqLurxw{8SM=%2Z`G93{&q2fPL2L)ZE;=$e5N4KzG`*6d)*CbGTK@F)LepQ-EU zFmP(+z5#ONc*(bbfB+AW?PBAt+c*A9k_tUp8!PAaTOaaU-yPFG_SvgO`L?@f(Jyu* zB2XtssFRbCN}A1zE*$z0$+%vfYglwiKYDH=+D4=-q4CCrq6_4AZwMyWGc)OZ;{71$ z`7j`-b71Sk!%weseu$H{;d*l%k{6cc(3Q_*(2z&8u`Kk2bokU9x zcXCMSZUQ1Um|Ilgp-Y_D&K*eV<4`p%j_<6dJ<1uATISuKuxyr3EoBnVv)K24&-^Aa z@^bjh(E+*Oce`RG-!0uLes&2cypn5?n~;y7g#q^vmsB$sfS=P0xKiawN_W{QQ-Ua4FQ&@-9K`6_@R2nnKzZ9xe`Umx}6h z8?YD5*m)yCq{e3YH1`|jLMd&2)WdJD?ZB&9nk6bHSx5HGSJ_?Uzr3QM6d%2)*H?ee z1vigQ9on^p-gVA7d+BC-?nuck{ZM`yJDnDppYKnadj(u9MVIul$-822(t8(xdHb(C zhdA60!4RVvG!U~mE!EWA40Wxntu+J%WdOi_@4YyXt<;9Y4sSsqFkM~UA?ShxvDU$r z7xlDc2C`Rvyw2B4v=hzcSDF{~$8^w|Z$*q5U#E$RDH{Zb7aRAPm@Y4{!BtWeGWybH z&Gj|3bPLMcSNr=1X9p&{Iaj!tNlKUFE@b|Adxegt-GLlpd;cYZdiHwl=yjPtR%}P< zFCZ>_M2a^$szz5Sw6?ZdSzE{V*s=NTPp>X`<1sC}oxsMdTAelA-wc9?PhJ?h2v*D= z!$w#;|LSiwv<$2){gu(z(LXZ!l!E$&`q$?ZkF}o{kCfcK+6{t+Zwr-|DJ~tNZq+P# zICy9Tg$rcoW2Z%bfo%IfL?a^xL`VoBZO)>%WWbFKhH& zb;XvF?o39S@M{{8O3s_p-Vw7LDS^z|Y37bLD_`Hq>SDpWRFx_540X6wmDecmp(?$e zvzrP2xoj%9KuSQD6v)2;F3u$Edx_yDfGa!!^-zq z3a~qipSgi=Jf1-P7ot*Xb{@KK zs@tttk&(Y9o%1^+mvh-1_Kac-P&~|LGOgPiaKzh$df|fTnCTXBZt9Z%+{gi*H^F?= zc#Zd=)`se^0>T_)uT)EieYJVuIQ}I7iEDmBLBXK7oOWmi9A?nA;Mjg-u&)u%AX_-K|>k zq;KFhma^w$T)n9IF0^jTL-61Vt6KE!_H zjqem>ZtplqFgc%*tt?b6jgW_LW|Vaui)<{*QeGAv%m=H1zJUDZf-TLis{DEe)a|(FR3_(eocdwpiU61J!%%hYN(m`JOshtLO0JYs%!T5l`~xCR*@J037^5jRLNYQk zdMzafJ^x%>Q6bC?t1}Q05gD!VESB7Fpt5QTq2uQhiXdVE4)!wf6y%I4kXeKbZ3jm?MT1A#zLbs2Xp#`BN8gF1bq{+g}_vduqtOh z6b$H6{cNL>XGqcZQrpwZWK*nc-fQ<_*~y6pd+xH4f}5r_e3N?#7Ng3#6wv76t07h z{w1LLkHYg&h~!)9E72jRMi!iDAY_-)m27R&w&>3FXTd+I#CMNG<}cp0ZYFv3hdaZ_ zjVf#r3aJtn(Vs3bGFl?2uTVsN(9dkMx+=lPPl;h3XH z!g(aB&x85R8MvC=C==c1f=EU@E#oDAsHGIF`*E_G=hNA(hW+3gmfdKph$>x4k67j)X*geNaxTg(hNNe&2P^=$2j-*efNIn z9REWRoOi!_ul1~FJM^s{|suwG;oAu$2RMwlG(?>^bbHD$t{PzHWtGQ7WjmhZi zLu!Deu)M3kUqxuW?#!xaW=9dhf_sm1-`ri@~z?n#mqDwo~0{2dpn%c0Ufqz{-_Q(ii^Ta{9?(4 z-!7$JYUqngHBAz)7EhQ657sse>o{WWaO{{TzxuXf`ZqjaoNqK16piPm?d5{%X=2^)fAcLiz7NB_zhnQCsZq%dvpW)GQi`>35?GtUz)Gh2AM)`X(EQ%sUkkxN+>UTgx4|UF~l!uD%&bfI2HcZ zI-$jwfm_|u*|!G0N%qx&kD-cCUQmi4Iqc`pLS}q2zTK6tA7-D^dzm@5zq*E% z$9p1{zw?iPHAYP_zf9v%uIHgrxbYNE^<6>d5jr06V$H*R8mT2imT`-YRf&51%*4c( zzKZadW>y?8vZe8g;0z+VA3r)l^17b<5d>_j!QLH=Yp-OIlat#AYG7V;Ohm*ik?%-> zp?|>JxAO_-C?)@SMDy9Bw=J$wfvE|lb=E}QwsLD{&o-2N9`0&#lLg@(KpRPJ@ik{g zOg%whw}oJ0k$PntQAjh$e#XbE=;VOrOp>7dSKk+*1$4R#*^ZD1xceAWY?tlPdjXYb z_c)xYm|eQS;Gl)>a&=XwrP8G(o47KU7mql(+6B>x9z92##jeBda7Oo{6(`G=eFVYV zl?~9&!`r*&1|OVXjPIh~jhm>ODEVNZoJi_t>UVRK0-K&_p19&?a*x!aR$!kq!G z!*2jq?Rdqf^*-XRJZaP)fY~4q6ZV;0-{y^WcPDut4SG95-jcC7VT)N!*62z|NL)rn zm;4lR{&u zSP*(Vok%`jv3R00T)T`)Raov!2{9k8|3aQHIohIC+0{ni$!`@;*wj^8J})YiQvU>g z-4iOXdz-CVaXLBFr$QBiwCo#M;UfgGvd|LQ>kx6tB*(}q`2FSkdr#MxU;Yu^!^prm zicOQPx!~AX3L#*-Smk76Q@}_Sjn0cf^{U+_uH--Z1<)Ob*{3F%^=O{i?= ztIY!SCHnHos~5hzW+i(U+HF6~qdnDws7*~#F%H}C1u~oCw|E=_%Qb}FS<$a|HFa$q zo_^Zh4`^oGKt z>JCPk)>;WSIGlrZddJHsScmkf{s;~mniDvQMp#nZ=32$(<_EnHz9xtCz5L+* znlxoO|F5?DtPhNog7lORdq2`zx;$Nv%{e|QshV>{5u?qTSx&U??d%aRI(?$}+Cu!>7#Uyu`f+}yU zAF!)5j4ofe!FBz?vO07BL_me6uCNf_0DuoKAOl90j$W}ZUaS;V(1g+d zPT|(A;IyxK8@C6W*LAsVetmcZ>c-Rq9k@jVXnSLFNKR7H)hTr6`Cm}>)_I@va^AgJ z(XF~1M!3RwqToCwt@PAya2X_JL@7i`6&c4rp_u(K)C;x-7n^R`!geU!cuuMnvq2-z zOFh68p|F^j64JRlc6=v!ue!W)v$h!h^{stx9(g@VALgx!zATdarOH{M92;!lEo6q{ z2#yA40|emzbipW*vFHm47B7XN%0ACnr8U{t#M?cN+NJUAO#m%!5(cPJZ~{x7&6)Mo zpR1X;yiE7U`Y;2!m>WM+Ey1oYUCwPWEvndlUS0?Y=s{mO)+q<@qY3DfGm&s1zYUwh3MNpO>YtW9;@(&a%7Y-nJ*W(oNZg5lZf^8sG> zJ;p?{d9+Yg>xhv3NS+L9wadCc#g)~*$cWLZ)yBiUZ0kHO;Hsn4XX4`GpF&k1t6ec7 zR+8VuqulvH>{Q0#P=zsvUqUSPw4z;~4>y8Krz>ArW zs@c9_HI#pEwA5zs46$HRe0;olSI603fyzMvKEc013A53UsMaja8v55}RG zSYF?dJlciOD7r+WQZ9aZQV$p8m1yoP(ODfIB`fHemYh=B z$6F)C=3$s%aIjXZ9}^RkXl+dOOQ7VV)13K>Dl_+Bb5%AC`W8p;oyYTexbDhQZcacC zx5bXWBD_`5q6dV_c~ve|$+B@#R@IwIb=B5pBG5CYPMRkArBF9f06N`PnhAU{bYQ8x zAZ)IaA_QgfF{?>@^;G`rZZp`@QS}1yBHMx}+%XY(LxD4zk%!|^+vUrZY=j|1S?RN{ z+q-XW#q1C{>M;_Q!FYzlL-Rom84h`5RMkmXCNyCtPF8t-ELp)g0>voO1DKro*+Oy2 z3;i7ud{L9yixLkXmkiP=;wu}%hjBbHr6kj)l5y#9ZZWCby+2Zt<|UBFFF!jnUdqqE z1G-EdojEn+jMwqMc=4bkfy}7t?A4S^3DeMTq`uwU`vVPsb z?-HLbk%BS6KdaBcKCI#sX4D+DIo99*xLRAexk;!$2v5LSxNytgUY3n&m$*Do=DeV37qk(k+#{)xko{%e8_>+*3xJ~(B zkC}RV*8&6D<^wPUN+!wWtB@)h(dlj0YXeFGMf*>#&8BfNR;Z z%{Al?r1g&?O8aY3OLbrMt*3fU5OqhBp88LMEb^LJu>*z|C;BG(?oRI-VOteyGTNH2 zwUG6XAQRS%&`UXPl^uEL+yncVWT}ajw6q-IDe)x9Cco{r-nA>!=JNVNeh*=sg8Z4h z!AEwz=)E?=mI8bbC(cQ(D?9aa3?rSq%{^N^`Kjco`RwndMW6tJ(qi3l;~uQZK0o7* zxWTE?Vj*Q>5#PS$AyZlj&+j&XY9}VYucxxo2}|3J#>s)pexo<;V1t<4ak`?GnVgg~ z4V%`Z+jD1UXK8O9G6KWV5x)f7AF?^M%Zbk3=C`E{r&YM1Ajz#aOg8 zR)`>OU(!xy7cWBs^;fEsc!SSN;)Iar7kRZ%?~hMIT#A>;%N7cnCh_$|_H!%dKMF&) z<6roS1sasAf;9{UKvZFWjh+c<%u&5f9JnyGu>~l%BPoi0k-K|c1?X+UMHk1nYO@(~ zCgu>tgsZ(ztd@K6FyO2?u#&{YUwD-8;QxUUo_k|j%CuePot1%a-t=HPIq2c>aj5>$ zk~*L@)b%}8Q&Us>mb#{G7b4X4ov`P7 zOFM|sn=kWQc&2xQ&ZjmAc@x?7v+s#sv}dtjO_N;c>wUEx5m%vQA{ywrO^hz9)tdI! z6Wl!$#MgWhZ~j0rT9Ud!%?y!4!x6SwolQwjxtZOm|F+#0R;bLjXL;a-aMm&E1qA;< z`I#iK7ve|RCs{nBBRv^~t!`*d*9uKT!_F*u$30`c%Fd#m^fBjVx{7nNNHr(ipyQVVNoTQpSiK2aFe0iF?F=y{#Kl`lN-UK>)RK>C zun4(%b-!SArB#+u@!mea^RBXmhm+_^Tbx|HRaqv8cSvYv*q&Q@NOgqc{--0}j_u-h z|8Ie4_x|$>w;*@0SFhpX;_d@$+O#lmt)Nzlx+=~j10N=RhP~UH9C~_sOs*a`)zFAu zEuW`xzS{4J9?w`~|i1iVTa|lZECWFhqJoBdo+A^mu>5hKY`$!w^b} zx9{)1eXt3U8)_XO^iEcuCzjPPgntYzcns4DzzacVchbIfJ1?N3S=3CcUHY^u^f7y@ zjJ72Pdh59*UlU7Z9$l8@;hD3|-c%`SWjwiZ*gr8W64s~Bt8OS0Lc!|5#Kf{%!Or@< zBcKNA;MOID&msF@20<_IfZRq8_x7d$#cC5Z4r*uArft?v&hlnMLj$Iff)u8|cFkd} z8Vwe#9TploannE{2lv;6r{n7eUQUG-SC);;LAq!8p6y$Rc@QsMZ2^8iNUR+_rJ{on0vx$zQG3q4;KBd z`Qs;vD?A4}#nW5d2j?Y=PagP!u5(?_c|wAq{bFoRTSg`;xWb?j`?sxcz<^;?a3$u- z(sp;_?FS|%07)z<0@IIlWD*xA|LfVC}0!F#_CgS48f{eOClZ|nNkt2YCFEHZve)R&L6 zF7Z!X=j5c{v~6n4vz<#d1n0Ew(?LzqdTIdPTT~Q_?rPED7_CEk`QhflhF)&_$aT&I z#tR{xHEKZqMEnWb(VF;MigHk02BzN||CJPvD3+FX-1*-cKgA??2(QhzC^r3m`Ub&u!?N_p@_MJ519(sxTdIzL~pG`f)Hs zD7Eu=;Mu5Nc%hg?d1;V|x*VgsL&1kTG~Ea--?Pfsrl~GY7|#Vq*(KgxY{5gyhhnz_Jd{N_Wt=??EPqZ`Nu> z`P82x$Yl^gS`}{AxocOs6kxi_6yI^Bt$jV8`c5Ct$569!G-)bwpl5^zKZt7E&u7ML# za`~jOY^H`P)0{86COMCi)m*qMx#u*BKu1NnnGaDy6V*3c_818#Thlyw2Bcu+J9c$d zFdGrlJMJ-ALHGTenMA4;_ds%gUkt~?$Ax5Nv4|)rOui%8?Vlo2<#$=B^7CtEo-7(R zFqfdFQv~<1nf><1qve6}WJ#J(uPX*Pj1!Mn*N!_q69rv20cG~);?23-o0oq9wDZ}! z(LFqva&aClXW3_RrS~XK#Pn=aZp?t7TE(#W*_+k*CkK)m@9oD*taUfMZZw(%3aqyI z2sK0EHfF8$;kwIFRhCWj@ZOhX?y583Ps z%#^{^4?JSwt&Gc#K=lzXKd4j84&^3>nDu_n6i=qR*B9#__npns#{<$?geu}cAcZHNs$=JAle+1?g=&>7BE<PPw@r2*)AA<2LPM@xPI49wNcq}gS;#jy@EphIOI%$q zR*pI>()GAcfY%T`znh#}2jfgi9y^$KG`l`MYtgwhZc#d=TZ$ceL7qNpAZ=SVa zxEF)}$96W+y}EwP&An{Id`BJY)vH$=T`*6iCBMsh5&)QB1l1#5-EkfGr;5MuO#ka+ zKi6T6Vzii4)G+(N^R;>uCmc6EWWB~=U5bL7k^+63jI~A%2&}+X?mBZIUz-b}yp`tO zy)-rru36|^Dx#Q-h>%CkuXUOrqC%L`WUVg(KmqW6!2t)E(G3op*W=3$2$R!8?8`i| zcv4Jq7!G}3{}kGyfQ~9zy!b6P+2Wt=ti9@*v?ZNXR;Dqx7i8lhez1eAq{2ZA4c&!N z4Ke?M9jqaqR0NY?qdm?dQd=81u-*?Ze2JFuJ@+3 zz1!^Y<m)>+yQ}q40;#Lf>56nX={Qx-*sm?#_r&<%nzcYy}3wJ`<`oUexk-7K3UAf zGFxedLAlQ@7LHmK&DDMS7OkF>0?c1nubhPa_tEOl5T^|v!|`>bXcu3z4D)hd@Mjc3 z`{TN=(ysPUHYXxd`}$npxq@A3Pz&onJlNnT@)y6Ukf`G6Sq+S0{gab0u&G!y z@nZH>%Zr)}<Wgo?N6cB2h(e)OZP5bxZ7s#@L zS5HH7osri7R$1CGg^KpZtH}`(8U4G z$K3=xBld@UA<+1{wOximrRT&`kNCA}(G{C=GHa^!j_Ri97vm;_`&G`%vkfAsGDY1{ zT(IBaKrp&KJgB0#g&i{q=9OrK7upG1IcvGYhd%%nf!jvV#lBcm4&LU*mzFb2B7VZ8 zFrCbN#V zkwSqX1!|Zf5tpc`k%*&9zX0)^!aBb7E}=>m6gnKffeiee`hqi{L`7NMoV5*>D~@k>|!o&nzkKNm!&7(p(~`x0Gi0}u#C^7Ahp_S97WMNfBkcN7BV zr3#$?U8o|+ku1=I#nfi5{EDUxmR6m05fiuYzob87rc!IKaGRkVS7<1VBh=<{Ll$1U zzy@fj7PqpkOWuMF@364K1ma0W^PR%sRpY#Q&}(ze(_XJc;YS(Yy2gCpw;`2tpc3%> zg1n8Ox>JgVw)dBj$Hl$*wpo2dfx?YZ5wdc92QfI9flcEg3P{1t>#MP4#d&%1&STH~ zF@tHAXJ}7$KQ#va5^s_h;{_55WPyFo+4HUzTrV!%iutHFL6vIQFcID%9Ryl@%0d2P z(+Z|+Djls9$9ETFCA67`cZQ;s<)dW4iH%%lyFQeMVj^3DHo*IqSb%5&bmAR=8$~A} ztt@)oyrcQMa<|UoBIpd&2yIyIj3N+em1{NH7}#cFv~MFNC4~Y*HkoeSdO&+H)SNCt z$N6{A`2UA@!Y%=!+F;k@ZftYqswVoKU7wnJZe@uVT0fESp@JC6mIs#%uY}d z`Imrp0@G>5C@p95tYQclViOL5z<^y{CTcI>|9g~gzv3@^WB2~m&osFuiCZ*-7K?ch zB^(uZh1BNq2JuwE0gdFVg4cWTtlG_q5gnv@P+Dr5HPvy4w^Z`}?THx38#A}LC{LD; zo~+nBV_#Dm`POqpUDf8S2qI@mwy3EoKFWPWThnIv!Y$6vE!L$iiw?T0&Q)Z5iyIu` zsXWk*fx^byg~LrbU-`d-@j4?6ffuOu9Dz{~xE-MeO+D2?tH5pjBevtp5W9rLUbFXy zXva&JzsLS>C*yBqItUJ+32qP3q_EE~yj)JWTdRLVW-TFqZscQnu>BH^S?$iIqgFYvPETaa?z} zvOllCduBOjag~Qr7Py&I94q!JF1Op>vTrBwWMuCQ3?EvyDEnQ7fqBJuJVD4yw^@kd zq~f!?uYa;X=sDo7tDA7V2RYHiv|2n%7`}C6{=$~{A5IldoLtrhpT$aM;)T>2#T|C<`e!)=e*X8DdeA4$f$*(Lagm-HPBw+R`qDa) z!j5QOhTM(DB=S?JEnb?zsRt1CHsc#CsBCKv3Hd1Hc~%wjj|#8_l|Gp|W4d_E~y*0@>DBja%h!PkK!p zVbwWe)R5Udr$zu+nwyl zlo%2hnD)Zst;f2Sis?_kXg5(wW{}2}@T6<7tY$VCu@$S?=0%PgdIbJ6UUF8 zlln)}7CCutUi^Uv&-wwqug?Zm^@Ie_7C5brEjoJI*%i`wZa?z;N>BR;3?gtuCkQyp zD+q49tT`B8vv6_Qt=O<+Q&CsXNl6g{)YBl*2F>>+{<-SrrF(v zl4d%jcHDo}XaRQU$UEt@uS zym?WMF(+F+2C#}fLU*8hRprAv{ugf9e8~Jm)pGjmTohQKt!3BcAl`}Ysi`FO%9^cC z0Wb+#;Wi!Jd5DJuA+YCo3-MOfv~lr2%%9BNNdGl^sZE>+Lwi9wkHh&=s1oR$3( zqhC>;_L|WEwKvMFc1s&V?(K+pT@s?>SEa2yS_7vS1nk#nI*X!$kaLo3wA@%pUsB(v zb9n+reMgl$v$#;+KbY6T%;l-{mMn^a({?Frlvf_*wEClmfTIZF4%YU^;8N&#rE>_c z0|JHd9D`v7Hl&v>fj@0bTp}Y|E}u{Ai)7LMYAYKrd&G*u@;G4&vu>x$8f1UNa~fc4&?T*R!;(ZCTW8 zk>xn%yOjnHR72V4Gu6@U#*glsg%5lk%O>1&zI~f##t&+AgLGkLZC}LAqH7Q_Jn7aD zJ6uzChfaamdQV%zzgf;At0|HOylM!BhOeq<+#QA-T`1e2cxr)sq^q1^$8kAOz5a*q z(8u)_Aan%X7qE$_*2`u-e7KGrEALxAl__)4dZJ1b!24!Y1MTs8ZyKeSklXeGAgLx6ZijqUylj9hc^ zvz#`xwqjn}4Z3ujS{NN0&pKaBrXQVu-x=oqK=a|bzQnh5v!-00zX!4SdLBq>(AC{5=*(0)4@X1*{Bx0@2 zJAER#^I;cz4CQ-6vI_CC%W1jM>m}2n?9{?NA9@4$fzmOj(%GzjznqF0+$GWNKG125 zpBwZ&SsSpQR&rt95j81F=tEf8!vcYFq+*)Qp)CNtToTRk)E zl+`;-D%*wLV$t18mJWQVF6~ib_dd$<;fPkF)D2gi9)jm1djq8yjdSxzIqL)ce2wL( zv1IPI))9N20^^3NohHCyp0(X?17F*gp`coXtcI(P(~`wx#zob3E$r#y6;AHbZzKr$ zj+mZhL~_4?v<98Z41LtBYOjIn$cj~SbRY)imfmcZUb62c$McYL)`yg^kT?UrNgUZr z?G9PK;0`7xCdZn!T+RLP*iwJf7>djt9n6{P{j!%XBEBwDvY5hB_p0f1W%`~d6 zT<;n)l~Jz-=HM*E?g(Ns9dv&I89c^vj`H3;pcDxRdZ1{hlg{y5!_U5tp#Z z!82Sugrm%)b4ZlB_MY({OA-!zNZ0x3;y;JJ&77*Eu+b{R>E&IIs z-by^13;g8a@Wkd^r@q0#`De%N?Xn@`q&KVikp>(HKQsNOeTbavPgL+F&+HK3Hg6sp9dt)aTx zM6X`GoXj_2V2ibkzNb9r3QUQeaH>agZI8H}2d+G}Fj4Fa3X5&Onun2_C&iK<7hkvP zaO@gfbXe4Id+~rFC^?H z1A|-chROvpviYq0;^N{ombd;%y#DED>!h(jo$ls!g6GEXZ3FFt!#lQQUPdSygR zG_Pqjps8B~d9To^%(2g1QAUea`bl92+AH$Jb>O52D7rid|8gzN$S-z$;jU_5`sCqs z*$%b2qRq*Ako4jHNX))d%xQI}$_^O%-qw$|#z_y-RQuiiMrQepK10ZyNPN0`AUXuf zG>=M;Vp?|n5uN^eRQc&A>U3~1HhM(1{({QP$8w8lzh#DI6N~Zp&e>)gNIc@8s>dBV zdIUsxPX1zYCI0wi?kH(Zl#xx5gQui)E>B*IQp*Sxv{QlpRBNxo+ds^gVC^O`-EM6g zwI49$rv?2xh)m=Hn2)g}YhO+T@$5K4FtPxx23&B90jh83e>o$sSX!HNL~5f|_8j&w z_nd9#LRC$CyGd9itNf#u*r!%naZm|-vRymZfgg>nb$YOY4i_HxFn-%oKdsB(ir($P zbqhnUN8CD>fx#==iZ8v0MU{evIzFK>{f^}m*@^%=o56}PXGRWY6Fkn{E|y0m$y$|M zx&YwqEA&Zb{LaEBzlZYyMb5bb5-)^E#}A^hJqUnc`O@pR({|9 zU3c~G8Sk%dTYJVS2R_HudJ;@|uPlc>Ygay3OlxyFBuG}J=gVmE1dcaIj_Y%guF9ZQ zE@VolIQ{seR;fp_z^-{;EOVoze;1`Q&kqPVr40^}$8v|oKiiK#9^*~}R<*BgcCM>9 zW)0Q7M+v%wxB7;Slri>o%8v>%oB_tAqo!V=r=p+PKMp=#eq{$iUI*@4UT7z9pB0Xu zRs?5BSUc|34^Tb_!_<)I;UfEA^rjEh>Spb-N>I@xhuZ=2Tq{5}m4B$0(lDc1wo5I! z+m{d!B3$ljw?4H!4({ERbI#FE_!-;Cc>3|&90c5#{ztNcJ6R0 za9;HH41OW2)Qm4wS_ahyuwUH%O!JjYNIUHzXVN0C_g!H_`$+S&_7x z&7DJIvOlP>=XEL=zBxea)TFYqaw@pY$8NxRw;rFZeTYIIj3~zSiDj2#_z{Z#*F*o` zzma=eanh+5$7=*o5XN?Qn{$W1lIlwMHJTw)oXvbmj>9j6IW@k9>jek%AqILf0xwzm z(E;amY&s6OLguQ|fb37Z%QcD~m)8^zgyCgVBn_{hHRlXk7)CFTH1W zwx=9V5PoN z&(Aix_I()9{8!m&yK-R8Lhx45@=yG~r9g zYx7L7T2QomjI1KgBkKb0Dnb zu3uU>{r#Q%9pLc&aQV2h-r?d;N_Dis)KQ=-gT@bfh3m@LSt*`lXRU^r%i!ijPOgw) zIa*v=C=YB{*;Xs@`65Mg)upiLe3QKMGI&KcSFlK_o(^VLaapmp_b|}!-sbLcqf@gt zZu&{)bqugu_=zAB|F3WWgU^3;>TIllC59-;wwMH6lA+B5eEi^U3;Jzy@nc=-pso^? zTKZHXFc?P(jBNqqFjC#?mXATE~mBSyoDJXxJVKB>^ZxYl=6K+38Y#?pAlCt{Ua~?bD`)4}lo)3!xtP6M zt$vo+6dcgA{pKMaj_tO^&-DKD=hT=6;!KX1!Y|5m1owQtWQ$n|RcQ%?9Lvc;2}>p) zD~jf)r+;0jf1_^hFy=D4t*xR1CG70Z7^6Nn;#_tgNlop!3WdYR$*(!;Kx(^%4+Pzn z^(ziv6e+m=G{mj$9SLa4e69+AdiGE6@xO=D9|`l%Umf5U7js*#708XmmMXn6#$Fjr ziVK!(3Ph}IL|Lkh%#%`GSxQgwW+SIlI0z0e6NA_}Us`h6Fo$sEe+r^ab)ic7Cs&P>=S z+vk)Z1(FFzx8UYz_Jln1ZC3?|hntR-6><~pyz9a81HgjE_o)RLpwSuYSCt1Y5=n5V z>cLy#p2(_*R<_}p2tH4tme-l=u?b$UOwx3-yOV77*+4lT(1#jXA@CTkXoIX#i-kYcUyDpwG(oytHmL633ji9XZgV$p5ZFm)o}eK zJ7=0cUh9^{D7eO-PRw<&q+Qm(wpRZkN3~(8I;M>{eHFJ7sM3N5Ww1>0D!w8T@s^|D zl(O$7yCr1qcX)WXUmdaEblGa$evI_67;IBSKR3#qSdEOC71q;}BApiW2JVLM`17iD zV}sYf|GXLc^I!g_X!&3M9Raq_&W_V;!bmcUE0g2GCnv|dCq)M&7lp-QX4vfgM0As1 zzl-*=DXq%ZD;L0z4-rF-d5f_?kw+8aHe_S@LJ;7h{PK+Ur*k6&l>gcO2H`&;lz;H~ zuHMl(o4w3be6&De)uElXyD8xnBP$G93im93M`EC=rwZGR zzN63DF!uAingYuhVB;%{8y*?C^f{D1J1%eA{{5HBB-GT< z?QMs~M${y2{6R6_ukob$g^F4kMmSOJ#g$O$Wo1_=jjoL;cu_#O#5FHoHtStr=A#Ps zS}w21d)UP?nxwEprLOB~Uke@8$Cr?LiAe@vlj6ra{s>%tm?^mp`?G7qb2-`B#U>pT zP@QrmjQtYaj;aE#2q}6PkJ#AQX#MSh zqnGthw2qIDV@14cYIfkT#U@G{6*IGibxBk7&#(ON9uq*nFU46Db9zIC`HICz;TaxP zvCiqVQFm3r%d(K(e6{IJu+sd2nn^almzJ*Vj zrx#w__$h0P8W@~7_|{sTla`S|VT28hWcn(qsu2nYTrbiF@^qKoFgGzjdv#9;Oc(&8 zsU)ny?7(Td1QxOyu`^1HAJE@k+{M=b$y4z#dJo<2h~e86 z14iL9T<)z`OxOD~jJqB@Qn~*cDr^3utUKj1jVZBhJrF`GYs;_5^tyqbCELdPU=(N+HaniQ{~$d6?mGdr6TulDzd3$nt9}Luttwi& zT(_L!WA8k)2ROAqoa)@}=oWpCeR}XCgXiKPpqB6ih2Zv$E#7W*@>3%}t$5GpxMVfz z2g`c)!WC<*ox_V=w(g`)3nzgFcR4-IT1iiD;<>ZY%X>##A}2+9UVFyX`%5{ocb|QE zj=q29r$znSFQm@kw_A)|_TXOJ{?&Svg_UK2=Q5qobS(SoZ@LQv-V`JWPY=Area~=P zZkl;+E+5zOQ-a(MW3@EYL{0R$kqP5>}yEPWv}Ra$!l#fWFH-dEFE!*9ts#wme#Q5 zk8q^~OLC|$v7T_`3xPLd_GijVL=V|#!nq&&m;_edzjvC>H^o#zu1Uh*{L9c;>UOp* zl{t99%OP810dWXs0~MtM6=Q9Kr9J1L%yI;Q;0eymSie=(cbz2#^T!QK78<25B9TbU z%0JOFucaS z7!|x)wI!b*>{-!exwp4x4vek=AtCBuO81KwFD_kJA1+k|1_B@;P*qh;d;k77RW-F8 z*N*ynF`?ZlUoCJ^G-Un>)_>jo|A%c?*NJ6}xLJ%^w?N*lkR>4>Ma*u44pVuh_%Ks>D#d_0tgL5+TfUR6i0Z+W6#+&BI}W-0XyH(Y7)8gZ)j7>}!Sy<%2 z+^NXE!-85t zNA$Pfl*~c{_P>d5Y0SZJl_{DwnI*KL-w0^Yo_tMmU=pL1HeQHZugz1gENWltD!wzvD}v1 zV1m0Ota>MQxvGuDT0sE40VXX}QqN!eset&~S?kWwSw@SuaOAp$l}Mqaqa32dW^!^G z*3x1h5m6&*PM?7)pG^5;VKo#A(*c;Frs0N#vlkI+rJ>Av^ujIEcVU`1KpM_P$9<2F zQ#oQLmli_hytCu-6WMYKt3d^f8hN}^?+w({BXG;8SzR`*;s=;XBfY)7%#(d#Vj}Yp zHQ*<(8#HVVrmi?SK7gC|F~}%a0T~;=?bLGyxmcsD#B0Cg09a#LA+7jbp5&B#h4xM> z3m^v)N4=iVOImd9GKKRUa{zm9Ikdp82gh~9`NREk0lfHWg>xLhOGOQ8IfuchITuFq z{DQ@cVPJIFhYG(rdp8^cTeG>ZJ>JA#v3zP9?2qRI?O*&H4KZ`;+H0`Es2LH>iH(a2 zg2~_-a1}{KWpAn?laAhITJKzNvB$wa@QN6lKQt5x5jieP(+c<}!2SErR^q52BFqm$ ziQ(YLBY4z?)1cM|w!3`c>~6t;{8Mr34b<~KE-uJY9o-=lm<*)W}wWa&hfVwZ-l?R!g?Wrf7P8=6glW`gy~P>bE$!($$e_@p*#`rc-_3ROg+_ zoQo{c>o6P(uS``u$v3cot)XXwc=fn---_@)>|?mzGQGYvs|4E1KOHx5%J%9bw(n=FhhaJmF)&fhMslM>q-bY_-$HU!cV2V71x9QwjzxSdb z)NgKWIez(si$EaCOD@yUfOhG|$iGm_~VZkgd zEe)gQ6m=`ii6~_ULBs(Ji39@1GA|n&Yip;y=H_P0!`+PlZ~^(w`yXR${$6UVzLaU5 z*}X$=SquV0LLS@NEc20b90USk2yX3<-#!<6mtPrB2q9!_cQt1To-*2=6G4h-U7MFm zsj_=IW;}Tzb35Dxwi5dA=Xd&@-2#)>Vpa#h3?|GqN^;8{x>IXwFfc?$a(H++U55PN z+S%6dH&|@ASMFJ0##}8wL&66!m;dbaw$CfrgHa;jItUeTLwr9Fmmx+*+`K}!q`{Ul z=}LqG_wiE6sC8fXZRN26*BO8hQX|>)^T0J8Y#<9kFhULoRaYO+wvgP`(m)s0z>B{_=%BRceWCq0TTGizL( zL96}{`TtcC`HCQBn{IL*+n+D61?b)s3jnd~Y^Fz=9k>W?wH!dzL#;aS2yoPFMKe?C zzg(i_)_Rou3^L9o+7(E9*877?DTrSEJ}E35Ub>@4uL*)w=XTq4rWJY~3*mEtt8@kv z8yY*OGQ?(AnfqIMEg(;wF+4uhM(w`FJB2w@r9Yjpq*_4SFWGFI*6EVQ2`9PqJ~Wj5 zxvp2qc9rWEmuV-J`fi7*dY8A!N*m71v}m0=mgH_Pm`yf?XT(wO3^u5a%pT)Y^Rc<+%~Kp;2BY49Qdy?Dx{i;0;pKle~jL-M}v;^N`0j}vRIX~KMA`=GR9y1!_)K|{?Z zkU6k(xliupSwBmiFXyBw`BatP&zILH%i3%z-Lsl8N8a5J1)xPKjWfS>)t}@pudsOq zrhoIHM-$W_k=l7SPkRQ0n&CYdnpr)WDZXX1qneqkKAmSxIK6E?4Q+Sc`p^8`3PvVZ ztw#)T0v8UN@}F=I>KJPELC#4=cYu>6#Ok`e@F)aaG}f){6GX{d@dBa;Y6oyF2FfXx zW*iMJWfuz(-p`DUi*t9F9By2z+6q?7)tJURa{U@A)k(92u%~~3sqqGYtlG`7_L=g- zhi_=W^%I_e4I!B+w4gxh z-+r;(WIiQ3YShB;Hb52EE?k6{#Q2Z8Xu+zhL32wkQF`AvH#PMr-cc)Fh|obFYPBj- z)Yn7DwxgC2+aw*KMmaj^-c$#u{Nhhpu`xTY>Q^5<7olAf^zG{iiySbq64*>$ezZd1 zv$WVVq;zv;Fnq(sdz3cev0|SNofLR^;Gtj{eVW<4V%a4$db0%fqaS$YVG@fScC+7&kqT=^n?n5M4( zR>BwewQBWRV3BB-W(g`SE-tJ8u(LJ^6FHnF7LwKzarXhCd2uNRre^%73smH^@8TB3?S-mIG>*Y7QirNV;aH4`vlN1ye6a@WuS8 zN$_(H_^&03+fCctl_aKJbA`uwOL3NNytvMU6sPG|nAZB`2SFm&i;}C=EQ4_1>)0|-W7xwv`K^0cC<%~V>Ho)VZDm*{=GSH!06X8qE@mGe}#eGibeu;^83EDyp4 zK@Q_*(*~DHGqrV~lhy&<*2_{lmXgk}cKnra#<6;{l?nd1;e2CrGdhW0`X{>LPgv|v zq5gkb{v;Tdq-cBIVF*0OAc4{H?EybEnpnxF4>jv%*k@yO_PH)(i{;gM34-1Pms(_M zqx*JeFyqOL}AHM z=`Lv)x;sZY1{fNohoQUqJ>%})yWn^4UHAU{3z&I1pLw2>@AE$A;0syhnJ$hz9JE5K zDwR3fP2Dd0H}UW%BdlNG_%q}wYkJ@8bv9AFBy!i++vz3vuARIXK7bfbbCj@AUn;oy z=*_9It0mnQ4uH4kxb4wB=L8U)5J}6;PG*6FN>MWbjta)tgK&hax%a17ci-8%+{GOLeQztvK~aBDDtzHSncHJ6aL1jGS&Tv)BTx6c~JIWa7C#KbS%nVfzTOhmmHI ze67GB-n@}9%{RQ|o>NP*3r5pq4DWVo9=8;I=0a9?94c|^wgaXJ^{0s z4K#8yce}4no$o~7vZd7|1laLbBf=*Wb~t=pkm*%=0H?!gS~3cPGv_T%DXRjYuIQzE zd%f{Sr(I@2i9ChPReBp8m^C$TTz7Zn%1)PvPV4aYGpQ;0BYh#~m0;htd@7=x3b(s$ zB_zDgo;xQXWxl!iS~B=St)`^&#mR|@U}jMQzk8QYO2IlhUN#5a7fz50&+%1wZlEHJL4bn^FaRK&_x~gg#$SqZHdA45KYg{{6a5ve>PoF*& zeRHK?HyPr;ed|_YYN`ZoLhGd6L(>q{?Vequ>>2`>jSc_(!}cA2vDVLM`xTz5m4t zHv*>B;MRV|u11Q%*&@C?uh4kpU`Z{F*HW&>cd`k0R;gZ|A{$UzCl zQxkvnnw1SOCw95G5GsIKH5uob!KTwG??|U_V1ZC6ZX;Y_RY*(O&{|<;KznGf85nu5 z4@RSrbLe;S#BB_NG0W@Okr-UIlPs6gLWZ?}Q^MRTxRVN9@>I8^d`Wk}pxc4y;r;devxJ0@gyqW9)Sdf#J8Kgx&WaA}w)>dVNI11ksVl%)H+U>e z4`UcVW90-$xdFbizoFS0`DuD9RiK=f|CSQzm$w_cBAzzanf7HbG)F)j9xjoef%Bzz} z?oq#7lB9OX%mR?kpc6blFV_HBM%*yCwlR{hoNO;Gb1^w{fbs(X>XH?_wEyf-E5dIdgGRb2H!AzY{s**<)8+fv@*>KG>p>5+ zJcjJCOn{(bt;RcL@a9bn&ZTMT=ooCw4^DYm3EZzr!X;EEPo1)+y|9q^TCC9)?p#bt$s>mF4B;&q!mT z#-Ya&rTPa?V#Cv`w_9YCoh3cUrT`ssSh!=K&TvE#~T3Z&r~a-QsCOvh=@*UWxxZ1la1ujMwTP`>RS_seyLovdo; z^l=yh#BzI{)Jpz{&@_;V2#D3tu=@#mVN$eDmzwzb)E#}_ORs}$oRdy(?; zi$}WnZI!|7C=<^<+p=>kC2cQl4pyZPz5rPi&n=_J4Fd{bRD8qhq^p|$3SZ$VOV6uJ9q-Q&^GXXqIjSn(ym`al zea!_7Cyv0`=f5xhe?|epOoNh>I zWzK`nGvX^{(c`uTkt`1a>VrA*+){c|28M^DgMCFXWucQ8IHXkUU~O})Xh&QzgkP|o z#8fNY7;_|M4i`EoH)t;wENnc7N2+29!vX2MGA~|yLKM&Qf|7nv_jb$;2t*yN=8y)E z9Ek+3ce?LfTqLn~g(`f7^!1ayj>Y*iC=}s+2048E;-aZ^{;FLQfu_E(@j5qr=}!9P zP0h7e1*XxD9zBw1L?cxDjheUDu*H(gS=bpqUSAcJ6+2d zEw{~Fi*$XxyKf8PiV*AZ`r%dWe~$lFWC_`t<~%^E$Fr!}rcXG`zAONVUWk%_^CY(F zi~@ZHBx6{~*x0CJQkdFSh2`#z9B1JWM-wJR&zvm0LzMU_LZ#lU;Ei!O z=rk@E4#f%doioMPuj}!&+8}{+obZIw&^9sNx|@n@mNav{{X{gxc|H|MytC-K*mlE- znL!e=IWx63gUZKD4XdYONhM@nKKeqaprwB(1JfA6$2(=LCr?EHQ#L7;!(siH4geM7 z0LfLbzE=)|!Nji((Z9+!4mU2_(hldh>*(;yA29cfSLwBKr~qYuo)PK-*T6_#%`H`3Ru5qW6Xu^qE50&?R zN?Qic-g@lQ{big{k~}kT(4w7-?I2@=+wCvU{Ri9mMmQtmgHy9%vMW$t}- z)7M3;6^y6mG+rUGrbicE5P)B37mn-|A z<*Ss`JkJ0}P*=V$?(8|A>6MU+5J+=qL<|Wh`s1MY+00ea8D(MQ5#ie`e{cAPE(urwl&6Wr-T|&Rki_9JXTdUy8Z3NIQpyhE#ib(8R>VxwcEA76`LFev;#O;~yft4=tScwq!`kZ)!N|5JE2uFb~7StLiy701)<@TyxzNU`)te(1vw6@z4 z7J9J1_n@k~jA;8#0I~Rr1@$~1EbcAyx|Cb5$!vKliAlEatDg3R09E;2i!dhP zs1BJ?KQ@chtnf-{V&t7wTlIm%WTEpUfU4=*z@TSF4HV(QyQuw5Bt)qw7BG>(*7P9U zh&{9uP8$HcOh@}XA>Hl%#!OdAt|jJW>HanrEbPU!`w>ERGk`&|wJIn+X{s;a0?O9V?v96uyMhri8#bSu?QZ``^0 z0^L%o?km#z!uDzB$6h;8Q8&(6b>pJQS+Cl7jv>a)98;I%nP&o0!T0U+MrnQdyH|%b z+~VRMxl}QAU8swZJUuL9VSv&w9Mlm&y3*-eZ<91!cMb;pJg2;t*?X}hSEkb^12`ko zUTcm1Mpo!}lD$v0>vxUx^#rH>p$G{^1GZOoesp$k9UdWc-V|n1&Px@I#;rmAKFr^~ zrlv-BexR_Ssi_nAQNNIowZIANsBMSk~jx_w&G@g;4dh%^m&7I>GY?V9dOUv3ud6@9`7G zm}~NQV)r$*_+9%l0XpoD4qGiq>wGUpPRTX&-Hp#WH(ECAUyNVwe6D4bneZC@%5nTI zV9f*TQ}=G&#~SdI?OF5Y*CHsP7qz&nM*$rF+`dBsq#t%p-U#ATr%&px``zy=NXfu{ zY;CO;cCq}H^HyO|CSWDStmRKVjJTk}6j7n0CorWpWk_)DOI#cVEI@~W*r+2#RbEa` z5A0?}h^F6JtdhiKcnqcu=Vb1m#5-{l@IJMRjHRmsyv|ftRY^?EF?-cNlLk3{sl815 zbV{)m9|bcD%YvhQivokwT7DPL1$#`5!p?cZ{B;y{YU)B<+Jg)KRDcQ2--NC0FurG! zR2oR<14UPvgyggQh+sgOSrS7Dbg!Z{w2a*K%0*blh!WNn#P`ZtKy5`1^{RIP+oxAK zonhNd<@`ZAK^^h6tp{qtq$96i{z1sw3ohHzbY-H6JOvcl#}^i+H}xrhm)wJ- z^z`%vdxoi>@_T=NGi%eyv(GlLwp%ptv+b|Xgz#%4kmM$Wh+z{d2@V#`Px*m!U5=n2 zXb-9fjDw*(`!B_`|rzLoHPcfg@n_s=MwIv3+U52{t zh_9_kT@Q_#B}*y-Mz*7F)%k0eout{}#*u4}@wz|LoN9N~?F#wj^4ToI-UrR$dpp)E zIBzuo2s_`+A&(7bDo4SHl_dj1x#?xmw@r?9;b>&b#AEpzM5mXKUZn zt?FMi8NNbF8DFpG4pVv^jSh_Xef(CV>UgAr;=K9LA z3#k2mr9F~PP8o}YCe*uB@$rR4uF{0^b6yqkW$%Tad^l|Y1ZsjCbbd%s#<{7n zv3K$Jgh%)#j`6USr8L|LicYEzpPwUTmetcsTBF!0?3lLbN|t-}G(&XM3StsD@sEY! zw+ICWCFubG55_*bLy(2GBC*`ZqwkGQMu!Nzk!N?;>f8Gi;0CSR2&}hjDK^&Dt@Ag? z9x8D)AJEl`+$y4^K1(W|0w?@nR4^zkqZy@7FPfBO{mw&bGbOEA`tmYb&SH-An3iD$ zfrR}7s=smovC%raq&#vmc_B0|a2FxtiLGgKLwPu}HyBjq$k$P`h z6t_$TfuQ!WbPhC0udAyovIo<>92|=;Wq)t5>5&>4^<)7xPQCI~t1)k`ZNqS&-(NR} zv!l(g3p&~+8h-eLQd+@E$~v^sldaRrs-Us+f2($zI$qtd^9FhtwtT*qfa=0 z{?svNh@^yjM`zA8Nix#dudKAcrU`RQAajvhBmb&A&fct;dN{lMtPQC_6vVFA5w)2; zrtx6yNjm7GePs2qMeo)Z$Wje)(awDF8q2{l%l1m-0sjrY``ZP8{I|9_2FDG{7P8lP zaVUBI%M9i(U(mRF&g(i)$K3kwFin4EHjB0nBk#^?(~6JJMSnlPFrI~a;<1&tppa;6 z((AbBl$)1ZK#460x(;~(#GF^UbxR*#CpRyG!qGzNAtZu3?At@onDx5tV!Lt}j%RWD$69IBHHnNvr=-;<>Pg(grqH z7;dt+k!@aK8so1fJ4T|brAcMed^pJYMaTq4%Sju!RMz@5z!@NqxmH;Dy-BX}_JjR} zgNCQ)H`ZR4Jk-H|4jLaou2iPxv-OWNz~2q;>w924P{PQFM!v)F=lqwT<<=O<%W~(j z_Y3=urWTY%nY_Gwa$K#&D-;h`m{o@=tDJ>_FDLc(Nb0i<>1?*E#nh&19Pj*!`YrOg zwsfNqNlLESOF=yI7E1h6DQ7R9w<=ZwgBwA0v2$Qx)_MRziVb(*e_14MSKBZ9s;m`Z zw4RtQF7khK9nIitK*4v=#Q0QL41MxpMMgR+GNhd@gYx6(_ z{YN82VEJww1Hnw`#t;gyXJu7Y=Qt>7KRAA?`mkL^S{4P$mP4E-F?L2Sf1b$j7pC?< zW^e>O5;yMMt2YwGQHpIBRdGpsQ_HJc<;GQ-pt+}&Id_-;&K;5H=y{iqdDL6aFEr;; zO?jO4M)~>K85>AN3Z~W7i6}#%MmqaSpb*?09w=}}lTek=tvVX;0U02Wa_q;s7Ho(V z*0la2Am3V3rgKlvrdL}Ftj9&?aboX1zwPf$pKkop(QhXsw`{c??cl9mr0H9=!+T-l zNGqZ33jo8iyt&wwghN*WXj(`GqL{uCyKFG(Io{p|hFq@9`j`pw}@oy2PMi0R?h-$ThD7yS@U$zdMxnKlO zorx=PYA{Qyk!MdjoHw_>{5fC3N8&$Jxk4L{goK3GeEe%L0;K8eMtwPKQ~dv$X8DKF z|J{z3+|$@~NN#9oV3Q%IZeydm{)Z8+(6}-({OLlSf_KDsxp(!f09c3?e~Htq`6(h$ z5wr{Ayv3@*R_BE1Cm&nU9O4JnBR_``O4L1;FkAm!%6VaXuhy?xT3W#^ujMaYxfD}C zuyI@|aWzYT9SP>Qvb>$Ze*od0(L(UPUD zt?kB`MTHLa(B{FKPAd{Od!>RI*E2BTf9;mj=FScS9UXGb6m9O{u)||Qe)9C`#sW7m ze7=Qbn*5Q)eBwf@SLCs_wG2&Z5xDhSN{YIA>uLMQU9~~9!pEI|Y7MRp;89Xi_TfUX zJChOxj~*VcjJF!1?{P4U{fs$a>$U+@HZ$sN!LVPa9=|!0kyV!7t*!Ql1pcMF_^TJL*}_gdQI?gr zsM4gp2yvT~Drmp~1n30^lx0Kkm|#N2$UdR)a%WIEt-STU`wW zH_Nivt}<}mY)5sTcp(46Vn$+7=0)wd)adGZ3KfDl3HCLqvV_~rPfbmom7n1qNr~{X zN?q+5+#Xbg6d&y8@j4ip2yAo-ruDsl|K8Ss;eQyFjpPVs8(Cp48=IuoLzEXDp zmYvHrv9wf&y57iSp>0z2gVIN~_5g>Ndq=KmhhTM9I&2*iii>k$51DX2DkkP8+#!Q} z$YT9gZf>qlWRa6`>YOQe3k4e@lv2JbY@qpdT>5+b4d>$;q6{_;vsey>{j%){R{MpP z^7Nfi=#h{3`@d$C3q8a@DH-^5`*fY%m=w>%lS9mlh+=&u9jK?@g^8(mZaX!04Yf~Z z`j`0fonpsN&MA8805`G}ip7 z3Nyco;clP%h1p|KPY6divqQ%l6gl6O_a{YlWer0-=$HBPWXy}6gF4vRMZ(orQ_YoN znuo$PDnhf6-ojPLER4mp=)wH1=T(H^pv;$mZtIh$xEW<-jwC&-Dj*6XN2KrG=*_>p z`%*?`EVCc%NTa{;i-{Ke>G?h>je!EQcmYsB4ehpM;iITa*9iEEi{0Q0^bYt^V6;Qb*L0JYWC*T<;CarHT7(+Y+r0lj7K#yy7U8uoT5TU ztb?0{ncR~SCFRso>-jLiaswj^F1i{V(yn88I{4RhB9^}MQ=^9QQ{Ugk4WkD!K!5{i zKgTU)mB3Cl1*43b0h_i6lrcdaPIXE~Ts^H*=k43O4<2;PIa=qM_Zw0($A9^9mxiX& z@)Sdaj`Z>pk~-39X59|-;rC+?4^Q}+Gl4}jJs*4?J$Q05lBCdd)PLCqf0d%{ln$%^ zQgx01B@OI#2+#aV#4|%Kd9Yy_BAw>hn+^o~m0=-=`vjXAkP-Y zzmyRz1o*qD_{_MJEOTJbqE$dgiO*_AHZDHiTG^&cPQePClIe6C4H(*xwbD6;zCgjL z3;5Jj240n5(yxuOrb3D_#e8)IsIwZ`Z-C9(v7pNp#T)yb8*F8jtCvRSj0(MXK^5vTB4K9}l z8A`ki$R}wYzK*xjA-#571`K~#o9`aR2AT||52x=(b5c1fa9r@$30x{6#Y4j5`4*P3 z?ctbvOGRUz*=wN(2jcb^9WJc0di{&1rb6(^s3RK+erp0MQ8TgjJch?fRIpS>Az57C z+}!2KQl_i40m%Q#XSKJN1IuU45J27xIul>u4r~FsQJkETvhoVu*Ov?^Q(rZ`@9Vzj z2VB#yn&Fgbyc7tay*N^2YAPexORf(ZY#&tRs5#8#wh%R{AAFcHrs1Tc*+;kWn~OpF!5PzMF-!8E@ip1Bku8uj1K_UZBE#SV&KG9!LD>w+?);M3<&7S z9*W^s6IiV@b1?`AsES1SUKUyfyxM87MJ~`kDq7jN@Cjh&QlTH5qP875j$4nLoW8MJ zw+zbQ9C{tM2Ce&za6?nPj>Upr0>A{8JOE=Zz{lJ~<$5_ef~DJRY1y&mSrqGmhatD)VtDHWTJ z67w({i3AFg(D+RELARA4sW!KF$=eSU5`7cy^@SZ*_GZeZPvVV#*WnaU9qGlhN4Xyo zz&kv*ifp{7hZ%t%>>P36DF7vkUyqnP&dyOcB=oYY_@2-^3} zp>>UB``#)?0f;D4kchA0c`IKM(wIKCxwW;{c!%0y_CDEbK{>felaO@KtzH+)B?|b? z&}HAqS4AB6?um(9Y6lIzua(pl6_Y??_kpQ^gy-NDFmX&Egx@YJ^wSDUZ*zk!6Q{|L zBQ#bOaSf&f*{mc~sZ>>U9f+75AD?!mFwVGwkJ~xLq?lDR$1l$eXrcQlH)(0p{+D5< z!jCI9D|;x`L4+k~So2iNE5b-=D(#&B(p*kXL7#W9#Op~sM;ZOy6y}bd{{4{@jDN6o z@J(UOftK>EwjCG2WBQL}vzVXOgW}(HP-dB_m4kG;H_#ey@O>op^!zMmukYmr57P3Y z1o_=PCLZG({cns8$2~9_+;;d&P_tfgJc1te8_dj7<)G!+aw^vc=W>5yq(Cuj6@9K^V&&EG!hl21-9mY3b zAPlJGUBDsoMY7w0O=Z~P;XPycpz#15&^sWj}QFG|~0oA_a?0i5q+<4(g^z-+B zWi2>~%XO5`UL=sy?u*;YYp+hQ#>mQ*VeCnzR1%LJL~mq1br$`_xNI}>%GK-Nx;2pP z@Xn6zaqA`@QqkIYFk}YD5bE{tl$gn@UHo`=&(5i#)^ zXhx>z;ONRZ{m)5@8oFn$1k9r|vs~gsah#oZCxpK{D`=!0tN&#`KaOwDOqldS?cU)% zvh1JQf09Hy;4B{(m!?R);8e}IXN~>}#_=$P*@c$v)E=}sDeOrcz)Sg#XJN~*j}MB) z^NW;ci#Seu9QyJC&KtMQ2Wyg3RcD?sS>@ZS&+~2YzRbz7|Fpdx-})ka#-i1E zXAd$(FIU#Ew;qai*62BwxU$mO6hV5h9bQhhwd8AODD|jSaH}Hy+%F+bZW=~-_ym`3 z2>o*GIQU_p0`F0j@rauV7N0)PMky40O9tHL^OL`PYN|?H^!dH(5h6;$**!MkqrKtE?SkCN!=a z+S=8u8K;+XP5Q~fdrWw(UI{J)Y}PpJ8>Fh`d`s4HHbK+24>?UH&-wO#Og|8?*`;0* zhb$FtaxLtK+oFmhe+}tfL8^4jD#|To+U!EiHg3IloT#p)D!`5{!O|kVBY&w zPW=1mz!Q(Sl3v2IsxCiy?9vTZIsBJNvb)_Td-612rkG6Z8aV>}2{-M)lGQQz{xm%1 z%a`$xDIUloXch6CigXr)j&VV0lT*^?T$TaReKSsdJI)q_gkwVv3j&Zej)SnX@?bE9 z2McHZL^h-%i(6r9AiH4S+)S0Xr$n&;%FDKZi8O{ui1dMgcnglC za$sDa>*t!aTw7cFM0jqyFmk8Rw!eSq>ek0EOZ>|;QjuUD$e2vzwk+s=R(HoWe=xt7 z13c6DH~&w0j! zThvlgQZVlN;ys0=)fM=6`tn5>H@f3&Y6lu>Y1b(v-4Zq+7CBhOTlc4;@qx_9utgM3$4JWIo#`hL1Ib+MYk(;2Mi6LEOmyW7av)DmnFTIr>Y z%z)EM@td&*F3~&TrSn@7zxx-Uv?{RO4fM+JiL%WeP>ZZ(Q6;`1q)|t(nchjZa@Xkn zF<3C;M?#X2Up1mLuW4uZzRK2?{C2&u;LS!jV@iprpr1jE&u(D%3XK2XNBD>L`r}>> z8Rf&0Me4Wf%}1X874%$$B&R#w&-`?933ks*5M05 zbZgb@%o&li2fY{MxcyDqD2+s6mHOJ#TFhkftBh&U*OS4Pf}486G@E*=M7y$}56Q!f z*h+I}k_0zMc>gmu)Yi%z;n6UY{emm~e)5dcoAt2ur&)O-{L8IYTP4aY_)5*`O1z`j zC=Wk%#ajZfjNrVL0nyKVyB|(~naFm3#AcK_atAd%w@^4e|243%QZ`I4i4h;d5s~aZ zKi_frG{qSiE@gH@P1e7R%!g?H#zsm81F$c_(k?Sm+4Wv*`Iy5w4>-Q4H9 zO!X?VB9e?r%1-SDfrtI1#`aloA7!ml6NmqP=zIOwmnzt=e4Q{iyL-l|FLMK%xlxlH z+xO5$L64oY^R|v8xE8O0kwn)gf}8h3@gZ7lw?B!s$Sx=3(~XXN$}5rDq)^orRx^~TMl z%WS!!OTrr}58OzPTw+DdIdJi|77YKBA`&y2E;mK@I9ZB*;~#{YzXP7%+{vj|S3lJ~ zUWUW0sE~Ji2c@v!s3=bF?_(H$D+@C&0ri>S9%{RaBv;bxt(wU^j|%+r9f8NZrgI!j zYCNyc`#+teK0z4pBE>#}B*pOAU)dUExQo8Ig^Cpwxioc;ROtNDkm)-jkn5g4rYqC-X_98l?MKrS$J?|J>SxUP{ArIU`p z|LGh5*3FN&Ud3rn$%N3XU=gkL?Mmsi@Mf7R+CbTMYilX?h-9&Mk@}+ph0FK5{H5Y( zLkXQ4{6y{uK-Die0ht6lRab9W-Z}h9V=S!DgE+{3Zm?O{$?ct;F~H*+<4?gH`AWFF z=m)#{7k{Z3KXZQa)Yz+)vRj*aIyAdUX(?LE9)-FSm6et`>uZRT%G;4VgGqc^%pa?6B=$o3tx9P=TI$`-E(haP7tb!XK#y#(9vK;;?aq(yAyyF#6f4g#mUpr+ zpTBB9qxDsho?IG8p^M2kx$#W=PQFh)dj{uW(Fu|O>gto5vZ2h=cftvw^gu0^(J~X$m^t{A z?~TMXia=@q&z*k+pTCQZ@m=s_T%BWiia#Oie>~A2ZiaOYfBTgESgy<#nAMQE(Zkk` z8s^aQxDJf&*l^$Vy*_MlW{+3bH)>$MELYw|u7}&^E!xu;eL=arkV9Q&YK}ekETo}QI!U&M ziD43%2Lb`76^~cBIo9-_Q``@K#ewHIa3m3Fl>KwkeMlNLmhtVki!=}Fr_O=#?b7_D zPA5uz6`HQet7Ri@zOylD~f)8u;gZ*Q7XLO zSnV3dQvX?3!nGsvtf318UAVaEYcc4<^^A{?C(F+V#Fdr;f!<4pzrlYosDFMlNCS1G z9cBggC5U!&*wpyeUIHHw-+vX6$VHNr7+KEQ%`!~xnQbxp4Y{t`u9Ic_MJB1hs%y6= z0v}=o&T-At;7-aF>`~E2)7-$H;a!aHZ!_eiSOKk-##$*yBY>ZI%0wYi8|7J^eHY-G zL`#MipAqe{b3b9AbW%*ZrJ^^nyC$}X%6pZ+8xbV4E>iH)NN?qfn@}@-a~&l%0BBC0 zkqQ-u|3vI?P{ffttndS@4vyQ@o-v0c{pU2cqVtOG0xyMwkANxxl(#d7&%AJ;lhN|L zT%1hstHKe{O6hDJIo132%^r3y6jU!WrG+cxkgSLp$=Tt=aiG}l$v`1g&oAJuM`2ju z5j}vN!RbfkNC53Wu^zumFiJ8iE&H~bKpxu}Xdjtq$ogaEEw(|p1TS&QRpVdR8UdD~ zWg}{b345LI_v{JHub2gDXR99QSo^E;f;)Le$?&;Q`I^42720t1fg|RZpy??POSa|A zzAJy0tDij00zp_dWi#YFwC}QL=!WX>TbZ|(0O?5~HIe|gSJxM3BdVxZ8${oDnvH}O zpFUiwc02UyooMMbvjs|qH}$*8_q*cQj(k3=6V9U!>m-;TQKX-_T!4KrO_7gw#Y!XR zWBEpH8)0VQ4q0~^0gf^GB-D2#!N?5TQT#?;Y>Oi>3zvpP$`PpwIHCEGTd*y=`644d z!~R#)${@)|?#*uD+B%#|0?=9rXP2Ygj8cvW=3(>wpDoddISe&N=i8PL{$V7+S4XY9 z#I8zhV?!&pu#lHry1On73$IDd8ZuBTzQxAS%~%+FY1dMGIh$SCy4gaNKjT&?Gd`EV z(m=GQ34FTI-$%CO+;B{P0}k;4AlFd3H<`lWq_YczxOOj#+J5>jH4XwE>Nwam|datToi zVbIjLK^fdMLc#!&Ss&#eS1m;f8;u7|}fjq`M z69n9WlM%jaaQcYe3Td$85%1L3>C$)TD4liOUx|c6cB>)R^VV@wt_eCb9+4lhkw^c7 zKYfoQFPZM)(J5WL`Y3Ywo3u^~hswcwG^?<>I%fz)a%^TaXGmy@W)wsqUU9kdMYDHT zrs+~}Re^`1f2K8N5O|pGSGlIbZ##nTH**bz=bI>9P6+7(T1S#;B;#aQWMxJq!jUK7 zVv!=XbAm-|E!OOk9uf+oitL=-kH?bw zxE-{u$IX+l$zx5JCrcGhHXzOHe>3r}|7LguFOB$ubN}|{NU?D_VvkJX=mC8NFu}6m zPiX29O*)Q;lDI*=Ynfwb0t7$%(tc-m*Mo0-UvD$yg@^mT7wmd0BP!%;N_J0c`t1lqxZjWhHi)Zdkw|qSf zvD&bbm|Ztd!bv@Vt|>0m_x9d5Q0azH@5+A0hfJg+?Zb2pJ*(+jYCLY|YEmXcetaw( zNPQJ9_UV6kEYzX2aTBDCXpU&*>(i|@0AklQOm&}h4Vj*!j7-0-{W&ASkZ9XaGEhZk z5j8;|U26N7pmXA`+EK8RO+>>{@Kf;p>zt2{s_DPa2$Oo@q=4p zdjtXuFIz9pqe1geD5x(JNF*=s_&}6FZeco83)C|*bC1)*p8Y{|~pQ6ct;{v<~t<>zd!7Fc)97g_4sx=?wVv)VqH5 zZ)roBgF8nOEa^$|5g|ZBiZ*;mG7Vp^ed(wF;bWiUkgwWK4z8mHhu04`6KsDhmgnc87EkZCMtI1CU0lT`Rj zLstFW>X4&W3MO_LSo?dkS8xC2=l*cINBjXSiMg%2mneOobrbUx74h}9bNtLFCbD0P ziq>_FlgaaH9NT2>gnI)W1VEwKIcRbkJv@yffb?DDyh=IyGw}!2-Dy9B`>Zk`ByH}b zGf+SJ#DD()S+#NLAc(kHK-EF#y3r8dhBRgQdQq_}p{|vN9^9f_Hy47AN`-!99Tf_AX@I0>xJFSo0$I&Y!uNSPzNf6X3|9a zvK$WG>>p)r9{m)6>VZ5tqSXHf^Yync5P;VqeLi#mb3Xe``hpIj^hiUkax%N})qKW6 zHXMRvaCMkWow=XHc|Ui`BY)%)8(L1`%A?3U5Tn|pDwE&tUAcMYY5Dc2!c36Urch|J9-~Sp6xG{ z7$rT@87wR|4V1)cHHjNk=C+MAe)VV+*FmO<=L72}&oU5%54o_nRuLM|>b;el9gC4b zUdmS4Nt3ukNzu(n58@=3_^T~xCF}KGKcooLE9(cF;yNN4X=u4=YiKSNL4DJd4OrUEAj+q?e^#Ui$%L^3Z$|d#Wfp^#Luk^ z0(@*-o$tr9ty2J!v1uRT{m&iP9ibb6jkQYwCvdP68K6sel-6L z^_iXk*;NsYx-9Jn1NGzg`oI68!Fy)hCDMm;K@#}1x_M2$*t8YP_Qr+TFcR+s{n(_; zJ1u!l?kdc3ud-7Nx(hxC$pv&>eHYmcG?L+Js;A}5_PapN!6I($z(5gEg@r^%V%18p{fC#cMi-u0uAnn(AM8{nQYSp%m313;cCU2|8d3VP}nY|KaT7P?5m6>HI zdYLy=3>nZLP&?Lo{kpkrnE>?_po>n1!#Ou6yIQESB13-^$XdW^lylVlq3ZDCJOAo0 z71w^t&QS9}3Md8AG_MJ)BV0~kba_<2=Z&B$+Rs;%KE%P%S+nf#9jlh1y}pZ+Z6Nd< zfC9*dsFV&Q{X}S6#Eq{n^+fM`1)^V;+Ge ztp6_ereg%Z6$6XUR!Teg?A_pX==Utz8jAR=tM5p=im|ki8+VDSvmZYZD{>Z=9GcDA zeLGqj^ksR*S#fz=xf{$!651;jWz2YwBzUi7zHW~B`nIkNX~0||bTkn@273Wq8aaut z)IWb*)*-yT6MJSE-{NrLBrdF+o|(J0aGNgOIC0^??Pg7=*`$Ql z>W0=$8>`HfkiJgk<^0z*_gC5Hi1r`2m7ORtHc87Hh9nG_219% zuR^Oj<~B$h>1nA_U|{E-zKe`-k0?WANN2cx)CP+PZ6{Pu9A=@v!(Ldx@Lcf5a8Vf(j!{2|vEk{sd9d@6J z$uBR~#a=l7CAKh$H-y|ErZD=By@TflM-c%3ZxP zihzL9ErLi$NQVq43QBjEASm5kq5?{)lyr9tJ<=!w(vm}`IWXC_im^SWxzV3j@{V@ zM}4nQObvrU!#8)U{)g22-!7a`D-rOxcc{G>bOnzdfDFmU87Z6)7{oHJy<{X2Tbs45 zo|GOLTG5>)eblsj5^}Xk+sqFxTTRGbbrSbd%nG@7kKgDfQK9F4*{98PSW{8CS-wLv zDJkU}znTN9e3v7N3raqs*?=V0BK1t7AvZfSuK4;a!;=S6AL_EfA-+ZX(DU5T+S z5i(GmB7pMMTtgLqAlxRYDF&)Wxv{knz>O5{XREBlr{?Yc_}0ABPi_2zyG%x|1#x`-HGmri~zO8`Ng2f-30%3vq)|C;h#O(|JO4Z zUIj^IW0J4V<8R%&zg;78UT}3^k~}Ln_@ML5x83P9vGD*@&7(keG}ebCziIOZ&C=!o zwqBrN#%_q~&`M2*^U8z>Ne2p-cCtEobgiPqxp$gGS+dwP6M*HP?tp#damdsIX8hqx*;k;pbPj3o;ZLVRlNBm{$kKyd2_olN(j9ouEkgL}ZfxNv!^!a#T$$o(UF=#XOV<7-a^jo1Xz{-?5M7DFoP^T#i08XkNP*9=Y%O* z9>_QAx(U|1lU5%p4HxD(#C$*87mWrSLVprG55CLUE)0Mna?AX<=45Oy=ayG3ejqn> zzz!g1)C-+l*A$AGYDak4-c7&VLJ7!S>)$#=0@Z@=l8KZai#z~@)#O|*s$0U2u3ksS zcqja_al*@u~EOF;m`^Ck~+HfPA+U__`0GrS5Az!Su09;lTWZ56Sm0 zS5`^^I4l}f4U33qA*FmoWE6Ll3Vfb41I3{IW8mBFM1TVsV9GyA4W$tw$cgCJJ^esl z1^Icwa%r*oW8fH!| zOkyCH>k~hz_}G;8SuPC#279li0qpezAU5vSvp?BMn>?ykz4dvB_*br2{VEIqLZj#} zAKswAmx|9_*$eiR>(odE)bq*RkhcUj4m1nT`inL-j3)W zC2XA=wLR$%J94QwetOg}T}x5Bmyf(QDBT>ETR8A=8<>)w$i4`cmzsZd|IFqb+|#*< z=IeUT;k2;Ri94v9v{vFARsg|pUEu?J+WF;wAU2JM-)0oMY%7?fL6pbSHL4>xCZ)|e zjskQ|p7z|FCO)!XzX8Bz{*%57G3+%n(LW!Dx;LP@B2M4!{qwK>`hTpav3}T=fAg!W zvfpznM0^XLUqO7lMoJ}Y@mN?6c%R^#_xO}1`A7Cl@0RY;_-nT4TXIl)F5*p%Blo;w zHYO!AI;ewJt2W^49Wpd*+0DwaRXMxb*IsT%eO%g|?Knmf&bew%--4a^{W#`4Qft-} ziblC$6gI>^R0U(qJ@$wKVAb13rPAM+s0WS2vON9CpnvIU_u$jj25irYPwB^gF)3-K zT#MN(Y&FZLtv~@1vh)zop)vN5nwH29cD3&DQ{caRj%StOd z6ESlOs;5mb%-09XWq56GUvDJ9Z9vG=WS$ou4O9^IK z*1RvXg#N2dBR7MN zUwN!gA;yFF^4!MvwYU!Ez$ilZnZsHUT5m~1@3u?#+L?0R9yJpp}damrTv zld4p~c0;b&ZWtO9WHC1v7FQxjOF>0I3G&$!Bc)4%bktL`HT^i!j|T z)j$fPG=O-owENf{=HY$qIYcs@ER?gOXhGeDz(4azWA-{Z9;;CD?|*qjAKD%76|y3* z^XmDsd3@M5c0juA9k0)W{5Iw*4)C>cSjFI49mKr_QUYyAIA0s!e$JUBt=b9OP;R6r*%9Rw)XH5rBo z^WI!kK>vk(=MX?Mx>;=4{z1r~;mBcqxASPVnaSXd3JE;lzf@{9*d4QLjGz;@%h26& zS!CTE_!v4WE6v@L_Z8aDy+ETV=^gaN=bJV2O9CWB#k1O^eD6uDG!IXlH~OFtk^18W ztvfqT5`cHt#+E}~Hg0dZL^ID`wJWg+`Pgwt88S52AwsdRrN0p{EkTkQCJA`!OmEEv z1ws=VKvIztupF~!sAW4=WZ+33^$hnX#Na&ILm~jpD33)6xajHW9koD@B#%DR9Nqh2 zONYGB(576#a?0QQg^RH^#0K5MG$_c{QpeGW_J}F8lBr?K9WI)oa~{ zXJiOATvZ=y+>U`UWIfq^gV^43=$OaP%-Hc{m4zixcHQ%PBW=vFI9n=VXSl5*@Q0&(B-$96re!{M|K%7pLe{6t*1yjJ_>w@}5f>|8YBS)^^s^a3y8PzLL+J}XIjYcVGtm^4WIE1(LdHT7D!VjprULYfruG27 zhbP+)XoB)2B(!Y@M|j>#rim{()rbj(2OHU01%n1!VK1|934os`qlu|pli0Gs_p;3Q z_DXWhzI;NI>(-V-Q>a@FuM$ix%%fe;de$F5^T-VX*dO8a&lq$R!yG#01S^xd%X0NJ zLB(xuXXd@v@+oOGhXyz#3{sxt`7y0CQ$WzH;9RE3-cs04q?J?t_D^?KzDi=Yf#uPj z4zdFOL^)1dF5D1w^)(Ii9hv8<7eFotHcOjoEbFk+<%>W~7v6ZVt#Z`5?!Tdx2XrT0 zi1|ysIq_0m5WFjF#{|d43#_TzZ#zmib>GUH#lY4w?96|ooJtq-8e+ti);vBv<=JO( zZ!~tGEDLhkGdm>6j97wK$nI4$!}4d%k3Ypx6Vd4Yhs^^#;ZMIS6YW)xA6D-zs?`dr zv?`JOf+vti_r`dPuW%A;+9Yf?#$_bAxPTxpj40~%zj~-W_C!`=dj^*8EIddnkLn7{ zT-c!4zg}oIh@9D!?t^z$?X+iU4u$3BWLL5Ej~4MnIBo$Xc zhA{gs;>2z`dCoZ{=j429KHz%S9`W-L{mcLHrVzaj{Ng-?XUeEYT)h)%GEWTkevU}> z@u*l}>_@E-2+{}i2GOyVx%6pCEOF@wfz;TpB0_ho!ffvL?bnbzy&6`KJlHq54)Z}t zBk?l@h|+92eP?{WXRyN91N@7Atfu^#q6tn$9rrcT{m==M60$U6iNw-Uo|C1{lU)AL znDAfPm3E94DVAw%CqzNza}yCB^TSeJT+nC&V)^c@ty0-Lvo@pEaqA(S9uQrDjUR6e z)!5W&a!AG?2DM;^0I_L7(^X+HUaNc$K%Y}XD)sq=rY07*W3|YxWDNT)^C?eH?W+`8 z^lS`(J^>WxF{R%4#5tUhuQ=_J} zr{w2bX0rWE5&6sA0=B^L0Od2dOQ|Rv^9Wd>SB5=>g8=XDhQl(~53y81w+HUT_nKn*Kl3x-k4h5k@NOv)lvfI=H8v%2A9iI|Fo@}$YG)V@77Fs^{?CS=1 zkJhBNx{^rxFa@&QRm*3g`9v=PS>waVcPSLm} z^!5~l+GDFrp%dbDax_roVn_RtDYW@3Xaks9Jy}`xr9^UxY;wJC*WD*>L)O;=`LFHg z7E}j(`Xjcjtc9R-nwhu#Wcgiz<(~=rf9C=rU@k~>)c`PN!xOl<0382TXBBmTnc9`% z)u~4MA(Q*TDeZ34#UbwQ+VGat;`PtV$U7V7?{RHEw-kaX!FZ=?h*jZ84p13slN&?A zT|vcfDw|e5?Pk;`!_g0@oiAgU1EG|eP9y&*+peTJ(Sjb&q?13zAf`Xp5l6>X>9{qca%GuD1PV=Z(V8lq(1XtR;b!`?=WRWC|poA z0#1yUh;mZlE#$kJ(ZjNVhp|d7U))~|f{B6HlX7+}Bq35+pbG0hIFr`Z)u;6l3ffby z?Umdu1H@9NJUw#Io^ouVohb7&4U8iV!ySt^ji&Pli4zb|;#7ps2r{eC565 z$!{quJJo@%x$bs9DrcLlAR4NH9p<3&2fCeBr$-sxDT``Lvc*L+g)_5G4KZ-!)39(= z?M>SY7cO-5^w_me-!cV4o%Ih`c}x(uWzaf6OiqmWu$ECFfZbFb42slvM**epDcTx9 zT52#VgZ*K5GsN@W@hA4@_fOd}C3#R1CUj?1%1Q->enm@GFfJGfMsH4&pL6j1Y}6Xj z@J!@zaBrXq;&{@3Q-lVLAKCjR0l5Ynhir(rYjb}8B|E6&$7+jLAp)HWfdIMZpUTxc z853J)A3w?|G@Y~%+o1w%N~`LBfUCadwMWK)P}56zxgDU*pjdGW0EeOYnFp;>RNIKp zpFhv9tWY~3o9CU=r}@x|!vt zSN1-iNKAu{dxgmO_OQSTt3n~it&+x`$HyWANOg0~?FcYfC8Qzu!-fI75bY)&F7DT? zeleSZg$DlbYuY6aaq;gQZkfJ6*f#?_H_bAhZ01DP#R!1|y$?--c<(=-R!EvG#c-uOn>)!lu~!2zO8eeIjfDVFtD z&OrY@x}{FVO%%r0q@_#I=gLmwd1GU4o0=%C-ivSAb*?sIz5H7PHdKVVDSEd_Vthwm zS*@))!i2#?+RDp`(qfPTlx7H_JT>^j$8n*$N_VjK3WYpr%SK(j?xgeH`Z+{7kCuWD z$R${*6y;J3p=6GSKy<3UtTUPik-ZG2$QN2qe#t3Iu=<3^uk6s%T`z{g_-fXJtiE=F zFhI9!_Tp*p_?J(417pl>w>AiQJb=-!@?7mIOtSO^C)B)Vt8J}e^QWWEtyx6~=_t(Xk-tG6!+IP#IV zwlriXc-b|wO)1C3BBkg5)RgxR!!$$@oEh>dpqpS^j^C&~7-CJ@-*V3j6QyeE8&^_MA-F+P zg@>cPPeHvgDT8-+t*&5tGpM?{x_(E%dZMmi`PW(_2yR;MMuS`*pdNGKwG}HjSz66N zsNGhu*#_0*H9vIbGv*ppFVxqW)P|Xx9rcCQ(v)jk4j6%)C}_a7_4p3dw7KAzA=^gZ z`Z@r7RY6$-3|{y;7@YoA(^WE`OC`QF?5n~l4PxGx@gK?6kw<)NL}O9SWqi%AV}T7O z76sBt1Z(FBbT8Pqf$3V`c53%yYRT5cPLAr14|nkdPLA5Sp7|o10ymG3e&#Lx4Id`` zx=zaqDPM(*mwWIyeJeVmHC<||TI)n6_hYT)*L7?&wZMT4bt)7_WTk=pd^Mx1s(6ko_6cV3btlU7c5)h7Z|}96vU9-+3R>6q&GB?L4`?3Ocvv9H4!D0xZFl znxr~9aN;e8QH;PI^pwlw6d7sKr?GeIRM3mm%2#>4vC*ttdt#=5WGRWRp6B{Ex{$Aa zkgcS><8kD1CKOs*CD5O*Lm?-;-AlB)$p=|~D@QoQXEDV6^bjbd?+<`r_npO@m8!CG zZ0%O)U$4QRKml-Vau}g6&fX48aOq)h07ItMTi&mNp83kD-a#=?!SYOMv}Y#R%#Qd9 zT9{B{fSNv3@OhFs0VIqYMpgUWDl=eTr8T*z$}uzja;2K8=2fY=F-pG}T`I@+^r!5? z9;0`1Qq2cE;ll&HuD3z;pLd56c@Jc*NY{+Ru03t$Ax#!gx#A_KFq_iW+Q%EVRj4gm zmpD!z>;s^!T(N(+)X#~s{w#_!>e6XH;*j8Bz-w3%d112%8AHrX_{4`!~-B(n*iTbKZ`4E^wd-9RTOm%B(3rEWf z%vS+R@z8EFjV7WN1j^Ko+Fp@nmjpZs5^fkX2APwlxHSahTP6X=c9=3B0Ha zBW7%l5I?CJdXKbhD293S_N3`BgLrH@#hzWcFCqv&p+U&f5cdJZUUW1$lt&mF37&)^8ou`QJpFO6P)kqen7Tc3xziy0gV^}lnD_)Y*L7XEnXu?aoTpJNzB^(UqN%0j=D3YGYGHvYTU%S#&gyggarOVbG=JwO4odXw z#nvt47*dai8}2$HYrm)79<@aj!t<)He0p9_rAL0!K_G)p-SCUS-F`41dm-~GqmeIU z`R+lMbVw5awNPeU+1qr8q8 z(Df+lw^Q!iS*yoGfZ^6^Dk|aQ@TmCs{G9v>Nx8f0ug48XuV8|)wnL?6qYgd)AA2PA z9xadAZkg4~smW?Y#H+Q{D&ih~_qv^#DkukihD?ATM`KJ+r+>;Dj;WQ=qaI8gb&2Wg z?Vc#JED%gKG$W$y&Yoo$55ilbfZ7nW5R&(Est@Bh$MqKW@HlmEylCRWKV8@$m*{%_ zsAi{v@Gy`}-ue+LcQJOu3$EkD1c8ikrhqAaI$2p+5$#P6PPOd6vwbh>^sy-!kM7HP zIm0}W>zy80FQR!ZYdDoi@OF4B^Wa1M*?EWKJp`*GD?Rb;pOrq$*~g@8rY2>n%6Zwt zAVOPR1(+{5ca69#FJBIi)7o07$!ky25n?z{77BZaTPZeoTbnMXy-M;=da1uZE-5Zr zO;Z=1-)NB^EeZc>)#et659SXLP&GHVO1;C!j~-e3JOTm;il8{?8MLf%)QLRmtk~|? za^to8e{3j@)cMU~LNJs)+k{c=u}VHJ(Q5DKWOfw=wWUqZrq>ONdL$k_bFO;WX5`Ez zF5^Pft;0iFeFNG#2JkVbJ<5}(vZfbt<2Yp|`Ih$8HpA5#}VT7T)1Fc2Wr=K%td@yYP0W@;dej@@}x0nJ{pYzW#7Jo{q&mT=_R}<55 zh8BtqzKiG9nq#f-WQ5fXoqjkc!~yx2kAvHd1SM?%3d89mBih3IKSJ|GGXY;f>GI{v zx|b51##~UUc$QV`;tgNFI_K$?JbyptJbeX(%Su>yG`h9uyA>|KK1KSR_c@IFu}>T?l2<-k=J)j5rMVTFZY_2s94-EMMuW%09}rR{2P)O?xj13-7Js zgXbmpV<)kSghqocEw1!;EMxo0;P<9}mDZ_PBE}GSIkFOU)~Odf?`g+pzmsA-v$yB0 zXCCAy&W5o^#4vuSY-4#nt9Q`Z)ovd{TN$Jb-l&1zWr-K zI&Oh|^!C^Sd9FaMH(YI{OJ<>ZeJd9l&RZmv@~8`y#iuS5?=Z0~$cx&c3x2aKMwAFd znt-rHN3WlH{%)Yysb9<6n}pP*7K7TEK=toz!m00p+C}~E7%!J(+hdO84q9?IiKwVW z<`KJ2p9>WTN_sJ=qGLS$+wgBVNhmzbk9Xa?Ff%8o=CVNAu4OH1NWd}@j;HP>{orMK zaLgQ*r=*``0=|6w5_OR{=p5o#FS#(@PT3g9Y z5CX0l4HRVFc@^JQm$a2CZIpVO^S&QoC>OhoXc@`G)oj;VF+-id6jn@qnZJka&ABUN z_yvt_Z-G82B*xtwZ-x3Jt!%Bwonf8bLdQy*QL<2ia2~5<8!078#jQff=I6G{*gOIz zWhX_OH!Vgs)1$R0Dz~+4u91;1W8n*#ERq!skNR0_6cXE=er3FH0I|QuGXc|QLp2_i zJ4=JpI|s7D+sZfT=|3&6%1PY2=Z(6rU`jw9aDFCIZg@6mX}dXx0smkj!lPMAgX!VK|xH(#KpmMmtOL!xV#+L?rSMZLFdxA)y^qf z%fD%=#uzfW$hp7azQDU94?B~OjWNn7W6M})4JO-N7suF&O zdQ(k ztz95kk~v$|Zv6;rPZ#!(^z`d0Di0Y2)63NKu3t&&z?|NP+~Z=1eL6zS{B=Pfwt1L5 zU@mpRN5pG^IUoQR-Yh<$)L+F-ch2jvw-L#n~l6m=c*_j_k2>q@| z=4`?4oHFReO&2N}zK08si*-J$P8;+bGN)WQ-KC4a0eTj6Ki&%J!pJBoa|;SM`rz=& zo1i+%MJsRV@XKZgR?=5>qFP3v;_xj@MQi+rBVB{fontI-f2;sI3`kUxTxh85yHQK0 z%7|Jjr2WQIwSk_ga+Bz&c*dCaak7Ja7#Ts@Eu#ucNge48*^ab-)hE#&miu9pCmrO8e0f=~!L5R+Hd(i)cN6p0R|7Cs0epr-xP zm-rw4*_0)!B=rgeCSso3dMmAPJ1dK>!XiD7V;~WqT?2VAIVod_pxU`EBmIrz zWdiXx98YorVnkWQ88L^bekvv?Cn&~eP1`374i1`{nQc06qf{{u7asibF+vwl5jE4w z2@YFPm0dAjZX3bKFW?drG)xRE)^Yf_HO4cmcD5>C!z6%**YxH6m+p8n zre?)CX@gnD)BW93@)AuV=_?$v(a~Cc&quONDU>x8a$bHgrTWI&*Y(A=wA6{|q=}=J znuw&~)BVKcFVj5rnC{V0wja6)p0!%S4tekz{`7di|FiDVH84q&URaBG)uA(9*cIqd zRCgQzc-%GDWj@JoYbt=D!9(yLzUW_LYjpSAIi*_>8S=k#En4LcO1`b8y$DQSIC|x2C_G znwlDzx?H4odU*7AuAjunf1H;kTMUuqmeI^;Uw>U5Z{8B_OIfJ{I3E5ACMG3Ann>bL zg!As+!CbH6U$#x$qT6bIa#4iL%auC8)A2#ZGa1BSo51PIe4WD-CHPt@x4drmL@MI8 zfG1l7%P(J(-U@Wzzia0lVa-cCbz8m#K4rDWr6>=9%#VSc~rlRVp?Qicj zTiWfF+KMR!l&yBa9~UbfkxSe;Yhe{p3!BwRdFXgKSF=>Ixw-lI-}p*ry`)|-F(pOJ zC)_VQ6gG(I*}_XrQ)Hg7Y~dIl{xIiL9T;x&R>7|4ySwr?mK zp$Sy9=-GNNal}t`+urZ#fyE9|J-)x>Fo4A$CyP?Ft9SG=$2tAFJ;U(-_~KwW@BT7k zDz-r*FzVh6-X)W5l3PHST*9v@5kjAx%Pnb)-z!gP{9Wmt@He7^AJ+UvTqo)dlx-|9 zQ|2?0K@e?JgKNJ2gvsF53i?X*u5Npr5L}0E`TyYq5-<1i^_hB2KFgoT5Z>479U0=* zH-q>2WTi41*UdWd6^`O~a-t=?r2K44dR|_YikD8rRMNDnXjVqKf8=fBG=EOIG4&Zo zTYmsseRsJ3iC31nbqOXvCGPRkGm#z?$EeHpvG%1iZ029y?<_083>5X=7`n0CK0b@( zDX4yw2J}-d&M;s5z#LZ0ZLlB#JS*IRK!Vj*I7P1JRIn_CZCk&+6)r4md9TB?9$QNDwJ^=c9zm6&v=KDHH%6DxvxtM^tl*8Ab*R8(lT zx3^>A;i+kB$1=#p=VoSRdaHjB_=V(#Lv_H!^X0RNAv{0+8@#7o(IQRS6#GQ|O9^Xo3y!~{ms&<%@p5b(~+PHJ_x}Fb^%zDo4Sb9{ z<1J=5+kEN@t^$!k&&1f+*z@CZl9K+OU`peqDp@6^P?S8}@p_0?FW{d2z|yqk)3CKI zYV)cD1@G>ba2B2O3M;0E!!^%3sZoHH>)|r2)TouXw5)7LAZ?$^>hnvC13-k%2IX+G z*|sQ;{wwfCpf!FxiRbmbmAy6>^vnzp)y!Bz!2uIF?h#o4cfTXXe3VFzz~4BwKfR>R z>)+?tqCbB`Cfg!+%j?5=()S;=vxYqSUyDZFle@QbU2Q(Jelo0m&#>iDni~jhkJW~_1B2&}W*{tlq-bOi2wD?ZTiy{Iqk;F$&(M2^JK8|xi z=!6wC?7G8v*kwTet;xgl^NQEm(0~8_{raX|olb}kHi@I~4C>zEFh$Qjyt150J9!^$ zzV~M$j^e$Ijg5;=3L>gp^;vAyTZ$HARf8c4qf!9@0S-Mfd<$T%Z8wnDyu}rkS=&^- zJC=hySw*TUDNV4a*X#_L9rPreq@q;eP(_Es%Cz%jNI@LQXH{#w(qms!-kK z4LzW0IyF6gAwIzS7d&{Ns&`9;;tXk$T;*nwL#DHz@8NE~gan+4KDEzTu5^_K&zAIw z9Dpei#xQ}~Zp^RZaN9dcp}FM*rNA@|Qud}4TCJln|9EHp+xW$+U#!;7P;C=xUq>ww zH=41iqSwp7i?(nrm}Q^Fv$&gfA*z&M%6QLIL%=qWiUreTAD)*nsd?uLw5JvU5{viT zz4IO`D{BQ%l`1iVv!PJxqTyv>Dc_|dfcH3JSo84mY8LtNjk_ypXlS^Cq9Zpb3QNsC zy>^CwKK17Q=hqj{ez8CHfAXn|rI;Pa-Fz@+SBu8$sF0JB^KSUgpc4?))gnIE=iu#t=x4fd z<9_YY40|On-rzLgrprfa+rFJS|HN%~r4L9KmprMz*wEC3H@IE+E7eso+V9={x%CZ| z!0T7q@@1D5nlhF>8H|41TrHN^mY8wf?cGaCS%JWq-@FtplcdzFjAH)?w~_SYuu;sY zsWW5o`2-Ml9B&$O16%lR80Lm$pHQ zvR$!Sb6C}np}Cf$cPt5l>aK7#U9x$?h(NAau!}d&W!u>Swbd@bX3A63bxA9J>-HYY{=ldH+uZ;7Q=R5{glCS~a?M8z(#z7(Pu<1DR3nC|9zI;hc8BrXU!QnYXN}{1 zupsA^ou>smC^)Q>a$FinqCxI8p{fub(Wym6ilE*aCs%sRbO0uj<^8iVa~zefXIQ%# zw+in3Og%eiilfCG=pvBs>jyH*)#2Tzy379_*Vnxv$joKwTB$Oh`C-|g^dW8$OsKYj zPngv64sx?u78%Q`XU-b|- zI5d+0x6wErUG-F(4FW56QCuI4jtUP8`-bDaF+<8_9LNSl%vp7-)YsS7fn;}Ix=c)o z=do*FZ?ALQ)XCv0oUe9Y2B_1!=2aiARO)QHv2tV0b|$P2skypVAR30m{BRe*ptdch z0ibIq=`>>FhZ6MGizupM5E9a=MINtuMDyLbqkuXm;IgOVdIv2ikf(89^Van`cHrUT zbBmis=@{Bi`j9Zw(|vSn}U%rG&?q{CaC8&OJ9G#D@zYk zMr`u41P@i9Rg6$I^A(KpmdR0r_5)$Y;XLTvV<2h{*xL6{0_wt*6>%|#U_ye|U|t@S zfUE6WZsoyZj+?`{=ixF+Lp#5EN0Wz}duwrZXf0;^NL=J-5)YKn2J+iqZR`Q+T)Esh zr+ovF+NXRTPR^p?8dMko=q*wtI`;3u%eqjXXK9%q?=l+&sdnwR{e^ORoi)L86SMxw zskq-<^_-=mg<-gnmico`1rvo5{{RCOg*)QwhB@3`le(?5yAH`~@9snkTE-hot7k4u z%Z71f?z(*;iF?Ofx$e2?NQc>)U^um>ZbSw$YYTDLd;=iZYLZ*i1MW07vH+O_{UsSNs+ zQ!Q)ujpd9YcEckg7JW!`*92Elw+84*aRPUuM}Kr@ruZ^2+5IOkJHbWZwh3#&d0F&hz91`|K^wr+%uK-w4l2$tlUAvezA@v- z(XO#PkL$7qbk}r}_pLW z#Wj8L!QKXuu4BAyNSzJS1S>iaId=H2&1*T532NI=1S)5=qd39yHfUCYmOgF% z(2)*VWnpIC>JVC;Un=fgDIPc1uJx=QuIUKIz$TuLHp^eQ>)_@EEcy9pg##*u814T^ z)hU8JO7s)0KH6wCH8ENL#18Niz;OI#aZY(opX=^Ye&x!BX^}|}1C^ltqfiE!MamZ8 zvmWrj_~m-Z#N;w{0hd9LQr9;K^*uQ%oO*5r%Spbi*znM%kyTtKXr*?bEU!Ji`@{2_ z3~K6y_AmkSA`|zKr>Oc|yQEHPp~7_K_MzI z@tm>n=g?j;f+m;i1|hR)rlzLa-b`M5^)vturO8T4`kc{v<;s<9&1W6Q+fc7jqNlcv zWY3)QGR^_~}Bsn4^R`7~fM>jgY;&{=j0i zqR+6pJiij3kxO(1kf?U~$Cj!leJQH??TOtHTH_n!Xr(TjfX^SsNK2b!H}GQoaLTJ+ z^0jH{Be>O%%Lt~-vq@8cP7j;8~@Yti2v(J2{(|^ICjy4-9H>XA^FKb!m zIBThF0^GI~tOPv^pe#2V?&Kf#tNL}?MG`h>5~C6Qux1vvDFYr}o{Z@h%MI?9Igc%s zk04z@!aPHBSRf4A#*E7SvE1#gq|obr)lk}!Gm8;81Snj#nc$8hOhHpK+ij&>A_Q`f z3CaVkwSiy0+*jjhE3+6zfzGH%7!HCQi+w@qH+Axtf6UG0j9P<*wtJl)9 zrf17vCZq?&us@Tx7CoFdlb2L1sj=z0(H<3_YeLYpqjI0(tR1f7AOi6HN5~zzCt8U9 zyq+Mwu}r`SJwIqED;vzxI|H8{P(*3^bnsJL|@!^GQ7&m{P1#q{2{ELcS-0~{^;+c4nuQv)IoOn!XqV?Rwq-z_l z2W_J9a<40^0l5Vr#TBX5g^{5pal{w=&<-Auk$M=PQ(j#T6q%_lUGG?piNdq z#RXXwOU7lqSDFB@2I%(k^72r%8w`Z_@SJ{}*PD)emAC95c6wv?6G2=sX<^cO;88AH zIm*Vh){;<9g1m4E_tTZhjc2T;XMWSujSsbmYYg(bTB)&B=I~|fbv6f5uU!72;U#94 zAX^_uIAK$EmW@7yQ%Rd}`VM*vLri0nOjlc&Ps<|>l-tbvba(d5JchIZ?ZPI-vI34{ zm0mR2IR_9lE~3&B4*y79*T*RSE;%{*fi`FkmQ~ZX@o7tB( zb}||IkJk6`@heh;h0*KNO_e8k(f8!ygTcU-9Wu0;)|#Wy&~iLI?RMcS{gbT@47YBv zaC7%US@&(@X6@{Le7jnEA^m6w^X#DebWaC-V^f-_n?&VCE8eI3364~wncdOut(2P4 zapLnzC6a;qsJo<;?4JAu4{#KEo>4EC1@>Icef%~GXp23L7MP@e_*&cnG5250SPlfG zF5OAc$hv4Avy0s4T5zHcn&mvFctNfvM!{#<3(}ZrtXGTX)70MY87HhTl|4MFf4kMk zYxMvw3Q`aGr%zK^zBAI%i5X8QCQ8`Y6vC5?*EH?zOW+mx7`T4*Jn6~F$!2!Yru}`7 z+K-6V8UsT^xAwU_S~j*0t+yk~&4{{lSQi%sr>CbqhJsx3dNSm8vm(v{Y=1g%f5_%6@|gaqFud?&Wv9=ZQlk$wj@*-cfoaAHbZH(7Fi=Jw9IWWRMI@H z33>~59R`gen{hr)f>JV;GqyK5NSVHT`SSGS&<%*HZv(yt>9}mq?2c_uPmjaN{#+}L zd5nNv`fx>!prn-4q3KG$@k*0%v4h#>3_ZXOz;Od|>j$wdkskXL(nf4gSDJ?vbahwq zz80M%RmpDNaMu_w3NZTL7tCAXmONZN?{|89Syme888GAKif=2(m`WxMCv42_z|<$t zuD5hz{I=2?HqWvZv?+@>U)?8G>ePQ`t&Fq=mFbMn8LKbne)^;iu%6AtB_>WzMQHU- zG|1;tz-0+DqXyxYRn@w0hw!d8jK}^u#|SDXDuuyXaU7W$8P^sRajEZSg8Wf#cigL% zg?@R=ZB+$e#&U)fK{um$K!oO5lGBLZ~rX%&@{kYadqH@gW{HTmIWI1*qD1gNcpTQUGCl zjKHj6NN28~nL%wWv_9*)M8OhoWscA@FM{*(I!VrfXGJ;R*}aqYS)FUk+9s0l_fY5p z*y(#(g0!^v08muGR<(9N#0;|jl>nSl!eqNo*ua`NU%emkp>wGy3l7E=-fZJSfoNMJ zcAX2oN^&6KUo^8Un~ia-V*8O^r-RvTJpk8#Ey}~gQ~rKIuNb77CTMzJ%G{&h{EE4- zd>`w|KiW8MX>RTBNTEjN&H(lfmt<`$DErgFk_^mdAnYf@3xeUm?-~C%|KoP{ zR0T%Q;-j^O2$v|M5epS0)MsvPpgG7c#CJCN7%P#e^dNsTh35F%#yLj2uE(JaOl)kj zfPrMW`z|zex-0}2-M9FK2(7&FcTE6Vba!_XUcLIn{ZPgSP%i1bW_1~WeR}LQNYVSFrR^~{2@t%n;6B%z zq&Nt~GooTf-90_RVD8uKtm_C(943yMw@FX>0t(6omC4lP=cq@m>#qPL$YZDHO`Y{s^)mCQbs4#v zuZpNP$0tRptZI+TK$vsM^x$5C6X-#IRl|V&uO+@ftM7Y13Mt%h2GJ7}$M?nl!LEMw zMF=Upp8TKAz}o${bwwTl-g#!p!X!P)<6{)b7ue1P%hn3FER30y;ovpH+ z*3YJR_~fLJjBYh6uhdOHr#6OjIz=&*j>k-~gtNymjF2RKu}FKF!;E^gztk6jK5DQl)DC&Nn$&PLiFZld|&Qf&x!3e3f(eE%*Fb?_#3KD?)> zq(m^^8z`p#2tyW3>=Q;FZRQJu);|dv&%GoT{1zM^&U$#nZH?q@1e`@cU~n12#l=-> zIVu=u(@2(|nAzX23M)1CBjeph!Q?No+1Au0Zk=I(XZlIhF9LDB)B6R9;52~pw?nae zsc&p@U${wC$3b2`#$ei1Syyn4hOMabWx`ACNjc+lVJZ^u>eCmpx}uH1;4VOeGW@d7 zsheXSb?K}WaDoOzImh)cB|I>dRaJMohN>67638u$>91=5aJGqzg8br2jKp0w>04;T zSkeg)omq?aXr6qH_H$+6BJ=U{J1l7-QqxXKkqB8V8@AlkL~gXQ6QIqgyhZy%M0<9= z^q+@8JYUqp&gbeuN$00!6&J&?cRG&@6lc!)hz-HDS5YtzUAvm;^v|Q{Kq11`Dx60i*0I zbw=Kw|NQCe>#tbLWA9Cy6?vmR%i#_#H@0JXadzejUNgAQ!y?>>>FEy!i@d&5>tLr< zDsd*E2A{MV>|M~l3aY;5dJK{*|rIC!)uHi*6*)+X+S)YIL#fg;k@t>2ii=C^O> zfzR&@TkSeGmzS3V98<~N-5n-4j9)8n0^2>lB2e$3H@bG4pKKAqjXc50sMk8D@3k}3 z^YZsg9?(oSHf*W*ttSR#?~94?fC8D} z+^Mb(wd1*j1EA3b1T?`i0-i_CQGEZ(SM=$zfZLrs&IPQ@1^q6PmgeDP(2J-tuFbW? zOG!MIddF~4MTvm=`xppKDRDDiE;Y7)f-M#dTeLiqQw9WV)+oNmpz$F$I@;wltGj*e z`Sa)R_5~*LQRr=D{01pF8KgX@g^bRy)n}s@6dXc)6%Y{6HA7={o(|%B>Ph!DgYM?d z1|>cZOiDGhj96QLId~-)5vxlpPfJUy+f)XFx#e9hsT(~ou7nkoCk?12!WS%KD-o-j ztIKmw;F?}Eheb8Dc8dQy9Il@yYGIhRGizlB*n=zs`TaY@W+bF92Csc!0Zi?pVXCJ4 z36zcq$j&wLYpDcW44(ioxbv}+^3(Zq+N?_d^3*@M#);Y&4z*Wyy)487&v}Cy0kqN` zpepvD@`q}|$lHcZ0JGl~q!$(*HM@d=Ny&~zMn(qFG4=HHgjN(36zEzAQ?!-ziUv0} zibLy1@>cV!5wE?MS|f#{MBo=$C`}Ucd(>jS%jFy9|sW8Z&zpU$cRH1h3!t#k1qzy@{pj8Q$Jtj0RJkqf zE5}AGKm-I-RHUUFR76DS5F|yUyPGX4Ag!e2MvyM)4waPd4(aaB-@KlCzkBXEZobdo z`OC9W_WQ0iYu3y&&oi@F`p4Oc!h-|yv^FKVh}x!1pTpYOM?hnHDc5Z!#I?MV#X(VOMWml(|wXs5efl_4P5 zMyyiX9T{_TJk7t3Ue~;{u90F`qgP=wx)t~(RJzK+>0_U?q2k1_o~CY%c{55sgS1*l zj+geEt`uY^>zA<9=hmS176VMFWF*Q?-vqs$Ai~V%3?!^i*J`?!YL0issPM$b){v&D zsVTkB$jHb~V(jf(tt)01UIcs-#`+ zq4sNCHsC3^mG6s)cm~j`q*RjH3L{nDN_oRDU+CulY z#_4mgHk`L#VyUELeE!_HaqQF0;gEOt?38~%!x3fOx`odxD`vlN^P3 zi+DWbJWzPs#^#mMwRJc5J?k4(;YFBANQrH3GKI&pP(TteOU2z9e>8|Pr7>2(v>A_( zRIjU7_|1R)s=t0Y=&hle);UCOR3wx5WYU?~Kpdpbm3`2hV!HV;eH#>EQX$1t9uK!o z0bU}v*W?T{zwX}7IN$VK!F;>zd{GCmGKxzarIZOG4mlpaaQk%|bV8gnUZa$X*&s<6 z)4e2VW@eU}B_$~d@!h4MyOQBvMYiiYdpn!q0EEwWWyJR675D-mywU|2d=hU@0c)`B z()xnB%G}z>ynJ!1=8G3rv`nZcb5P<73JOxz<-)eh6J5?bd-m-7<*x^L5)9^8dg!Fz zlW|?kehu;sZFjGLF}ZBGlHXC)ko&)r*6s>*6n>G@<$r-s1mr}Y5aktd&_r-dbe59$ zu7fvGFxJ_80De^8@QN@S#0PVk7QVW?b|<^)66ScyISc{L=O_ssh#!fHS{&s>%7nc7 z22gcpG(6+Nukq3?b|DU$3r&B;rf-G0FVf=TgDpE@0vd1YI*Qk_>ZRt+Uy{H=s0nkB z6Tsx^Z{9mNoIUyGiIp5&T(P2I-J73#b_=bSR{~3Rq7@Vxa5MIyLNwPkv)0>VH&!~? z5?g>3w-sF0>zkvtKG%oPDLT{Civ07pbai!$oQ;Bjvqra~%}ebDp|#RBTk5wKRhunb zEJXKe8(iPU_Mv0aoay(iHtJ!i9iP2H7b*D$r2_DV;v=)G(i0O?u_c6zLAUT+TJ^{s zAy(S`@dmsL-FjU_#lm=(@gl2*Og#o;oq~hX#Z|wIDd&^bUn#iwLbS0`;jH=mf(a|f z4N;&MwIXsS)~1~GQ?@}q2LFfE4=2ta_TpNBHWP7hLyY5OXJu(a(-qQb@5>z5mM>N< zhRxdvzJFh&a{1H?x{52eJFzqdy>q^uW~HK`@gpo7n_h(Nv7*Jn#%}SWl-f#CD7MS% zI2XqUjkHtHM_EIX8#ruOi-DKN6JirauauuDtDyM}IBcV4oNdiRdkUypw8&U7(DV8@ zYKM0r)0;LfuKyCg*7j6wbT?hSW(Cfhb?6dOxHRuj8c&PK$Vf|*&=~xXqxLzqrN?37 zm+dtULaHC6-!avVI)mr%@iO^F^=XujR!Y{G38!8{!fDww>J7>#Su7um##sLND0%bZ zx=v2Ms<2j4j3;^?aB3fJkl)XfiB&Vc3Po5XgsKu0!jE|`?)Y7*)I65tF!$jsrkv&3 z#!0Jv+U<&k;w+y5U-p9Jl=#{M0Bz@_Y+(DBJdZ5RC4EDF9q zijN_@mA#alYwrbU3HrJSSo7I-iY+?0vrvNjb31?uJT;jPta^TE=?0I(N?r|PB_ zJt0suauhQ)yBM?zW4>})+uPf_TBrMG3ih_9ku2TD?nm!V!79>`yJI692jX2u!(U#s6{40$`9KOSsrMpMW!Uzjpun43raC^wolnE79WlIQAs?;A8p?QV^Gme$A#<%M zwBMap=@|CrGq6*8Z%F@$;^xhKMHQfiA|MQoeI z#5FuLq^SC2tiSdU@Dp{Qp(;YPvpac(@moOg&eCI`m$gjGLvQ|dGiXC%o6LkAK&ue* zm@co(oMem$S`Rk?bpHwAL65o5_n%>kmO98zIQ-G_>&qIQQpOTV)En`| z+3N(akL5RRoKz3ynI}v2wGGh}mV?_Z1i9TpnFa5jDK_j4A>93yzxadx*JOSWoUw?F z4k}mul$Xdce0zRuIu~(Jjr?chHLWV2B{y)MIQ^zngstxr)D#V`#LTg1-n;k8V!nU& z6!uhGSJV&G-Z*Mym~1~+FmW}uL)kXjVKq@@liB#`)1Vm9uqmnB_hPvDl&4OfH0BTq zX08?~9|-2#9Kn_C^sBrU)pA;%@mo;sZ7LOSBv?n-ep%du`Nv7E?y&V5@s|z@PL+o5 zbKN`Px2@^JHlLgG@!!DWg8&FY$)`7;=ltq@Pk5Lib%8CWs=a@4Dk2g~7q@uRr<_ed z3#IXfSHEfRYeQ>$f4BdQ!Akw)GoXr#|8%=z<6fs+7W4Mro+Rp``LUeI)|MP0*7X6x z@N@o=gt!-pzKCH%AYF%^jQ?dF_nA1fAR)rjxOn%Fqw2NN4yMz-6wg2DtAXIBJ6k_w zZ+2i$1O<|ZV`kf`s;XP%^bQ{l?S_f}$Uoytej5)9vD{kmtZ@2wXUmikdL$YvqBhGA zY<_L>c%wm)|DmjtPs1w1EekUEb07;6o^H)+Qsn$$j(Py=9KYi-*+f3(TjVpkvs+ zB_b*-BxjK}yvVaL_<9SNgD)J!?IXOR9|wLv-7);%Q0T9~tG|Et?oonC78iXF$O)eq zbSc+73=R&3dX5CB9=|_|YHioZ&@-^a4f-L8kN@7yGm+!TbI3b5Eu0}TBOiTH{@^C* z^zcCv;?v2nsr=J?k+Nc9n&H+D&QzA#+4@C?s54>?*8jeQe=hp7Lr1!MddxGKD}Po!eW8a8twMCl zP2qAN`ldl~Oo{NL*O{7_HS0*I0^Xp=)-hQ+3DFgpAQcV!*YQO>%vjsY(m_O@HV3@*w&p;l?tMA16ai z>ziP`Xi6%`b_S0lImS-7CDh1lR@-Ei_DPdZzhe9&A{@16t*)-VSwvx4nu^-hg@LBi zXHgsccZU0LIA=3kR=~T~Xe`h6_-oXMP8uW_%#N*!hO(zAgwRurHMGGT(44KUj*=88 zU8p#3Nc+Bf7lF8SpjGjh*-Ew4?`M1usp&Kn)J%32&R9j3oJEc~65GIO zJ;v?-?MEY!)?*>@g(lQAeC#&9*lL{(A)C3y0 zWx3&$Wk?VGHZ8vu(o<2H$c~wABUyDN-=2h2jPRoMX{TSh&%XjHe_Qd=hZo}vZgu>b z8T=uj5|`W;Oh2xtZyuL)9i)6nauA?V`Hq2O<_yP-DEUIQcJKbI+O<)C{qNK%ZDqy` z{Mqyuf2}P3^Wq#hqDNBkLxED}#1Xp2*49tZ^fKtM1aNUH*xcFiNJ&%T;%;@&E(8&} zmQLW_e%(?G!iVYTK;=`nPJf3YAE@s-E`!1%;1Fo0rVvO6L42mBbV|lgX?zJe~G}$|8572Cpid@3EkaIibo*YBDduHchh`diD&5!-bp`3 znmVO2XgMs)n#SypM$K!{yH;#{*Lrvl?ZsC2$f0({X+Vgk^ZskRM*>uvq7F^a*xHP$ zZ~QG^u9{TgW=_X?*Vwgx_EL9rrF&NsQ?+YZ+Q7%E-@URE7!JeXo_&Y5^udfd%@1G_ zm=0D}R@NflUgiE<#O1j{0P(DJ0j;>u;;y_BX*Enda0!WFqr`uR5d?>GTQfrNpIVMI zasN5N0Xm)$Sa9s;1DzYh2V=S5@rGy(tsg^!&p|tdkLZ!F5hhR8`)v(pde|0@uYzvb z7-;f8=QrnqWN+x}oroVPvk-}XKW?x)c$|-&?3jPgbeLs7;r=Gy{`dzD(=nsRYd!nD zwnJP4o1!Dd(hTBPqz(Gv|3wUKr1oe=IJcAruWRQngkw8S?N^Aj)6cJzkkkIzL%wvB z3P!Y;6fLib{B1*^f$%fCBT!26E_nLq=w-*W^B-si$fv8<%HGHD2u-xCSxrr(4wnU0 zymAcH3EK-9jNMAm;^8QIX6~YJYkjv;IG^2(u{wxbdBeV{cuFbbxx-ElpS8V;NXq$t zb};_1fHd9Sp-qR8<)?04Ig;@5!z1`>&ytSh7Zi}MAN>N<&IbU%jqIJ(gp!R_Gjn3$If20LJ!&AvQQaIEPAlI)A8Gx_al&W!!mi=Y*dn);|sI zFjk->lx(6~S^qUX>UKL8?xyVv5+%*yJAG`CKm_B?WCDvYpii8p%@j${Abi4SM}p-I%x{&pgYEQ7j&zECGuw;lc82McKgUCIH& zCdRLGU$|n#aRN{2Oh~36bW9eua0C{pT#=3WUg#~`>SXP<1es0K$Rx@fFEFZ+hQ~!Q zp_P#W?sQ$5(9O>sWRs%}k6Ghg|9vq3w?6%-uYoDuyd9X<+8k=;UqfbRF zWp4%(Q%CZiM1}~zovUBmzFH4M&V$U1NA95+C!2OD$f^zFD^e{9FTmd1mF#&d%%*!NGs`EiEnm{QU0ImA(MK#{lD39>%sv>2+lP zs^j@T*X*yKMB!_(TVJ;z>k9{cqkHi1@dpZXk&E7QiiDmx{_kgnlv*t)={{Y<4-)?A zb`p#Rl1rZ76~M#yREcPdUSK-@D?5PY`)~?uKIW@Z=pEb697C0zl~_K{<79TX!>xu~ zI}0ogL)nd=m*Q(M2F0bMcxCB!R8)8p4@2zg8CW=SV>2S|@Nu2rZmRD{d85&H)|Fhu z)@LwQKaK081E+C}vG97J=&CZPNTsRp1ty2C@5Hn3f3Q`0OCBa#&O$v_8L-;67)hMs zQNFB+ADKjiY3oZbVMC6hHdyn~PEAd1=NsP+KeWhKsXoq!V+pc29waRuZTc13SOt?oJ3hV+&hf%s zv2tx7U=2eeYGt>Uf%f$B@_J>E29vsqK|(yvR9r zx{jRi@Gh;+9_y4XU?SUDG22as>XSSjWNn&7wm<}IXswSqdE*-fD zsYl-Yzzr39))3FL_7nWbM20D!2Bf_nFVdB1qkp}a0HD~yk>!wg7$5atF8VJI|K}&p zbjZQbB(fZTjSlDBnr826XI}Xd_<4NiC@uLJ$457`h5vOlV8X1>Iq9p&+@t9ybZ=>= z17YOP#rjpX?@mTCn5I7n-v=j*yO<)}$g;Y&R>9T3=(X{cp!l`{b#IZaW#NPXy&D=t zc4;=NQ^pD|u7BRSzhO#)&mGrNvf0Jgw;u_U64lp`$I2PszxWU}~fP3=58Ra4DN+B<-vk*>-2{u{&YQ zz&&{@TQnt;259&P9^19N8q(i>=4Y3Wh=yys6;+bfALrxgTd-PrVOyb!PmVT`f2vs< z+IBHkK6YKd5DG>r!K41+AJ49`5a1xm*I;^t4sxl1pG~2mIjAsSYa$FHNa?~>vg9*D- z;RZG|7G%5Tp?bP>8<&Tl8uh;lVE=3=#JfW2Yt6brDS7m%q=W=#N`Z~49{5())Kss{ zj2di7+4C5!LvYcoMPt3OU^0Fl8ii@KugTj3p;(Aa&bn z*^Vrpy<Qs1W5W((E8^F`qjPcE<}7hKEkH&WNj zw-6dp=EAghhq3HoM#t3S@+^WAJHvc?!`RAaSIZ$%AC3tza6>NIrO_Jl_58v@ZW|k$ z$mC>NQ8X_JGMoGyQ}2>1|C7y{9G2*?MvpVSe;&#-J6W#}i`zjOqXUM9~?J1>Lt0-d@`M z2|+%h_IF6hViCR9f^rKino*aKWL#09?t&1+Zo6l|tHExJY4cg^M_~Y#i1Sv9X7vq; zA6eRCUS147d=SeqAJ;{#p=lO|!e_I$FRdI+CP#HyXvj0S%05t2k_ly(Ag%5wQ5#L@ zmNPc5t8({x_TpxktNL>L7(cpQ(b!a47~P`jPfMKOf)8g*K(R>A;KEKO4cShnosbOL zTnTeHbQlf*jFYm92?c?S2+f2XPQt3nj;GY8r`WGKdEAG5<6$dZ6azob%LE7{RREMogFZ15(-0n)2||CZ zDyMshivD$bPYG&2!rlyXr1EbM_1{}xI!e>n-*s_&n$(krrRTZfY&`^J3g^7S=Y*Nr z8r&+buYl;8F`tv^k9K_?4xa9w$IrVn_g)7aqq%)YZPdehvosdcY5(O^Php$EEJeA5 zTfSZ*9m?nV`WzOHiezZ`Rn&FRdeTD5S)m(v|EcyOcJ~1@a6BOQYSmhyd!0rh7Jj*z zu4KtS74Q0?sZHOa)nr3Ys5Q_dGkLi8mjoJI@+;hr zmp2$IAXBqF%{nq2(jhy}A91lE#QzPi420P=L*4$^*TP8qNnN-%z zVV^$T*w)5ku<=|R%%Xnp$NJmTeWgSrQbxsRMDL+Z zzKB)IhtqZL)rV5KJ73?3|CmB0FINAyzyE!pL0l@4@d@cakLS%VVr=EXH`vOPCNl(! z-|;>eyBEQ`{(it!-JNP%%iQ%pYxg%7DQ*Q(krP{(Bt8sZDK&20Z4#pSdH!(3Am#-B>fW9`_G%`EEyOJ#U`b0DD};zRA#zq& z=Qc3c5YJL|3akP6Ma*Cjz&>YgaZdCE> zQvmNsj)bVs^J~-wK?lPz@gR^+iBk*RWLDKicI${vSGsx>r6JAVR^WeqtDF9%UG+@H zgPl=`*X)$tHZn6q(^hR26fiaQvQv|7waTfSS5i`2Mqh?pp{Z4x5Sw6?OiqSFO&Khd z0W9>AY<&qUaxfVp>?9U1w{H`+uxB*v_NDZ7rr@fiNWgV+)=|x?y^x~t6t`kjDM->-S$p>pf zs6+Lz?~5CAyV7N>tT-L(3tO3#D7u7vlg}5rF^88@_u$C;<$9_WxFR9xA{4*U280k1 zu|D;EekG;8m+Hzub2-zQ@*bllV%P^?i9xxVj|lXL9CdSwGx&Pg5S0q@LokPp()kna zl8Q@9yVfjmxrRX5_pH6VX}JhPqTZU(qlsV42m9A$@!* zilzv${b8mk@Ww>b8kJocdpE0s{%pDoGb5CQ?JoM6Zp~hb`1+A?&&)H5tz+@*F9?0?INwamc${!UJ zYU@pZf@$X2>RNvspY*SbiHg?NU>^Ww zuv}cYNB4HhKh;kCE)@dyY%V0c!h@10CJX$fV{xpJAJiM4>_p8p*-f+*ce_Mo6ck+{ zRZJ%sR6q=ASRTD6?jK!9;ye`l@DGWQi1K|JlLpqt2^knuo`_>ePasdZ?VR$}%v8O+8)C#kq zd$=RunJOD4#<#&_Gr|uX`>&@>$knu@QJl2@ew42b&CVUua|><0=5zc`w9ow*#ZIXN zHj>F+wIzDXkyjB9OkXfg=FUwyrEP9ab=vTo(|lkuWghjo@ylI8o=5m3L{CyftldX` z`{_LV?^ue-BG)k7Yv&8R4cw(>Y1LTT1k4i?Lc@ocB#0JQdO_v=D;4lg1#=LHodj`$#zMIjq%Cr{9Mx?p&d7Y&Yga zE&zDai`!m2m&5u9&rKnAP~UBq zE735P)j#v5 zb8pAV3ET0z6VK>RCzp3F?jDIOZ5W9Hn_|!#KAFnzQ(uoi!Pf~p;Hho~gXy_i2qs~K zw=kX!I@ddfhK8-#mpvdMgY0uw0<{$Mm~q~SB8X}J{)J9IPd!$i-jDAeMPDP^@q5EN z`V~;B9~~3A&5^>KDJogFvi0Ib>(yae@B@`TMRJS=cy!7GG9{$iRwDJWumGzLu zSdk7oS3>n0PPY;3~BB1?GqUlXMh40g>82*V&8nduHm5JS@x4Ye6b~dba8(lK^V#eW`k3DyeC%6>4 z6-vt(si1M92a7cRZn-vHy`AX}Pitv7@!iM#!cqgOKYOC6qq;zq*z0^en`ZCBUK!dFS^FzYkdW)UaNraDHBfrmt8Q^gzq z(_9i^wDIz4+h${|Cs<*6^U$kSnFH4HnH8+1^LRAd@B8PdI|`NYNVW$nH;^V6nHU>& zC2?}*mKV?-s@7tjPo_J+YQ3!eK4Ix-+RdN$i%f$8?xZxQN&-zEW5T&OsMVFXgMeHU5kf-Jf2uWz8n8>A~Ct7Q;^y%Y!Lyfiw(t8?TTrX9EM)j838LCdYe88 za<^l-r$WTmyBvO$n^ECIS;(Ch@?I!FN)*K{eWg=AphoDCf!x)_hXZxC>g(&RHh#Da z2PhciNlS_5T0;`@9Qq_%zm{H7jrmP8adf48hi#p};c$g)D>GKZ7VKE^ozt3zR8zAg zDH0Fy_sy{PJTZ~z^YR*E@8aa4qN-PHeNu%j5Huvmi&DCfwOtfCp_!=?>dEzjR7}25^^9aNQ51&Q_Qmg@N*S>^5su zynr7!UG3seyO@<~PwOzh?qDc99vWAU)D0?t$w>l)g||6?*cwAm8U)s+m@-Lr|E+ki zRQ*O{TMHqeTu}a=eORw=oWJ|x?%F_~zw20xo85wxn!SLTInEx-Rz-dS!%4?rWlp9P zWix)>jmZS3>8MX}W}UXZ>8f1#(3I~h_vNyIe*^;d` zD^L=kRG0m1`v9ItZN;L9&>LkGblnOmv$1@;+J(a;^SN<#M}HHrsdx~vDGjEZ$%Eo; z$>cH_n5Hc=PB6D&zWz}R$K3zDEpPC!wC>YTeQMA2$!}+)fp-;9`!om&TyDB4xg2Bl z^4!7Eip<$@w@6fSr$r110DAlZ%~ zf99xIB~`b$o3=e+_zF#=;C=ChbH5&yyjKG^*-mS}vgEt`aW`gG=?AO+tK-UNHst&$ zz3{P9YL4?k==42S=5!}2(-SM=0kzfvY0nT+AY)&TjG%s+W1wU=_y^hBlAW58nG6Fl zF|nO$^o~$3U!kF$??8=g@sj`DKzq>bmjlh^I5KVF_bm1`cIc#* zr+wC0sNW;mg2`QRC-&}!fcyNrGb2eJ2=2wE@HXLfM^&9ab{G}a6vu$I#pYKz)AE?> z8Lm!`@88 zP|5!KJ}DU>5ksJo)C3CVdJpOv>t!f8?@gn2Q-5wSGAu=D2n6V38;vf!%vyXT2YlzR zel(#HvDumi33_-i$)fP^+=R%U=hlRXz;yK_r0MuL=J-(R+SqB-r_)P_|H#Oy#KzSQ zM;Fc}C2}^WW+@K}j0ZO%a0yRkIEto!ntmN--1O&ySQFqD^->bxp3(rGvR;nVj&eZ| z2M)$cP!YAe`iB=`LxmnIF`QHH;dz+GFx)e+XR$=h)yFSiUjv1wjpo2&=1;|(znSMR zgx+BldXS*qzpuB@vZke%Y@K3^*O1<2cG)l--*eR;AV2k`Z=cJf;#l)P#!FK%PDbAP z?pHRH&Q`&tyqM60&m4Tfl~;NQ-t|~%5S0k@JatFvL&}S8viGH=S|WNlVHQHK{d(V~ zDU2^(bY?5OiZbX*zX1fmh~digZ@`%zhEBAdd7B-pde5XR_9`?P*6N+uF zwv2X+?FN5-=OpJRcKrO z#i~wM85gD~keMU{m!k~4PW284aOSMKYLqX+I1)E#)l>n<|0U=!$)CdF;|0V>Hy}!m zH6ZG{#SHW=4JQ|upd=NHHL2eK#XDJf`S5z;9dh|x%pm124>(;~Y0z3bQW7WIF^iaM znwTQ&vb3UCv`|pcW^R`i;I~%~WmhIGDk?dtkpFog+A2&`6WEYcUc&kFsqJWv9zyRq zj*G+{ba{A>88I>M;0h>Jzty--ArpADZ1nRe^p)Z?x%>){~bn|n+2`;5E`j%O7+M_7LO?!P+cf8JINr47Xha6 z=E>?AemESC9QU;<87&?OQn-zUHl)-;d6ihZeaK1CZqg8jGnPyEg-%LqJj`;~oM3_^ zU&@Rf_D&)t;NnV-@d8ck}ukWZ-1H38~ASL=VNkm*v?#Lf5#`N_`>6O zieesP@k%s(Ih5%tt!|F~UivRg2P_m;o#6G#xnO+&$51+)G79f-USv$N_e-B&h0`p> zuf7yEy13nKGv{K+G#LpM|Ef=LG6oIVO-M=HT=(B> zefT|;tfkZB=9$adUWozq9H=_ngRVTM3+J$8ZdD7(X+ksgMVRM7HCMbnx3N*lM_TDk zhJI6!YgG-RgU3Is7EnA=?wvUGTW021BRnWk22jD!X+`1eDyceYhXS$j<%1e~V zAgQC}N_swNHrCbnCji2-33YqOF(XYhx<0o@F&*%2jvr6q&9x+dtdARba{&s8Uh5zb zT695exNKgDQ@kQAC|LPMeps9!15R2l=un108EX1TGHN?3{9}FmZwBtJw;EgJiEVYV z#~Xm?8pXEB_C)aMQ$b;LKdENDFmWO!T|&tJoDxb`Gb(!t^n1ly-#-m+$+$nSah|f< zXz5KLB&Sq6eb6giMM87Nw$?ehr%wsdJ0lds-uoNg!-o57hQsrFbMuiOKdMUVZ9xv| ziew>34152kZxEht_QQkr`JZXF6JFJt#;{gPFWSZ=znIKC1u zTq9NB$&tAtm9NFNREM8L*Gka3kO7AmS7#Z-s9tPElOl`RIYqSDdR>ek+cg;e`P2t} zLg#cx(^Bwq(g41#mw`J^7FT5QaBK93>;{f8dM#IeHK5RmM*EVv?TY+7EUW9%=;gEP zLtpcl)RqAp_X$N1*y@g=y)oszDYthlsn=hjkt0ejP`Jpu{&m!0t zlf$CB%6(hg?OZ*pYM&d6AlBh_7`JqwP=-5A&^S${lQ582Rq}WC%F!5?z0ZO1UyS!S z6ivQueuX}513EWy%FNHtLQu1Eq9ih0R5S8c*b%-PKg5DoCZ_uN!kC25VzeS8~| zNi$ApFff%1vjLuiDjH&&!fPpcW5M^tqDT)g@4x-%cJ&d?0xp1|$eZKM@HAs?TxBRe z!&PnYan$QhM0^%2La+d^8L?8hn?y3ntL5APpnp);#PLm8p6XxqJ~zf2nbOO zVcM(&bUFmB&SqsrMFqR6h4hu@xe<0QkG$55Iub*4HzM}Edw6p4t z4{c0t*&S+KcCWh?hHt7t>lJM(Ti)hw@K${}#Z{V$K#(bYIH;n}IMaH+>KGpv6@j?F z5P^WPgtjn!7$H-QM&-BB+oyI#>f;^4<8`vQxvW=+HQb8Hc~f;B*EmgCiMt+EFFNR|F9nXeN zZrs-$>hl%KiQHTsubj;p20Py`QQBQk`-2UoZUu+o)sr*piPl8>if(eA)14J ztHQk*)@eLQLEyxzlpDxIl58B4#$xhANcHfCzPa+&0=!+Iqu}d%-q}8=H5Uef@A$Q+e->p+nQg# zCX^$&`fSD812M6IyLy{1Gd1~v*q)b~zTRi}(3dB*WoR?w`Lwe-Qq0S>7=86H7F%lF zTFUCt-`@PMdBC$;Bn%@={(6w>{KpyjhHu4ua&z;G*FKxcRbFxqle>2Z2KLUT+3$M~ zdDZPRTzo=DEEL2*=&PVGXSWk{-1lXIu%c->t?_i$KI@n)4T}8`!x1eWOS8un_@@1tM_)nC^VQCtC+t!A)abGU?oMVgX99stIJ)7$ zNXJ(s;x;-`BwI;={%>CBR}t9%7+L8X+^VC?+_~E*{1-~A6lW@-eE_O5M0q7#E^DLW zRV&Sl_u+bCyr-vJHA&q>@?qSd^I0U(L>U6 zc8A@9w2+=P)j_0irJGGadolA>LyPQ}iuT}6Q!`tjp3*`T>rK^}kh(Q&uiyU)7Z_O- zWLwgHNKm2l?uLH=UO7$&jTbEtHOHm{AB@(fA6zhaS!#s@74uxr9DeKC*=UMVp_Dt+ z%2A&Z?zPcP&x^m`lZ+ywzNljE<-K<4aDX9#kM;OuZ=6m_Ak{CKzj(gEB#RNq~Q7STkW9N?MM+2t?vM^Vlq1p-egoF0&*(+dU z2R~ygGx>*UA8-z&gs%8M!pRR)av^(rJbY^|V)sGwM~WU?71Utru($HAtOSGF+`~tT zJ`9K@1kP=174D9bxf8Z%DEV+u3|&{^30vHqMSBZ|7w`MfrGjhSs6-V}E9cKy4o7$F ziJl&{=W4{_D(FFUd5G`qRLEfJ<`lzewXf1gFkm?qLJ?2=w7rTy-ivCwaA!A?)Wpi% z{3u4e{6&oVq9Or+|H}aT^YU4c>9OzIu#17=SB$N;rI~+Amn*VU{^YVya`jt$>Om&% z=z0?>T#jy7B zhkUx?)x;9loZ=O+O0#fR`NEgq`coupK^cHfo;OD@RcOviIVQTCMKwISvLyeryu z_#w}$XZ&}a)%At$g*p>KfEW9GRtEbTo5nO$sTK=s*pAFf_VSzl)gk`PR|<0jM2tHh zE_ePn+`_X*C;ae4+;ONKbW4jy?pbyhbRU~BO)-{V^pHzi$p0Y2%9U@+8O3pOnxl2$ z{>s`~_;mC9Fkmog1sb+VPt)TEcyT|f?xzV3#YdHR0e)tEjb_{@`G?+k(8iEGnd=s= z3w>AUuEM0ahb9o)n_W>U3uDvjgvbcG+V&9=VgWUE+h?o30K5z*>^yRGYx zt%M5Xfqd_CSB&!DJmvS!~glh!*Dy#*^G)LM`_(QO?`Db>elkt)RJ({o(ls#YwJLRsLY$&<=xwHwLNta z%Y3;l)vJq%V2wkTwTC3EZ`L?Hnd2mj%}pI|vWW~86aiW9Zcz=@k%YtoroBl`^oMhY z#mDD_=5|;5RQ8p2LKODAtBzg`;{t}$W$o@|0ROVt9rE`STjEFy3fv-r`A6g>^T1R} z-ON1tcydRpIN|#v4+L};&SlnXxq&Wy>S2 zGlt~UrZ*2~P{)k(qgg2qORyeS zgL?U9Q@yZt4CAoOYp8@v;?_eLU{SuE z$DXKXv9nge_tr_KW|*sFugAfrXkqC`S+2=uGJPTYQI%Qt1>dSZUss#noN3*~mdcXz zJVmcv{BiPK`lU0&LrHM}jFnTtkCm@EM#3oP;++5eyvv=sRJma2{PP|CSVP7C2s1wG z$H}k~ThmW1PRSe}TnsQ>a>QSU%jqeF=24BxY_9 z&D`AwHtw>v%T~1MdYy^a;Pm&JvesRK zaza2ap3La73$wweup@UorR&)-@4I7-2)ya2MdfVZ|AcJ+`yXzfa&-0;d_Iuoay;pF z$#ZQQmCpvHyba8Aq@Q^@sP9^uXHt#*;I$Cphh1% z<6)V{?3Ue7Xt5Gmc@5&n#8}h97_toOf_(5G#n*_Y5)nsYli#Ao+A-Op-eeKO9Mdo{ zVd~`*shwu>V+;Il!VR{uZ_QgX6TfI7@t2!u7`d!_Cq#Z{BR|+-A<4{Lf+Fix-Q%-T zErqB?-pAu9uEI7GDwayz+X=UT`)euI@iNPsb>2Oz#gOoGgB%N8ToS3|r(4#ifZ^5uDW0TQ+R0(&g!+=Ht}0G>L9) z?rdYP(E;?ibekcUwgwtEuN~;~t{7Dxv5T~&7w|9(En$Bv9^)kh;H-{t@uim$2Zk2gJ_hxb>UVAtjnsKy>5WA_>`Gf!s(DOYu z|Kn&nr4wzr`n_d;_acAUt|KuBIz(wguKzoH5NOy?(!1R7es-47C|HDA<$QUN&*AM6 zq7X3>;>-rbC(W9wy0{T8gFSLOypP-AwL8V+&fQm5)k&X(kcYV)YZs$@P^Jo^*)cJDItDLz1>FPoL z;-59|OTrNl?O9yjSbW*n951YwzlHzD8b^m!%@IJBzu$UKyUI6@=BK+XVI>Yz zdNrHlfm_v0G4}kyCe@+-{8Ma%Ts%uFVid#a3qTV;i! z8yfy~q~Hb9G78A^Z8(q524WWw&Phlz>o;EIKN&%kS#^T*MMhU=_P)z&uTW)1rhqyH zF0GR4&W}*vupn`G#z@CgGA8gin0~Sw`fT`JzbmF5Ju1PQ%n0NX>{s89-s9P~nt1=M z0X50-S5x{&B;flnLhC|uo8*CS`=8hN^PMa1B5X8=to7rs>kF;h49_Bdocg3A(RUz1 zl0M2ISgHPWeBVY-v!QVeb@zuvF#-#3bM0Lv5+xyuj zlAYtQENkWsnI~ah!Z8#!HjMnlw_hm-ncd9WFMlr-czwKatZVVU$daYM=TPLz2@U20 z$ZRhUIWFRT2u7drj*ZLvo%fOw9&F<0b8ngbSE^_^28OFxa`qAXXNU;` z4gJg7dpGF|G%|R3jTyoh_WVR~_^KxL1ZBC?3s?>0Ed{=`N1r=zp8^rJV6-tS_JM!v zzmMKC$x|JpWiw9uv`=|w!_;a)XrhKBQT%)thq%#?!hR@sk zt#8UVWZze+`8Y8(Igv5oM8GsDywmUAPY_++LZ=tZn5ye??hQ*5n}UR~M(b7ac>K}lf@x14OrLHqj!mc>n=l9 zWQK9cDxhsK@AbJ4kik_%^`h&fY+`(t#n)u*TNWeV=e@pADm6f2mdB>Dn^5(M>GfL6 zM?**I@l?-_ft8ZV&>-yXx!Z3Nfg8m?!Lq01g-#*f7w|AS`t79S_gy7C^+!G>-q6|P z4vHL@cb%U#jP;aNNECcbEy`CU=BkV^2QFY56;fa(R#Rd=d1Tfj?~&##C5c7QpXqsj zq4jRIcgznE$|2>D_gFgQxa_?P(wFu7*#-xPt`BMq%jBHx6ja;C z%1PPHSyY(rXP%nB7tU!;1s##L(ORMMkbMKMTl}(iKZ#37xCi(^V|k$;F|>4ZlR0vL zxL4x|NB|pH>hh$tYVj~j*p@xfu9RCV@lsQe51+)Lvq}xCwhIK5t%pu zvDd{DZkuDLv9kQbXaCh0Z-thQ?D^#youLvDcl7b`xkXLwo}HcTAe*b-!+KSGcS`H; z&KvWuBgcjQLCgRiUp8ByO?0qr=|^oW*mDnU-sy_mKMWe|Hw#?`@;?FUS+zWp~k6uSaH#^#sU)}EcVQf3{ zGbnI8Ne7YXtAQ&u?udr--49oa@JpYS$p7y?57=XnZ`4P;PLa>$9g|UH0K$^tBC=+!W;l0BBVGfhTNh!!HPzVD)6+33WIn6jv0Ly#CC#jq1umv2+LOWwh8%dd z-aMC+^_T-hV#s5dyRfa{q zb$dk=5jh4WBA_TOAxNiEDls74NOyON0n#ERIfT^E-Jx_hLnAqK$Iy3=C*E_3^j?El)UK8h4`bgufrNwTTe9bT-{mzLnHBy`w;aWWiRd1K{R z&A5DF=N8co&hM9r0S-+w1}`xvNFN07+}7jjQ0c5`u|m@u){Sx+TGir=_BN8I2P-DP zFfH3-pSn%m?P2-TI~2o;s*$emTpfl34pzV(2_h7_4~A+31c3BjJV5o7t!IGFRRF`- z{(h0SMbzChlhVF@?ZSsRaD|tRbsc-SKm zJ%8MTU+Ru&*T<&&ys$D5OC`e4L0;V`YMB#1Qf{2HFQ15Hk#xx~)c;}E$Pg|A^yH07 z>4-&5fob7KmkJyV&X5q?x(YiJ-+G?yb$*J&)ZFrE-2@A@Q!ThDKd|7~v2DTSjQw&aLzt;q%&ct9T=a)-rYy}SbE-9ioeFD>l2IjVmZsyz zYtpP5%`8st>+7G!L|NnsGGR)AST2SmW#l_!IS;(Mo6@w#{z>uwEmIm4zys8c*OqvO zWjvIh8hg-l@7&s5fd?#gbO%^W@0NncVGq+Z`TdS4+eL40Qc zYUv#R{d*uS&S>SVn#0Q(7AV1fjfd>8UcQcA#~@)_q<;4RQXyZR(}rbhUhrV^2`4L4 zS8w@PS-1La;$@buF*#x)R<%#qk(?V!ZaF^Nw(=rJ`!>LRba7gkDx@=dxC#03Ph8Od z{m|}2sHv_RYkYj~>f@sWtVBGQ|NVwj9@(9o_A@s2{)!JoZ1|+z?}nhwUW5TcoGmWb zcsu`r_nExcvLoEBuDW5bZ)izZC=Bp*k>iJIJ@vWqM=MSFc}T zZP}bka4?j752~M?R%8=D3?EUJE}4kvz&d(kZEY{Ctoyzssg(Z4j-cp4pvqcL3&Qoz z=ur=bhPbR*5?GnoG`9fr_3sy3lXkMXejQi5c&3qt&na z`u)58vVD5}&U(!@$z19YmWSK#2QQ=?$Bn_qD_5*kw?h|K=%)R!TzW8Mbq=T{!o&vp zt0mnIC)oIi^~(Gq?7XCGo}S%ZgKBuofHeoD@Dd(2859u`iwo-M$)=}OurH4!GQL?Y z6wwrFYEJBg{R_AmSs)qJpI88Y7$k+~KoUbcYq}Q-&;SPMx);QGawwAbzi4^R!`C<9 zyRAoG&b?gfQdFfi#K9cR7{r*GSy@0V@l~q|iLyO&K2orKrmbK28Z%EAazC#L?8itE4=w~S6Kagw^WEv~pgsNGw zlK#8jiej_RH-K^8la$etLU-L{b7KZrnt4=C!@=jizQ>gAtL4Z7gMhr;e43S^5%%Ts zaEXCpLlN9$Q7wIKjb?H__T!@UxfN)%WqaCt@EfE_4eiDd&bBEMQC*3p@t zWCRGcHQV?Me;7SAU7-+)y0IB8)iF;HJFE|OxU#w~!(B}rNPFi`r_SFvzZ&goz8%yn z0TB~zn|xr$9f}EQ9+!vY6_}3On>{T6j9O83vZ3BkQN(?>Z^kG&%5J)Kmy9U;>1csf z&!rym@S^ole}XB<$po90v9q)11^ zgn9>rcf~|@>%3Dk=PLqtu4%S4cQ4pW*FXpa86r-RCxEKEhdI6LW)KDqz`^@kjT)Ij z_9pY<`T6Y~h-+T96RBh|SVB?s@P?3E8OUzE!KE^{&rRW5AV8rfdE7Nn`O|5C`C}91mr|s z!`k|3eFNpmk7qzD3z7xO7Z{rX^xz!R(b9oR7bk8I>i+6~1nUDr;4ZkQ%t-&^D%kS) zS>Gbon%usI#=dDGN+{2_9CFBuHWwXTQIId-7%GE6O7ge6n~-KFoA9ac!^ajQIqSzM zN;wq54I-18>dM^kjZx%rPFBO*Kw~gV?)$;c)ZEDp14Ntl4^qCp@E7FLK@x4Q+m`y; z_v`CLD5IJCsxK8C*M6u0n5ysa}q({p~DvPqcIzSm~vKQS?30ND%-aDIfs;dqH~0m#{(L`WR8yDcvL-Jbn}TL1VJ|C?)|L5V>R z_jNu4PUTZ|xbjXpTdw=79#!C_tnZ4uML3DG*~PEN%e>@vhL=&^Esb*;e4D&in)_X$ ztcdsNo4u(FPfZQv#s@9<$UR7HqR&itXR>V9m=f{P5jp&N2f|bE5m`89BQS4ZbnoF4 zCES3q?U(@>WtNG;lk|1@solzTH$m2=w9lEw0FnutPyPkn{Kx-?qy94b?mASTRXp@8 zAD!x+{)3wC2IyBj_NT(N+_>7(KV>Yg_?AXw^t6|CVHTMbQ|7+m+H|SV>F5v=Qrvbn zV@B)?rB2$zE8U_oC0~gvEP7H_a!ed8wRLrA=NI(XT6q@Lv7OkOIj>f?vk$72-M@&^ z|K8o?QgSLJGl{Y}@!!Z^c{ej5sG7H(ermLQa|y%G__TYJ;6{`t*Dubg7l)DSwmM#M zA`s3-foN^y*6AX_?ypqlUf~ z{1r*4%1c+$n-1Hr0(;K+M(Cwb`D!4fmY{0AshE3!2W|;ZAJ5~5 z@bvDE5QCC>0|tr852`ZS54uB28c-RNJEx!Jg_7-gwZIa#`7hnfRph zU@qTEcuJix;$iXD-8O@asp`&~ZlCzv7K*0o^2VXoW6r;63v`SX?jjzBZA@k3U^5ca}&|ETQLJD?FjC+&&DOsup#-^)P`A z|3W@+qDV%Mtgyf+++*Mf)@Sas)gEz;-Ibj&N>XvG%;P%V9Y>m~EM!Dmj;UY-c>zAB zSfh@1Cf}kCe9>Ak&9Zpoo^d!U`a<7j5G**26i&n-)AXwDG_;dCZ|{^JwjT4iEylR< z>Rg4F8>ds9{sc^b0EaAJ<$gXS-Fu;}PJb$w{4WisF^^U5eC*lflt}B{LoQ7+I|lBw zJ04HEJv`vMHH-@9=N2finDE`=W8d-9H{A1_TjF6`mh*a1Ti?f}*#X@0j(*Q%9(4e$ z82>pzhT?(&XQV;oUe^cX3~Ly>9IkwAb8du!Y7Vbq+G2w1GI0)Y9DSR&R7gGrxU95< zUr3>4j^K-Y3DjUNPi42BRQ*Ga$eFyDgZBjO^_^?_xCDv1`L?n`sz3tWh$x3*d??kC znq=0`0yi*gNirpXIH zUpK|K9)y2<(x$Vit=qv>nB`Z`qpW!{s;qnO!uAkPQQwO9Qrh-w(R-Zh+Q9vYo+B&GAP=d>W}|ZS8R|;hGROyMz;9X z40(nRch?EVPuwQEMpvU5(P$j)PLLq~Jy3>sT`Skzci`K~MfdK{Q&Ak(b z80TZ!5}p{Y?P&X&c4$=5k!k)~b*JRv@rga>@tW6a-2f7q-lBr-@=EtP^!2>hbs5cm z7E|r-Fc{u?b*6aE$x22E3OIi6$P`fuYYTL#IBw=eECmL6O)q*npPGw*%m3^ei0B&| zUMq;ZtXM&PKr*d7JHk<%4Xp6a0q!t-1Z>DTKd-N8xnositRo;taCWPSnKZTz5bj|> zoN0GVou(Hm>YSOmIdMIij-2PSs)#XSU{MydgV_l(Owm}*uk_wQZ)4qYm8q)ayt~^g zGBrkL(NJg;UZFR#Q!Xw>@sY2MDD%gu(%Fi__i}h6d;WkHrTKF%J7%f)QbJE&aStE* z-61Bb$c(PCpFQ#0=!O-e#H}8Gp?)xPYF(wpZnCq9GQlmIty_mYJi`uB>OvNjAB^SF zjLxSc0R7U``^{>T=d&2aWF~Pz_|IgzlC@;)MO_lPkX5zb(}Rj?%Ud_m3RGggoz^n@ zee$s<)gm;rr-JI*F65!5K)WV zqdon0bdCb+dtS1%i}9I`M(MB=p9MUl_~Mku-aUy=czHuHo$r2fpExe=g_D5Lx}hX< z%A#;lLHkmeuA^T-Km&8bF~zQuq7ojhXxhWT7wy)2!JEPj!kYHhR307}@ow3a^~`R% zmL`MTw?))xn$UT5{uD2YmJL=tn{DrbFihwFByRWC8mdsvNt;n!gI{a?5(gydebYE$##NThe0x?mw z(=*O8`9w^#Q@>Z(h3q`pE4XZ;HC!J7{&p$`OoD{6gx^oUJUXU(s<7NFHUIz*qxk`2 z-~Ss0j>W*p$$A%P)=$PirLB~uqZZh2YfWikjW?35C`8fwwHcXqeB?FBGWxX5qMS9K zq$yep^i)^hs9W2Sv+IlGpd2HS(Exy?m|(lz>>Mh87JiK`3Kq&kGU;@(|D%31^0oTo z*k(y*G6)c@zC#2aUSgBe_sV=lfifi~f!NMNYGqst@UdH|+J@adk8a=c16gYZsIx^k z2OE=QWjy;znU&U3m6Oqk8$7Ae)%J`pubu=-R7u6H>{J`_4>1Xi27K-V`23^TTF(T# z?Xl$6Zz_))nf(LE$pY^|!^Y$?(tVT{Y)`0=sSiLzr(4ib$^DVPFH?>1#!6;DeZ_Xp zLH~B31f^b|XVd97eV?%kICryfd7JY)b#JS43GUphyq(Oc-VD$Rguqqjs~S=dFsqlS z+%+-j>*|uj(VFtIH$TC0m@kOWrzlJx4hLYN#6_9oM0h$ZRxqdCc0W-6$Cbb0si&$Z zaMdgZRk;KQ%WuqZa1?1ZgT7nr3O8 zGcp!`r?q?dRbPL<`aPev-x&e`Ib1L_b~dIEC2(%H*KPPc?MSf+^KS{inIfOnDt}u` zKby*1RF^bg25|H~OOf7l=v_@G6M^17W0wMBHCOz^Ad6L1qAAiC9uAWVYZ^kQm4Tan zPON%r%9`Thd-+CUXFc5y&>bgxrz0-5%zn3QoEU6?Q^+Ra;^ds%gq}Eqxdn-0geTwO z1TtfMKcms(S;_geiTlS}b4wfqZ6@NN8Z%p1^W-U0F(c7S6n82TE}O7)b*V*T zIz}LA^QOW-q!fxV_Ri7~_pE#o`=;sAR#=>9YDbSWu%fs?6Cx&FW=f9(%rWFHT7jjI z1$XD#tGOPRGS)ip*HtoB^!@#(k0og2pi>-^F4ry8S6-%GLv51SDfkR`0*F@ji^}77 zLG8cNbok^DdSZo(hxZH0b+LZr4BGwO8`*f+X_ZJTQCRUA%azmiSYr!Yr5Q5e0AqTp zfJ;o=*A;_l(|m_8A}cHv(wvpuMk=pOk6v`JH(aDh~G5r`Zo+n`QA?S&un~ znZ857g}S72*Q+F8EqQ;SG>2kbIYs0j3iO`_$_Hcx3jW=LyU zI&99W)e7Ljmz9;p@h^LM#wYzvQ;E?7Q*s+o8j(;^KuvH6%wOUiBGO3bXAgXRDJM1I z%^>m7gw7``ax(LU!zSbf$mLOawb72_@ohI&retJ7r7QLn4vkhv`ABv8PLEYCxT698 zO7{MZ=R`PIB2>9tXNC!R>x9cFmJE}0Q2Inq4wh=fJ+q4p#6yQo?Xzn{qvDlG@c{X zKU^zbVlFqv{VNp5ylQcT*uUW-(4a%Zu!OWcvif?{L#KCu-^6|aP~sd1W507e+P3bQ zZzY)uUwx>?x3gcudzP(pcLqkWIXOwmf#AiX+ZHW!MYVYyyj*XV0pdV?4}_1{&(H6_ z;ulj2ci`-?_pNI_En=Mk>o&?D4B=C|+rdIkPU6=2eS(l(P#CMq=@0VK25AzU{a_kz z>sdXrO^4F7-3<+HyL1-Y@j}>n~X!c*%oOiC~+(anjFYO<%N=oxzk}Q6W#soMPF1w4aThkI~a-n)o_0*PTF#@=Wk~9f z_FDe7%6jX_7^79M(9JN-mi>kKq`PP4bsd2J5yJNB6!6t7o2!k4Vrf>E*?)It z%;MMN6{^-YaHl*l3^($RgO<5j^k!*|UU2DYl;<;_BXTUwnAP79;z4u^WRB0d)X zcF*`0TEWfEoxhoazunqjCeS?%IChz@?WOh6k1jNF=t5QaSn5l-FFTB;j%f%4s0JB=4=;Cq@tVl?X*7|>UZT`P}M)k;jXNCEfBIY5eNqGOUcg)w3kWfMo4GN1-$e`flqS4aU zp8YNknAx}!q>wOOUL{#&5t?vz1gUX%kq|g#3X1Cta<6{-cc{NVj%_al)__)F_FQuY zi|hR;q0XSvXBEx(^eJw1bo6=+=`kY!i{hNJjAS#5Sy#b1OUL}Bw5v6*TC1=40=BMQ zFRZaRJd6Vcg1kE>tqvVIIoeSMieV^ol8Gw%M>;ggFl{FQURjf-A@*v8J};|pc=#Ks ziJ!>Iu$+Fz{8`}Ap{aoKI{O-%4(X^px9iBM07Ra)ni0#>r_%*7n*T3}|HCAIX*Yh2 zOteb^N1*=r9H?ScXkfh+6>Ze?P67I9oqcD+;|`&O_%MViu=N9v4jPce7nD9^fmG$4 z$x9Djr)rNlw3OnoHezk9SvNvXX~$yhimPz^=RLFyd84LevbG#e)YYtuU{`I0uz2p#`o1Acc*?MpG7 zj_qJ=s(%rQf6#JA!`hsryA225a(YHtFu))8;3daNeF0b-bx`>S>PCjmTH`DG14YTa zD`O=Odh8}VWbkqm#?FWp?pL9GJgQpW<4H4Yk^Ct@y5Gsf9rEzp=X`K&OJyFEV^}~? zl=~eLhT(KAr_Ev6z+@(=*vUvB%HD#pzCk*$@sN7j2qYy2+;;m_kUb{Ka9qnY$J3Kw z7u+LdKx3cy{?D+Q9J8|JLgbA4@yv`luHOI0gnz1&|F9((YC*#-X0ql`;`9g08vLXA z?x?zCTppM?ER>Ow!x=Ur(sUYK8IUTXY!;=pBI33A=~ka7(%!y0%O=SaEm5CIY|~Y2 zC;D_HHDIWZ)-2YW{4BoIm_RvcQ3cT(RFZOX`0D%JVe!Sq>;bYNWPd-_U;6t0zHpCJ z{KcMX_8bwz5tH_d6RihYPh$40m{k*nW^#0viNiaSJqB`)R}@sG*e2GAcKZp$uFC9r zPZqDX`=N5b7ALBaWCw$%S2V#Fhk64BP2Bjl# z+SJrk7s#&@?Dv9pPRe0uz(iNvV5nmErgW3Fu}a0mLO5$JolTk5Do8))>kR?sbUhHo zNQ!1t$1#-8^2M-&Hyl?e$4~T^zDQ19b*OZdI7dXH+Jno|OnZEu!;JuuK9y;f3TmhW$K7Qtda1HQoP?&)m#hK5=Q?hFu9 z{Q=Bo1wiSih}Qdyo@c%&m;2c)sh*!t+Vt_4!(O^Sq{iheU7qz=84o zIH+^P=Tl{EGE~0$|le@6am=_m!Hc`=CF2h-A>i<12vcGXzUGcsrem~#dr%lm!~Tk0dz!$y4lURXE~7B+#ph->p(2Xocr7wgIBk1>?!WD%DfF_A z#R`8>fVI|u;(=tHP-?Lg!lThcn5WpuDjlYjqd+%&rZ&M1272(F^WtP z=t4R1m-(U>IYAuE*D`;X@THKvo-F+qSbWCm;rSYVk0J1dFip6>^{gz@zJ~KsG@n(Q zeVp@v)xo}kloIaIr5bpFjI7k%8?3z|tW*nQTx`NGbFwnjJg5JC`^E1UKC*>&%yyJ? zLVMvq4Dgrl@GmbOXI3MT_8@Cp0vsQZXTC#N#w(+5Q5JQu^8QjC$buW1rM08;%hnDk zY#4Tdwm4A8y24kn?`X5uTT(e!Y|qi(A6(^_rs0|--BfG_PqN<#$0>Pa6mRU?%k-)%hf;GOjXvhco_T#6&%B6jOv$XnTV+jJgaVm*&>bKU zbvqDMK~)n1D)?bErki(bMN{S_4CTpavr?Rqio_q*ouf?9GHtDUa{VIfI0*9|G#e3*v#hU~6DR?jit%+1}p%1=|nMmx*rh z9h!4*bqbWkrce{yJtK4Vrxv~7PX4fyZ-9V~m|+?X>JR0^C15CG2p`kynL{d&(W`;Z z_5Ouc2FY|U+M_!8FiITYH^@lQuTc{n2fq*_*|)ao3K+Y`ZV%HCh+P>hd2$}T$mj&R z9rD=nCT-A&iHX;_sy4aSMclSTKxP?^4LPs?ZIu#H?14^4z)#aLb=A?)IqZcT$vA^x z&KIoLg(%mYI|QYA><%0W_8y0)zzvEA3Bh(7mInLT4$E?359zK!#b_+og}HR^z&M$e zuc$^C&Jb7727}5?xtDe=u61o%>@`aUhaIuYEC?GZo|Vr;zl!S*w7?Gse9_DxECTvk zw|&S7HENY;?a!YYY6q+H^Ij-C`qm1tU4055A{&>l`c8_0|1o-feM9_m zcdE)y(4Cf=h>|~|_VFv1<>s41zRVe?d4ry=V*4DjdSRSEklPFq6L~4cxyyXOs}66R zQ{dh_9JHUVG})s`{p~Ln__r657tr<&Y@c1cCya$fxduS1$x)M6!T^W0+|2Q3wY&XX zoadzaaf=%v^yCnc5Ff8Owwr(P-#+c1o~syx`MjkF{notw@I-4zjfKwK#+Fa$n(tW5 z#+*LTARt2vQg4m5yh6Zdu+F7=O+Me+u>@zr*|hxu1ndKUTjUvqdtiYzEKr`b|G}40 z8)M_gya05(!R)Z!>@kw?*E+k(&FIxy90>OgoxYQ^GxylZ@oocZ(6(-Xw+P|GP?=+D7p5*{tkkR4Lgao1R>}2b2%vl@V^3P-O zu_iV`;`AL$Sis29n;fO=({RatKgDM~AnFcRRNH5cr`!$$eC}S@hM71yIdSg{Xss{h zb=1RT)6^=KDbIQ4tF2NR#?Ft2=nMXcXTkgzYKEu*_Oa^lmD@{@uLThry{)Vu)B3A5 zX$u0mXXXj$wh2KFY+mh_3CMS~ z?<~x%rhNjQAS%1)@KWj-6xD6I-olrc73sA7(UsqSxJ0>I; zUAyN7f*MjY$DYV*Fow2s9ys>%Wiaql+q6T#6bBKvDBqR@qjge^%d;6pIbGT+bLqbR z{tI&p3Pd#m+r|Gn$>X;`UtZ={ zTTs5oZhk*E>v=ivW_Sna9je7hpghY>{9U756V|ZTrjoF$#QnWEX+;o;l~lB6s%=k2 zOuXD@kz-|It(xMs-WC5Mkts`%iHR?C$>wJ#^#e)Koq2;(FiU@TiBxDo=c>{TD8)_W zdp_iQ%oPc@wD@>3)S5-qVgoI(F(~k;+;XP*4l%Lc@$vE8;-XJaGNd6#s{w!pQ3Iw) zV%RSw1UoL^QzIo-Id5t_;C{$+D;jAOzbi4sv|i@bc@G5{h+5FnFaEL1F_V=(-xSob zLWIZy{bfKkNYFzg+}UWuSb0}`o16H|_NvEv?m8Pb9y5oZ@K220P;)zJ6l3fDcC}f@ z3t9E3trK7lJZ60;K_sC5*9w`#^#-f&63mM1yu2iX$F36uu5T!M+Ro|iR_Ef*Ef|4R z87jQLR1+39!$HX9>#17)GwC>Y_xosj8>iMmD4{V}IQT6@Ts3aeA#L524qETDr z!~Lui$6p}UsCnBGdDdxz(#1hNC>|OM@fVBp@m^?1^^AKA`{1$j(;}9-oK4SoFju7* zZARglO090kvSk1gLsYc%Ri#TCqll@FDvRj_L?SY&Je_@I_*ZcfBw6B}>ssBYr(Sbs z#hmqU)os8QEa!5DtiuuAV`u9<@c!aGP`Z*ipf-Gy ztAEvRVS^>8f3P5do~Btees|;NH>+zFv9>ls+#q5C_yfPvFhL&&l48!t8iL%~?;+}} z<>S85QJw^?qb|}TDK)fMwX#95psEsCR&`K@1Z&HYL&NY2s6han&icGqOqXdGSy}NB z4AFmUDoW5wxIDCcm@f-f8H1*xfMpwO;C61a43mN0XZ<`-uD`)*>|f;q%XVwh*kK3l zJhhcP+u$wBk#cQw&J<@mYAqrlt+e3~x@q|ZHPEuSGF-$khOB3k;93=?a9+|PjbueP z)YhIBex-cM0*=3XOYZM(&n=j=UFNO0d-#-bL|yWz&5jw2tZerUXL|7VWsMTf_xicD zJt0b$`Sd_iXkL4%(nVmYV+#(lj_H|dPsa1lkyN?-*nDc-pX#T80i7Ef9v)r~{sHC5 zG*pZWAxyb6R6xsWI&w#^BPKe(R?IQVI_FQ0`IjK^KMMW}|2V&0l(E}`2<00M>-_>< zqi$~kITg!IjO6<3f!L>ghv?MJI2g@Vf_x)U@l$T5(4q)7rIh&cMY-VHJ%TXm_jr~v>bVC4bZbfZIch0lp3u@?V zGXldBjOol71)xj4`$Z`Uw8x(heJlUQgzAC7+0W;)?OpbQZLn>!)QXM z4_@M{cvsMtTT%D`=dkytO`r0^K!sw)Z}nmqdY*se*9ovWD75MN3Z8#NnN*pz7k7WE z>+EFb;Jtb*-Q#G1mO$~aPRN$r*0`!Q@&ygkfiSf)~+RkHkOEM1EQGK4cTfJne2z-8nvUjD_{S`@MykpR8E_9slKMFd`i;=~xdHp$ zOf|*N6|2Lle2AMV;svrGO$$n5nur~c*L-wsBR%5m8b@dhs^+lUik~<&e!M*Hf3RA` zcLf6@Yq%;ekJ)vx$d#5C%v7tev4## z;5fRdFfi#g+g>@C4L~x0{dct(U{eq#IjYa79_%+jZ4yI#n98dd93LlLb?9)KYz=3| zp&!lB5;Tp_!w-p& z`9NC`Twvi1^+os>)XSfgl2ko%rOsEUcNP=#zvUl8_0B|F@8KUNs_!Qj+plXnA5M{0 zt$ePjsaemotDrq@tJ-5*XE?5_YIF`n)K@iGE%c@$=S2AKc7Sq}&4IRAZ*Olc0Z7Ko z&CNAdP((egi6mk*{seZ}x0@FloRyOMJs5Z_;$G=4Y7xJ@hw82c>ln6G=9T;#qBg|9 z6eMt&Oj7|QyrG?tL6>*&2p?`oxjF0c0lboU-0fIwWm8+I7daa6Cl)|y>A{q~sK`jA z&9b&M{`N6&GuHWzS3WK}D##X)k&IMu{m)FSaHFq1fwHr6{;@kNqa_ z-`S=P_ckq7MoLoB(|u!NViqOXfI9@;Yt^&NTD-Dp|By&<8?~N=0k3K!By0Ld+Qw_K zTW4PdN|S|tKuLVQb@^9J7ejAQE6^|N&3&}rt``KF;x!jb%Rv8NnU=_COo~bljeOuq?G43d0XD{^xzcpJkbYD2hMu$F zd72#@2tYfHUH6Rq8svByGP&MBs_u$mbyj=RAF3J*7i|NBAWC+_6VOpH5Wc_4(G*giH=5aN&RghkV1v z-YiIo{0QB^gyH|Lo&WA4|KlgwOQ-_UfymO^^U3v11y|4zd&vSe^#YX)E1(8rEOb=@ z5?1kGs4gdCClizx=@xed0YZV1TomZ|O1=U0zvQVDKR?rdhlr?3*O{019{VG7Ltd4$ zT1HXQlLan(=D;}G_b;#D^cO9{8rH^7L4vLdlw3+T^?m&U=p#4VrtjCye@Rg<+XQ`f zrK3{Hs%~w$>tgjiqLf2Z)f3X|G8De^(}O;6Uxr#wLCgXajKvZqlL z+!L9*a9#?=pb_)v{n2w9zgz<1NQD@T=xG@+K*^e3N)DoYkeXx7D&cwE)XS3R=^--Z zDbil&EWs)|EM=qoT&zM^w{*)XkQ77#wQODc?_4^(d zxH$WG*6&yHnQrz{_Sq}Dv%jIzWGif2E_uj4%?b^ks=+tqFH2t?mNCYDE&!#XB( z?Z1F2?g1qydDMuGa{sD-bFM5-CiieZ&Nz=f({I|KE9IV9K+gI)CIatAn=xZwW1|9) z=jZw&H_pC3_jdp?NwzaU#Omz#{*p5VUNjIqfVMvSduBXQST3BPyn>_qw9xrC_V+<3 z%$Am+=$wko2Ly*Q%Bp-5FyqFaz}X#k^K9}>PCUJ#Qpza#Pize8gx)m_&;Tb})5o4O_ZHQ2jhT5yPNzB5vOf?i~3YOAk!@K-i-FSH=>JLa;M$qp?hA z*`A|8+T0Hax)Hnc*B3v#L`eOX&cFxwONqJ9qdR6=*Lf8p5=yqs?{$>#U!fN-(jN57 zdaQ(JBxmf$q61o8LNT;jW=(=(a?;B9D#1uuvj+u_)3ot*>;MT)J8JBE>;tz?_7Qi( zUbi@&1ElRSs*gwLO%e59r#SsZsAn~B3-#yOzhF5Vnwog~$1t}{Y2>j3ya{n*WjnyB z!^ElX&3DQVr|u`m%QSrmQ8P9TIN1TIQ6I`7EobK+)S^4OWVfVL74dmoE=iT-S~tZA zOr6bw?jOOBOTo?Nvzl1{JbpFPEiiovTyuX^OQE&^v|YuX0~FCho91M8Y>nt zgZ3gIM0Yh+`)1Ry0!aB{Fx^Y-w>P!T-M}#HW^p5A%se?M9h*(vE;#>-RvfL_DwMZUdy#kOZMZB_Dam~ z^r8eqwUxi<95^95AIz7!!goq$j-`o{Z}MNbh<@`bru*>(N%RtXtcO1*$Qm1%=QBLo z!D|)&Mm`;N!-Ym{Jf(4ss*0hiB3=^0qmI~hxWw_sR_Ps$UuQDR+u7uCQ*fUl=Coz2 z-vzX*_uroR&tr#+0Q2dPZuaeY-u=f1zSVQ@@_NH!L&AgxUFaK-%E~-uj=^~ReeA=W zHlD7rq=j&}H&Y8PX$9HODq$GZe$=t+Hkl?sHN9qzCIo&v!sd)B$@& zXVd?i13vg^K4i1M#Y7a!|vxJ0E*@$=$r>i?Vh~R3*0Fd2VXrG z65%=p1HqB6EqS-7*>noh#baPcR5 zh;*EIH;&7CKDxI&e`{v&*k^d@M8MzAZyqdB>~RCjW>ZQ_m4bqTj@4@$2#J653zr53C=TqC!JYCg`nB*5XJOFn zlD+$!=DerA$$vi<&DK*DaJ(I`{kFY}`Rk6BmQ1lm9z)ZC6J0c7%4Q#ULts=1G6r5H zafL!z2VUOSg9y5dJ8%P9Qvms!+r?w!C=Sj z3k7qlWEiL3(BB?NAstP(T3S!Ng(wcPyv^C?@B+s_r_M2oV|iG&fSOV#gmw*3T8yKK zJXa~hDuOonifkz2fCZgKN65D@bT$=M&L7#YLrXS-|EfWU;k>figAQ7%JifPtUk0n| zIb4>F)FVsYlAN18ZmyN_k4qNa+PD4*wxV<@K*!-%w=sKgHdW#vLg9&ju(6_9$<)$Y ziD-^w6CH;<5+dJ=vz3qWn;Q2bHf29Ye*@ZDHe|-At=SsxY!&0{e%`Y`!vR65ey@++ z1J9Z`i0ej>&+dh#d%$#@i{$P?WQ?4=>C({{9iF^4H}dJfL?4(%dOe|53eQ>Z0A&hq zpI1@a)4E@~y=G09Gfnz4BPdv6R;08Azs4Nbk9uD%YICgQzwtuC*Wxl8vBA=o19fB@ z%#WN9_L%J*)K^4K>0O5wwIIrro#Q-LiGgx|j!^rsB;mT(q7I9Td!W7BP=B63{q^a9 zdjP*DcaU(A?pr4zK2WrwC+JuUeQZd%?{WkcfW8U|{mh))0~v^haSg##b=P{1DYmz; z)fqbmid7G86d5Wjj$K}k+em^vRv&vFs~P{JKu0T+#&5`^{vkaDnM`2^?|IKMO9d)o z_a;JnvO&sOlY=-ckLoV(eXsgK4x-Ca^5O0Uh6=6aSm0Sg9 zlH2(|-Yq@SNhD8cV^_CO%Y`*#fyU3+FNeXg%fK%_2V0@u@{R#K#9A9cpEsB>T9}*r z+83jxshRJ3$*-Syix1=90 zvBSZi|F^?or_9$%mg(I=Q)ngEtsb6}VLp7m@Idl`HS==|+P1bfUNa-mp5eIm9m8hS z2HvEbx5w@O{9Mw9A`^bz+tD}h`MieSPBp|eSHoqK&Tru=`M#McYYKQtS@Ke|MBgE) zn1h>~N5~HX2fsnPR)%!p2}N$&YMbJuDkdLZeguh4k1xLS$}=qIa_H{A1x@Y1RU#aj z-n(a^p;4VYQ8eya2G9^Ken6eO z-!;wipk}ZEV&mIM`4CCXRiYs0W7EGWM%u~72$NrolvV9XukdWwHy+`w7&gml(@4EB z>R2*9t3`Hhy3IZW;_kc^`duNpXMQ^v6BBd)`wdtA3&HZ#*UrnL_-WN8ar6(j=HK4h zmqA=}yGydU+|Y-==rFp>1y;K3bteDF-@bOk$6TH@t+0pKMR-m+h!#0-u9X}mTrC%y zl~ibDxV$89>-ao2Ow;w;M=!qywhM*mI}iZ!jb++Z9gMyb6Wfv#<~!&0T|n~@1~MQC zWFYnXbr-`d`3{YP_`$L&)`&YnhG6@fjFfKpI+d*Px82p-4$8yV9hhojU`7!IX|uuY z)>}?Qw?7{*4C#*A9@W2XZTog!t#q|j82A~3`gAozpq$~zvmHvOT;4BvPMNv?mY)i1 zfEdSco)=c{?o&h=MJ+3S6ee~vluc8*P-d>rRnVHnYM4kF6uoR_jKm_C5l#Zif zNDtbx8vWALHm^$%|>fyB!}C9QxG= zJqZWU(=-0fL=B_(MC-gO-!5Ux(n9Fk?%wUsX&4g|N-+l7-DSv}@2^4*M>2 z&|)g@jhzJ52!86PHL5A*$n;>Oabg*75qZ6yu7OKMie0ueCD(6_fTNAG;A_aYRrUmJ z+~Ks>RNNuDYtqk#96x#rAx&0+B8Q!#%EHiR7MC0lKp7h4pb*;z3Ho1Y&m^0b_h< zZy?^TYohEDM0>M&N`o!B{iP>~!@7^D<18H4=!1S@tzN~g*pNyIAwnJDPyHh`u-T%> zI<8W;Vi6m0hQ+yX9^nhUnDTeNW+h`52~jcF`1K=X4Fz07Xk%O8IdQMqxLH%AqlM>! zZXvZnCGk3a)oRJ3`fsJrq27K~aUtPEI>#n^ij*}@mqyFrCR~2!A}G20kssH#=OqZt z4Y?HMuj4^ZY%;p&s7*<*xE#s%Eh_j6M;nWaq<~X&%*xWM;}+3|N8Z7xJQrw zIk>$#9*^<;C)b>Vahr0Js`2*Dp`KXM738~2+A7gu_!xJx1F=LI?V*8s!U3sGM9a5X z!m5KqJ)46a2<%asCcQMev9H6}ev-UOfK7?A>Kobhu9p{;&2l70`s{|T8t(_vmN!B5 z7>|q~72isw8Pf9C#7ip=P3uIm))2R?B8)68UoRshqHmy_gvZQR!FtR${pK@?nYJH! z@pOV|Xi@spO~9|z$uocWstYXBNUm){u4bU*(I=653K^$wvdXNp^#4Ty!M% z6H(R+A0rx`GIpmCvASr;Z3c+@&LuOr$m;&|tI7g-xud0-toaJFQuVYvhRxkA*YMS~ zdAK!f^YfThU2;B?R)YGh%2E(})Njk&Yr$$9hrUrk{%?u(UI%5~Pf5_*DSj7Q7_35& zU7#CLY?s}GXW2KPrpHa%*@*lpmMEG#O6J2Phg%Hfm$@YvS|~SB_~~s_-N-(fqg(0+dRJS?oB;hy6q#Z$Q1auM@wglqmnLI?E&3L#meG2RM?dN>FhA&2ve zsNX5|UdMHGgjX#*WOEjxURRWEd$4VM<%D~ufnl+omM`r~`o`xW2(T$fUa-KymamUv0ILafXS2aYc`>*wR$=4>OQYG|W zb){jBVWXj7;-yk97)%F&^ktn4jmMfXGr2Y#Fj%9!jzgtftq6veV*fcKoSXdB3BX8` zIN-Y>8XG?u8W|ngejYY06ZgD9+Mnfiw1_op&3>me~;7j1a*3cIe3VQ#UQlJ@0gxv7&n zw_I{pGi=|Cd?O24-uK{jtbqsPhVddC@HeXq$E5eYZh5e<4;9^SU?0dGMsjf^l#L(m zzWOAw(?UXVZ_03)^JtZGPkaDbW z@GgUI{9{{eaz_%+fy4c$=r7ta{SIzst447~Tz7K%&Ji0yV1;A)sHG?Kg71YT(Kh9{_NNTSE_TUGp>BiG)%^WH#p}0vua}vZi0iH|C3xr!d ztuxw)XCTk}?Z?)3OsHaKWscV^lszb2gmuu*@#1T)Q%qUy_wcz2Y3&4lR5(72h<2gd zpIEK@dF^_ns6wDs_x!%olbW{HPRh;Ij+H2Oz7tFPTe{ftYD0JNbiT$x5}RKsQK!}2 zy)BXjYlQE{II=w2nOvl>WWKGlQU;6XU-a(A7WK&zNIiedKL;%D<>;u(x6Z@4_j@m+ zlun28Nd87!spS#fEB}si4POw2y3fjQl|?{{pU3?VWnUc@<+{DC2#R2U2nYzMY(VKo zYCsWCi4o~WX@-<;K@n+@?vfl}q&uV==@O9c?il!ojJ4Wi*S{!;s>%ec zeNB?RjDTs{$6e zjItrZwT(t^4GxSL-XDFEru>cva>g1Cn_J17h_<+>Zax247r}1tk6-VXucmjT-EoQ@ z{=8MopLnvmwK;Mkb}14*(kEfcZ-U7XephnFZq_1+dS4h4H3`?v=L@>G{f;_OsKJau&1rSABM*I7^)8XPl0Vv4>d4C>?-Hp#EA-&LY5t<37xLf%lT#-}CD2yrQGv;~-PoLa zqAKE>vO4b}YPy(0?jB0Sj!+|XF}riF`v4N))Ob{n76MiE`NrH5Y<-?T7(TWl6Ri{! zocEHRoeNdj@z*N3q&%_%m6BR$4DTP=ku+^pm5%cYR)CsDtXpZ@27$-|A$Z&NO?^a~ zU+%l*M^{}Vl>4v5*dK;mpKnsAVD06YvkAz64TkQ{_nH!e`kD$6gWk4x&qr0#9 zikq@mXzx??A^C`uv5GegKO1rfGlmpD$J0@5rKR7~2v1u-uMvoaPUQ5Meib|WhsN%=eO zdE-L@1630yjStJ)IbKn2D(8lQNg+sC0>31u`WKag&&)}(s!~8jfs2m!9#c=@QX$fK zO!n4SVA6QFHW@fLumaz7w{?Sf7^c<1XMm>@P$@4zRd zm@Lo**J6BF=i9Vi;m#}R>y7OlVL!jl!^SYAWky~z@bZ06z93mj zG}%YI9iw$`BA-p`9Vm#ajzgcB&wC&6Zw?+MTMc^oV0RA`IYGLkC?i8|!hwYYMZ+{l z3_bN?DfB5%j5 zRuK`6{bp$owUF**^~+l7q1+v-FvXz$rpI4@2G?}xckMM0D!TrucBvw z2%l69?4Jg;I-QGq40W(-NY;h6(vXQxq9X|+V9Hd#08kB}gU$;PaZY|wfq}f%IW2_g z+860L^LI-46R2vex`SavKk7#=rqki*5@HY%>BWze7L!>u?5<2ndfC_XlR(`H_acS5qQ=yzi`-~jwV~!u>T}Uk62fp>~IrK(b(oy zQ-19CI`KwXA69x?@nWj|#o=1Mn~v=xnS3DEKqhG)P1&9+t9FBFl?f7LvI`p9Zh5qbp=t%BKP5O&0-(-jH!a* zW~yXh0bG-*FO#{kqQ)4%8LE=P;{3&aR6RaGaOEKT|Li*1DUczP$KB=YaOLYl3m^ z82R^22qMv{3cJ1_cxbQdzY{fJ{;hz9HkvfoKh|L2lWWQ^T1Uq#5F zY-q@*#L}$esC(7-oYFOXL;!h(XtLl!#q3m(1h8!8y5Tt8-1m9!as7m;msQP$se!6q z0LcI-QYsnO@rgVRh9I=X!a z?=^<*+fL(Q!L(LFtVhG)z<|dpxQK(|L3x&)U6YE=3acz4$BlbFxo!8>xS)A_iAd#A z%ax9a&#$QnPS=GE9~2v_O1;KyZ76Im zDm0__61^KhUD{vlgs%i>{f**93?grl{@{@$WB6DyN{mXf6L)AqJ!4Xi=K+&&f5_N; zzR6vjmlL^LE5;vkuMDYn+utb2^XT9Gj`U2f-r=NY}Hz zr(B_8XgeBK`Up?USLo$z0{~?oC_Y3)U1?`(<-F{QSk}qiJN6af5&$I3OTCCx=bVdQhii`~#V)v8!rmF*<@C2Cw=OaA$KN zdklmMg~r>GY&9ls=H>QFg{b@xAs+sSukhAgar{@(>M-JjUlKdCEouCqLOmE}60r$s zJN#+MOFV(ip2fq<`lVZ-p~rFO6X0NhIJg+(+{zyN#Vt~F5Xys~&=Q)ty9P4J*F0n{ zNm=UKWa1xb^AD9Aq?EWFsH$~;sjOJuh|6kvjh6WRGS7I9KxExaE9T^vR#z3k?2g9O zLjZdF!u$gQ0st-wn(s6e)a`T{S0k^Y!RufUbUhBBSWZQz?Qk8buMQnZgIfCa`_lx2EurHCXb z`^xM-%8!Lvy1#3Kb%}61q=tx#kR$=BNbo!vAt8RQW+^2_^!JzxDfvK$5v2a##@Dfi zTY9Qf_ztuo0N6>1uoXq$OH-qmEH}#1T98k{rbPRha6MIj)a<$g&{3J0kdQFlUcHZo zJUo68XY*B9@VF*a_0fxIdIV16HE3S%+-&dI#sgJ-km{&qugtt%qQVQj2E4c!GbFXL zXu~d0xMVl;aG{ZyVC#H}ubIUNK{ubN<1o4rGAOPKIHUmD&YBl;!-CDs?w;jilla-w- z9YbcVlx<4wisYej>vvc#xq_NvFp;YHx5lQXXTFj?jaLrlmtY|^^F?uEBV+*OtF&Dn zBP0t>{haZ}hT6gew!oV}d%R^84zn%I3G}iQpKd4*ZYvgKrIFKdSFk5HJ=nf#0|)iG z4-FT~`Xcol7J^L|YD|$ifWBJZ2PGUZ;sPVUB+IW3OH5w3gzGfiw5Y+Xw| zNk0x*acbZuBp;8}Oeh!)JeU{P=ma&f$syeolJAzYanRkH4?xX^!oU5jbl0L(EcKyWkROtRpWfo zz3%gq6h3WsSxIbPS)AT=?AFKX6%0qG3yx-~ahhTZTJ2mpusC?zVO5kZ<8&eMxsN~2 zo-(?8Wt1VtMUg|Qi*gM=VYM>1jDpfnmd08csPDR!@wT9mtbz_q7L~BRS>tYyz8qxY zYi}KR67#cN@xQJ=>zabP*ZJ)B(Ds#Md4ZD6SPm}MiF|&Vd+IhCNEUt(4!+5>QYV3;n;-No@)ajSQOi1Qi~ zruJInjLr46RS}dAmvNkjEY|Wqd?>#Zk7aC;tJ-MbI6K{Ypj&QrxS-_s1#QwHpc#1s zG}C)pDjQmfjp}LbpJ|JflP5Z2c08CqC*gKB!18qt(?~wjWXRFKr zDJ_i$qrI@bw-jg7Hh;V*bS~QzC#4`;lbu=czT)3^h}gdbFgW{6wNy8N$b^o`ARKEJ zi6kmD-wTBciuQK)XMj}EP#-QQ(~r+XORm0KsEP}?6J{z;g#5?Mt%Z8?8Pl^2x?BmC zbB3eOt1*WKD)i%F5K5T#p&kC-89;&KDpA`+vYOi3Pf*prv~q{t=GWN_bLxs{9HHhu zwj!gvTJHKafsF*vO>TaeiqDXzvA9V$`=0&bP>Pr@*!Ml2tKQ%-d~dq~ek`T*jJ~{& z$?{vtk4!JmkNs;!vzl;l*%$@A@tw|q6%3Crmv2ykad7m3B45Z*EqymP=GId|+XGI4 z?Z-|+{fZx(GUdGiM{O}@dk{C9ABK+U`!mTA`UHrzI5#+2-us~z({Ilu`Y)iSNzEB~ zMWYd+Fxa^QEZxiRV{hZ$r-s}Zh=Zfl-4}_IHGpFvI1fVHhW$1vP+YN5-!BAd%w#^- zO`KM|%pW~+ynQ7dlhz>8CmEMOdihGMDb9+g)XaYm&!q*NT-qT+%jA@Ap|5(1@D;UsHHv2lk0QZYQIOz3iU0B#voh#30LQV18wgZ?-*%L$mjlc&={ z1a>2*!QrsWQBT1@Z0 zN9DkTaIAaP33+s|?p7@4xnf7H7$YwD?Q1ENy12(TK@KDOGZ0YELQFA*rtkB&k{(HZ zy@$3lz-9;oI*lt@X-jmx<{`+i`AWS5a+qOKzAY~)AyFhC^!&M3f${K6K}Uc6w{MTi zx4OjHLmKMq7Z%_M>+Rj$U3h7!3K(6)UC}rjX5Lyyu8Rbda<^wd*fqg4iVhNu$Ksm$ zQ4wL`FLlK(GhAKMW?;KE_E>~*L@b57)X6wNq)^uRG0dW%X!-iIDEpI^P*^qVig-9& zw;Hm7GYu+6y`BZ6fFQJP$KYOZIiq7Z`Qz88V_vh0gv12#U2@^+JYBpEByNHkEVNC8 zM|tICLU7#fsmTVuJu)2;v{Hg!7#?;JIH&Wp>b$sRM;k3A$em%mD{t9qVx8B$HSV*s zTa0=fubv#6jyUa(kWx@oXlDTyw4A*B0Tya3 zhKj<-oW-&u4!n6&m>08IH@Czs9rO?Ygs6h;U*p((!Yn25T9kiV=AH(}A;V9HC?f8Qv<@cEhm{a=I`~*Nf;Ity&F;D?a z6_dpsuBGpk*N;?@R01Z423wUv&*a^60=>A7f9v9CK~HyeF0 z*x}68Yx!-n0kXrYX3@s^910a2Gs#xms$+JRmWRuo?D?PAR*Z~z&&=q8-3F;eMFVQw z#%Mf9%ZocqhVIoF<2ih^{4vKbTHxu*o7VoE#5M(%l$#IU-sgdAPQ;sx34&_vkjtBt z;Rd3WF-v zC)IN)_&v^pVuo^fWJX{LG1nA5;{*9%kRkzOuDM^AzEwB6#^4qj0mYvV3qY{X-kOpy zrj!aa<$s)jX^_CKAPB2uSXDL{} z2{@tMVbRi2s_+=CvLc+NUPN+hB%hpYuJ&nZAaR{0vgkm~MEsoW&&F`Hq#kA5T&bz? z3cg6yyu!G*)=1^wIB(hhaQ7_t_k);XxT3B{KX)joIN`v~B~eHz?#L*&3%3+7ZEd%#bW#@5OgV50!!VDoL(B0c=>APe9pk3x?_|G;tyGy z1?T}{WDKW?J?xR3jnRpVfV1C){PkWegXc>4iMg}7$61Z4{eGaf(AUMUFxMS&m4|K2 z0*yrJUoqt|=qPBW^b)uA1J0A2e{SFVQ+Bp1lCuW2QKA}Aw&695m%6rQW;{|N$7iz= zWqwpuRpsM^4w0Dw>N-_)C77t9346$HkoGN=o_gp$cCI@;gm;h-laq}rH9i{>TMs2v zV&qaQA1_EN4CRm2hxamT9PUsrvvY#14MqQnOmpa4-n{Bc8hz`fu)|13|F%_CWM8>T zz__$Y%k^02He-kTToHGn?(>UsQA$~IgOS?K6wDRPJ{@{0tIy;popvU&uv||37%AnY zo;SHI)$C=m%7siX7Ki%n=^$HsY_%#tx%9b$4sgK-I_fl!{s)Oo20<*R+PnTxY_PrE z@J-hybELJkwK`jS?A4Kx5sDN(`>i=2Bo7Y{`%XAHm^@B#IFAOFuNZ^TA3Bx^sxO>u zD~Bfv1$;C;d;_d{U*X)N$w7M=Bn`0;ht|cDnppF-8>@}anB=D;3QGm1tDYk8A##fa z%(A5HH#X&>Foay zj%&qB19HVSo61H(ZxJ|KcKssyF~>QK3RG_dyoW7waejW?ext_YxAsPV|Y-=Z`c9QKxYQ@JP)1CqyGKi0o z-5RR^Vs)OB^W=oIdK*;D^2cjQ(?jH%&CbPJ2M<2O@<8u=Fjak8PO1)G-5M#RL+AJmheMb_D?T{--`<9Ev=JoN7 z$&hfg(7it;CnUS7b<9nr{{C&G3nm*o`^l`YhXQJPdcNJJE-L1vNOxR~h>Gd~8Bfa4 zkTT*O8ylN$21uJDfm$i%>iRY~^P+(;ysZu;V}Wllrls_D@?zbLyAEnh*tHqCUmJS&hB*`2}oKS`^;FVYqtUBeczx(s=^ay&;!z{m;TmgL_P-bl?5(8 zoste%SaS7;M;V0YmAs#!^Vq+c9lu#u->4xFK|u{L&YF~jWZkwaJDYXHcGCBtudpJi z@_4Z_rk^=zXKQOti}Elm#&P4BKTOIhOQq0cVH&H$;~II_e0K`7e(7Etz#o`QCIryQ z-$SvVxHrtzw?I3#M}OOIw(Lh~rBvc6P{Jl2eA>eiA}Y)Ux6hw!1M$ex zXe*B)b)w8mNbJj3xc&!yyvvN|rcr}qm#dTi2;Q4cpH=i2jJsd6Us+!_cbcN2?5so% zSI!?FuJ*Tak7a!M@&{rv83&Mjw+6KQ=f^!LFg^S**2lekuR8H2IX-s^4;kOJx%E&= zP)7*swuQcHW%(LRLiRgMP`xV`vp}5A+>hOc;s`HjA6`Z}pIGoZwPU}_@oBf?OQe!s zFzJvNQ}L@8OFZHCmvOir(;3M`AGKNLdi&`)**jL@WaztvBhcCqB*+;$r@Cs}z~_kL z8o}%A8W7qfG{Zfq`tpG47hW>XnF`bkVvj0vw-qC9Nnt_dR zU^+|%)#hO>7R3<4{Q$Bab>p*rkS4!a!!^2l8@^RJ-W{)N{~R@eAmv-f#Plmw+Oo&G}wVGA;)zb^B)_yF9*r?JLB0m#s^! zmxre9F&Fs3f-(QpSVm{a| z_6~&Z8EQa5yCZ0erC~D}wYX^S_929VYO4v`neB176h~~ic5%})IR0GFlD&l0kdd{! zm`{r#KY)FBqPcjMxFmhVO4OmfcJ|Jh6?Li69Q7Kcc+&>A(Vmj>>-|qq^^p_BHN%Up zQ&ZZg@{#@CxQAYu77MevQ>YE+`y;p`G@RB1^eZ9rlD|WQ$qeTxbY7ASIZ)`dPH4!|as~Nu3b^ z>w}l>P}e2HAHr3E<)v#LNCg8_@C36tIM}lKuSfM|Aye{Zf0Fbcz8Kg7o!emA!no+y zOK5%5FE^!_2LzRINQL-^FD(ttLW&?VEvtM-+1KCyl8rexe8h3Htz@<0kZFD}9~O!t zuL14-^yyO^YTSHxq%C?=h;s4fR!oC?U(3C5oM8}c%k1HqmtEEL0MzGs+1a}&+Wm@a zpOU2R2Yysby18z#ypHM1H$U73c{$-=bLNkKRcci?^ z{=OYFK7|6hlGWwQSJ^L@%}bhgCHh0sOIJ!-_Fl$oCFi^?G##r@10Ycm15pb#47n+I zlABd=VCV$a_Q>k#%~B(EYeFlJbU>eP@p2+Q8fMis=?_$kFQ5wmy3#HGZeG~?2a=@m z&n?R9m#OOc^?ICFf?!@Cn0YSQ7(kLU<=^rna)c-T@?p`3h67H}1O{q9Sp|dbu>oO- ziHtYF+H$FPj5Ce;zMn++n3q~+q=}cpBEj?s+K#REoq0AqKQt zRY8T00n7FKiS_Wy7wV7oY&#rA6X)jiQIH~7Cx~;gA1Fq{|9gV&zhKy|?_8uljCdE!l$h zARWl#aW79F1wecT@ohd`>0@(gZ>k{u>J{0x0C_-A0~1TYj1Lct*^C{{5AygJob+s7 z|4Uf52_!x`<)=FWgL}*|SKS3^g@WdX3Qc1XH8XO=6ckw=6!uS1BWAVpmCM_M!^6|S z$|`l#upY?Ceh2LJ>&LMEtX{BjXS~}Mfs0m$y3j{#(2y6+^Mc^D^_bNG!s~SG)pfy;{ z(4LNltpD{ZRriPi*d8!e! zv&)^2G$PUKJNgq3ajjzlzUOu2JKUm^ub}!rK@j_01KF!FFL<&kxD8ejzi`-DT5s|D z92jVIxLmk6>af~d?@KnzIXfMyk(CnbY_D#?atj^3cE1_+)gPX=_!U&xwB8H$ro&HK zADhtX!#oD7k8=}W z7uQoD`!YJm`kUOZnT_Iu53DF+uWqX1gs&Vb-lR{-xRN!Eody`3_ z!-$or=m*fFl#}zaU)91lt7L)B-NVCyTXs`Muqsfao2+MRUkWT> z%|7?-u*QOb1}r#>Q+c&9KJgZFJvR)e>n0g_e0;!qT@ey@=QbHZJR{i~V3|V6I%XW0 z{)ZPl=UR*ctZev1OYra0(eYAZ>sUvge*xC48Je2*S5>{Oudkmndw7$CamXYcGVFb`69z)-&@y)sA|2{XS)fbLbLfIXbCl&5yRH+ar{7nU zz54>tdBMpK`#KR%(=hLMP+vj=&-ot0;slf-WrEe zO$xVE|KCVdDazsCZlO`5G&of2QVV;F#JU$fId@z^S11Xv}w++lGt)(G?~(LW+`21A@_Y2<3`>sq9@kn8)l*i3#%ctVu ztq-oq;9McUKc#WUpeE#;`_}cF&p+JYE7j)nBM`U#gZVQxbIxt0zT?W>64q;dy0B7( zgRuEy#H$` z=iAdibfb0yTNZa8oV`Yo=E=|6`Q#^aGLk(sk7KXh9w8qok0!`j)iz~wnfaW0Kh107 zbF-pa-rWrgT$<7^dBZL+x%n?1LWIl_o5PKGXm46W)6NX*7!^5jYezIbeW)t=bX~CdbXQxpro1$7 zNVq&Fbxr8;uAoWm_lTUFdwoHS5Xz%G!+p2QBV)Q}e%J3aWyu3d5bxwegINI*xCY_t z0zm~tlz<7LHPaeypb}es#;|Zr!^>N{yA7Rv2dT#C&)!3Np9r^%&MmG+D@$JMlI;A^ z_4`ootCVrQ$#L4!#!EjBWmaOLZ&qOVSn`hc;%!QJ;JzJ?biTC#-JU^9a+l&mgjq&N z!PzShh5nQapX|5L318MZS=ALW!2{`|wy`70p#p2Lp%};Y`EY3>%jx5sC9b;ebE8pzt(KiZX!qIE>>lGnOEGuHmQdK6*#hJNk;XXc&Ss<5r8?@%~?AyL#m<;LM;{nG^i zrJ{KlBLZsDV>z^h2eSRL1dPTWVx`)msquRSWNZj=?m#%gIDl5{PO(lSGaZ&9POe41~aEWy-#~5jcwZ!giJs??<-89nL+Su~ep zW8u|f|A1glCUKP@bf1*%DR|P0BMeDr@4IRG8W;1>*2uJn1|!yx3^t(;L*{8VHMM&Z zdS`igO{CvvPV@P^PtRVlI@@7cmsol|`Ee|T!-83!jis*vv`Hw<@~Z2NbDYVAL$Cin zsZ-rsqav7MPvWqcY&wo`-iB*UHwUx293O>oo2s`)1P_jB^E$15X+PcKv%GO7NQngf ziR2GU_(~l3#GK79c0>tUJd+IqhcS3N+`#6d9(-804Z5d})9zm?pSkK*Z)fVqT+@2wn`2C{7d)ty)nIS;+`Be%Gi z2ep|@F-N=U?WEKh$1gPVJ2HWa0P`{-O-?%;$zGX;Z}};fk4m1Yn3wP<=!%On3tS&l zA!#@0`1qsVqRMMANns->|Fzq^tgo5Tib&}j8H^+{53YmwjXWOIK7eSqA9aJmp2^Cb$JBtX;xaCiF z7)Z}AafzAR(!)VloYFl%gp_yok44(~s*2W$ScuESLxJu}7uXt4Y3m&~@RT?lB-pP8 z45+J7qAdS**E+_(%dM;%wvd8XAlpT(&`hnFCS01kX@z;p| z|Igce2^BKcFue__U^a|2d>IcFlj=!QaGjKuw3fGKP*KDutdo;ge@W6p6eK1u-}i8t zE4zilz{J}leG#~ZqxrO%1q6#jA6r&KL*oZP2G_5jtX&T+N}oSNpjf0aGM5>p`}4qjvpm7|AA{Yj(kjpe z2=HPecI(Nqb21yncMxVT7fIl4hX;p~;TIUQ1%Q1oPrge^s?uOMZ>~nG1P9c3>Vk)7 z9M``tNHrP6?=9~w25Yey)Jix)m^COsHI>w?2DN8xXlUqldlb(FKfY&Y_-_LG?<3uW z+l&t2=<_8x%`=wQYKd5$l}^u0i40Cy0Y4Uu?UL?8Z4nB6Nv&0BtYBlxpFL{^BKC&% zcb)dUn{H*+e>{#mUYlXm^4AhMGhF%Waaf)M3!8{(xxa^-p_u*pJ6>w8UPnEuUR+NG zcnIsvwb#x%^f;lGG~s|Nj_F`~*};rJXLrBA&$2Y@H=RpKU@h8nI<4u?+X110f;!3) z7X<{OupRAFOCW^jg0HwWPbJO|%s(`ndy=7CaVK#21s}SnF7XIFw#-_xRDK8!4$cQ7 z1F%-`aomTSY^U!EcFiW7n=|ykAZC;F7`QNaV&-J+fXSz1IN_gR4+yusKBtGN6tPIs zmzU)Lq0mB^P(+%4`JqYzqVi;e=y^BXddUbZ(n{VHRXXEnekVwHVeMeQWAf%m$EQ(f z3_p){Qht8EupikOF_CNM<*n^qkx5Ua^ssCojx(?2(coO{>+_E;T-Fhh4{Fs2#&Mzp zE|(Cu|B1Erj*v`BZ86{RuK&bSi{YTTX?L187CDI<>7NH)Dv=^=o0^HG9BBS}qJQe$ ze>DgORA4%|v5EWRi#NauxQ_x3J6AX?k5$FRTlV|a|0VqY{X^9NcAiz#N2FvaDvOp5 z#dcYkGo*}k?{*AU!rDwIv+V{eVrwnNeFhEG*9U?Nbz}^F>@t@&n0$MtY#{&2xy-W0 zYd~2p!k&qtG4jiEBk{k4^~|UL05+U)mIaUm-d_twI8v9ClzhmzLvnNajMqh-^wv)8 zH!X)mF8n>&VMPKll|`{a%@3;4fzbr?L~4Y;tsnKKg6`94Ee%~Zjt4TranFf5JQul8 z*@?K;WN=+TV4%Q`ko+0f`VSGGyaUkk+b^7uzHtc;OVcxqJ|0h2$T9S zryee9;aJ0jcpX!qvQpkV-ZQ}>BIY_lp|YZb6ZrukoxVX;yx#OS$r2D9*x`4whV$;B z&L4(Qr@&(W(Gc)M%;H~U*v#)AqCzhA`j^?ycYq>A~EqgV;Q#u z*?*!fd2{7v^-BPCW2JkZe#5^xOpe7=c}t*AQaun`xw);$FJ`nNTiw2Pkx`9}wm>ep zuJ6g6-UF|M#`b2?^?aH$BGm|yu$u-p66MS{8lO9U-|PiJy?48v@TvOkI{(M_ z;eIj?|5b00=gH1>2M}0wCm@$HmJ|!V-lOw^C-CDR4-_xPVYnCLR43A zpp`fpm#2+_h9t2BGu86{$jK2c|AKY;8U4eiR0Z7t&LzoYcFRC_vfg*yo!Rl#0Y`?+ zsc)QICM2fdaQk5iJpi_b70HdsX_Fm0nl9pQ;AqKpegLS2+yZN%*?7=O>B`S_TM!r( zC~()Vj@lXnC%BQf#7*+UlIQM=!9gW+D-mglt5I&EK{fJK?f5f4!z$m%Mzl((TIE~C-w zGZs8Cp@64~9kLaFz3!6W z-nYh<7V*{Ba$={dxa)b$AzrE4<773@pD7Ee;-FXC5_u0#|LwfYOwVuF1=zxu9q#P9 z(D2cVhEx0OYU^_=??8Y!wHcl}azWC2ne*!U@tCunw$9au{Asib%o=zixO#P+lO5L; z8{CTB0ot1B`85@|DJo{0g^PAqv45VGgZ&*&-J$}@tc7V1zK#zpWmi5Jr15K2^C%!e zG*r9eHt12oPq(|8w=wPQ9VUKemrF-t|BB5)o;~?h{D3k1S4@-X>ow8*8?os|G_q!; zW+yRHjD`a{iOIJNz}50{r#@p>myvcC`KZzJ-zdTM51N=2FQg7qx`&zxIXz70?6wvXP=^#b`6 zq38D$IT5mZf`xZ77s-vykx2XwNdFChA$)NIH7H(lCGnM~??sbf*gXXRzC{M~$*zG> zc!bhSgz@6M+f;Lf@>D^k7wj(7g6>!y*8D@1T#qZ|vx0a}^v zNkzLrlO5qX%*#j57s6P^296Vi@8ZP2V3t}!mS=q1+q(_ymz2R{^BIyR#AjPQrXP&K zjslA{H@TkJ!p#)JO?4%w9L&!2&g+Gm{i%a}XudC}^jI0K{Xxm4NZzD&=7?Mdx*M)gLKKdw?KCw#1qBVp zhRN~SV1NThEvksq>KX#d#gv8zUwnMThDaK!t>r@eaPWmJ90cp$|!*xyeH?WOP~=OX}nXZsogLJEB5 zkN{VutuRVseq(`j=BF|VyLYNc{PR(&ZUMFcDOKZqk%Y^{urvy*Z*Vy@JztQ|osH<^ux7^M@3`o`zb+A-wFSRLQ!x}W)2 z$Hi0svv!D(N`vRXSROqXltyj^A}7DBomRu2Z189~7UpJeZd3v5SIDF>^E}*c&{IYa zIA6XS;94L~4*_`3mK%VZKB_))2Xg`AIy3q=VhMKGa+0rX+}gE8l~6*RV-o(F1<(dl z2mHwO+FJ!4Vsv3`e0AV7#nm3$8&H_nXR z5TGS%{Z-WJ69p;!y}dLxuUSMCk(0*2)Oe!t`cJJxDw3luBPa2#U5$qk`v}U>LA;ZW z;R0P}D&Bue(ix6GlmUPcwX!rcSVP0ZlYkPr{Qjbl+WixalSLO_H#fJvK*9Y$T&TmT z2DIuX@k8^H2ZVb;SDbmxENv`zt)Z%Et&*|Cl^wNJdjH ze!E{oaKV}3-~Z(Q2$hgM&Yv6`s0bU#JF|JE3b<4B;GfnyMoHCSJ0~n)b&`z4o<8{0Gy1oC5M}%+ z^FEO#3WAHWQ|pW>jMJ^=lRb(>l=U(0+887 zdarwY^z_Csxm%~3_;53hM2Nx2LR26F_YafwI@KMDa-%Kb*M3De^l)2?J1 z_fxOc%m8CqZ}{~|=dTnYr%AsjD}T-n$FRqvDx`AtrPkg3QAVUoR7N~aE`V$PONV8SsPVl2Si1os=Cp9 z+`1hx0xehk*w(kaohgUtRN*oHXLpe6KinHV4iwj^R&NFBXKN9 zFIR6ZK#k@VrXI=$MF*3&5UG<8)&&FKl7!n#MURg+RmATKbisfOgOf^-TRq3bz$lZ> zn=$ga*2}#hjZLT%2d?HON zc+|Jk73u&HSgSKZ{1-=tBTeqOrSeFe zgPYqw^#kzud$qI>d{g`GNKpUaYl zKBS_jr&AXqnY3Qx5fhy!+VXs+E~L2&R27I|3XGmlp8XGb(%UK>5E7Iy6wF(7qPE=O za~>V^S3)X|w-F7k;p}1@E^*JWu&~Sl;SFT~DY<``3E$>A zCa+67?x{wFMm}!8Cmw(iKwkTvIBmwv%`e6bV9*#8idcl8Ag_ckY|SAV{$)`|&(gII zj3cjTMl|B3E-hsgl_cCRkPo4zNXoWsikkPx(mM5v{!L&tRZ&?W?qZMfR=fuVX+eCZ zN?|=}=0;;1c_d{T98tmRyjMe6&)Kg^05)I>uf%3Rv3ou}xp*sIR+z@2+Y!ZT z;-G{+^#U)f0z)fsfZYk1>^KXi4rGI<;O=2-d%Iud$zdgB?zvQAF7bfT8ClnD8~isC1` z<<@FDcMxX$viWkbP*L@)LxJQfBJ}e&OeTOoTH$*V5aZnDw`H++@Hn37LA4zCa>Dvl zZ3n~3kITmn%U=n{0fA%DJyiO5s>!gx09fQ}m+ma!Os#r2mwo)QW>c`*euH0t$q;PM z6vgWBD|ApBDG!JZWgl4k5(E9f=_`WWldiL=1(Q>vJDP9K5dn;&G~>aj)4uNC^*T%! zRSzA!TG?YA85!wOJ_UMuD6QM4r4;iG<${8PMVy?FapJcHogKfkUO3ef`AEd26w+R! z7>)#JwI|9d4Ds%*&WRSh)@^P6k07qqr@A4n`8PKPP^gv$2Jxu9bsNZCwie8Xj4fL# zKyV?eDU&lJ&hy41S4&uAz4DS^uxMwsQs|rF;!~gG-<_>1Ds%>C=)pwW;y`ZI9f_WPC2rwJi>?0B@Y%r@|N%03s`zGog>3Mj0`g!pi$u< zFI``FYL-aCyW$co=Ql7!&UrOkVEv)_0&yiRr0WM!(>k!_-*e4fwWG)y zm*URW>Z2lSWX?c=oem0Q4zm+E{?cpwKbVZSRqo*an0nR@?zt{4EqS*~Zh4N5j(U!E zIiGe!$zO!@b_Z><>}>9B&8x0;H|>nqSR`XS`H!%VbHu-3wxhm=?|Qvo^wsqN8IK7g zy^gnkM5_s`*LHhof$GTl2HL}^z9(w$+=Uncm$^VTq#gu{1C~0^re>s$S1W0UTaHhA zmVYyWZ-owkfq!1<@L8U)Et=0}!2TK~gnucjQ{v&9sm4#C9Z@_Mjppx8`+v!BAr}`H zj{H6`H4xOM(U2m@1k*E|K{XfpG zIxfm}>nft6fT&2Rgh2=bQql}cDJ|Wolyo=4hyl_{_aI1@fOLy6bi>d{4&9yK^Lmfx z+n%h@B8fi?7j9{YrASxEHo4vO~wI$HEyR|_?gw>E6v|sB>xpvjUcLQ zi>x{)iaRCuF8_;WnQb>|Cl6)hP zRyx|V>~w_A&@6SlmN6gkcI8s36<;-~Bj(kMD!aSPI8_$}Au<^2(LFCQxPQ z(nhnpyE|s@im@N7I!);l6%}<4qd6;RYisK=E04UqnWGE;Z8_8V@K0X2j_tg8KT$M4 zIhddpz4iuN40-x>L)-R5x;kdi;u4@I>ht{^9MDz0F<|%O)7H`H*+ufj011kbN1bo; zugQiE`r!U&?EEH$fo~b!1atvkGtuG@E3i}=pA0KFeGdaWavTJ@PRpgTEzu{V)39d; z6M-_db6Q))(~FA~A(NBLh@JoO;@;MOCyaxeitt~ADGQncwy}Qe|E>nVprtoF@qIG@ z)2j(|$^Fi;+`D_Ig7FD~3g_{>@egB-;LW zaV?mL%Z}u^WDDk+00c`mcja-~bw9!imR#aiJ&NOQyiT^Q@JNnKPohUmOZrgZnQ<3C zz_3yBd15-K7Iz1_3w1UhicGtVwHmNCJlj$xVkAPFih>%OsDIm_|JNHOg}AJD1l@1A zPAa=nhul`Z2$qFcTk##NcdlHiShuu=YYU&sUS$tU&5 z%A2u{!3O{v03|1)R)mJ9lS~|wnRC8^DvKMBMNmGhqvQB_rCF&{1s<9D{B$4iZm zzd^Jagy%8~dexO;66xD?bZ;QbNRpVNl zuSQY{`WXig65%aGWB1-1A`44ToLa5yu}a?ISEYb9%A339x)ipj!(RY zqd+sv#qqm;=HJgTlB6R{nz77-B=65`umt`1cCih>=kIBcY8^Be9B{H~7! zRb}*$!MNbsVzMzG$-Yfa+0N(DTYEe(d(m`_oNEE!;{6*e=#VYA=83sM76BsQn%$zY zPfScXiA-Ikz5jy1r6Eti{>Wwf=tTVwet7H-86bn$9SZ!E`3wLRE3HTSttsadX=5Co zel`ZeXV~SV$%_5ZGb3pC-<5${=K(vyKe1=m8ca&N+uK$pL3Eej;Y&aQLRd5|oIca1 zqsI6K?Lbx(V#mDz3`)!QvSd&~-K`=+RfH9MQ7cQlB~Eeqqblj@WjDv2+kcZfe)tv@ zIkwAnPV&Y3?%j8QNMai}v=T2lwmOe3mMq1*oy9$Sb_`en--oPj33HHAzq`gY=-OQK z88<11h(Pb*DJJ#FKBm~hp`o0q2??gCoHQ>0&_4=IQKkB{Q0dv}>4lg*8BuNRqc9LC zxI|5LzRkv=wfW5qpjIbhF;J?~4b?^bgJ7-j z0fRi4kJSWc+uU>1rM~Zb^#(~8o3*Hh=A%m`F{YPyL=3sCs|G(qilFR_)Z2Hn)Gq_M zq@hYe!0(>me-^GFvYA12tSyaxu(JT;qc6Uk!=%Xl3EFYGpUXrPf5L$gvxg zI}i4!%nuU@Lj1#1U;pM`029AIAiuUGz_g9aP15<8B0sx7_*n!J4FMpc_?i5r2@>g) zLz?*Tj9@)7GAuxixPZl>ZT((`S{}U?m~En<;ZiT?f2p;Xu&}TJ%9WA9 z2q22LyKAmsOh^YV$|7rw=xXaKXtQYY!w@4GSy{~IDELUl{H}U#Zf+{P83Zs}Jn-65 zN>{l6kl{c%w1bJ${`QIk(9evm&vdo}p!N0Z*V&J&UIZNh?FkEox?125@d5n;^5U^2 zNenT@eT25Y8~`lUN%EIWIXq=d^XKqa@se`w7+}0b&hG00#v8pav&;BsW2w^^QzIau z;uFMIw&MW-{j~l?VWHe8U780L4X)% zJ8vx(9v$x8;Iwx+bXrshX8W$hniQb%IQ7|!bVchoS z%VbD?v^`3IAob^4eE_vmh21`6Y-0(`aKk~1YW$Rnw1!5&{R_>c4MccA>nB#Cmt1H8 zdW#eRMDXVc`qW@gRBBqL*Ebjn+U=5pnc)7#EI{~MDJun<;n_%}9}q_xR^>cv$$4-J z!U9G6AdLW<>ya6ZQP9KC%J8~m3LxvTv*({G!bUxZAm!_LB-R_ke7(b0HK=kpfqR8>L3uhQOuXWWNu2p!)6fOB}SSvR;~zGIx^S z-#R)R(E;?OYV*6EY(W1(hy+;pSk}>Q)_7Nne5l65oC@F!7rzoxP&yfq741c`Z$+iV zD$GbGIt^&u*(eI<_Ie0-C7Rfd3mpEQN>~NIe(@yCMjRck1ruMo^aa?XwN(6t3p_T$ zwXw-V9mbc$Wv*x}L*;VH&~k;n@>^xnG_Siriw78OPt8U@8@Ai?a5$pj{?q9=|uv9G2Wm>LI)UPd2!#Mwq&lFDg)mk^)0p0Zz zs=#9-92*-O;nkxP<;#Ve{l?5La~$CCC;A932F(cH02gT3DXb(Jq~G3R3G*Vb#on%$ zalKe?k+>oN%I9&2B*3@cCKOtx$og^iU^)`XIy46Y5)l5+nun3 zPA*`xf}6xN!%cSC7FcGYP7v_DJzKC_d0N2YAzG$uV#0$KaugCVHC@m%-vPT3QIKZM z+@bo)7%vMb`gSnEhoMle!I>sr?<-`4zIh}ELq0x=3HkGCr%e2 z*;UXW1G9V2d6Rpv6RJ{OMZiHJ@G@2&V8=lK8^qW&R$_`s&R_UF%lW@R0t-(FVY|>| z1CF@lCqa9A`~9z{VQ(O+J%Hwygjus~;`{(qRcoxZ7V*r;=yD(4q168ekt@ek<{TO2SsE9@R2E?Ib|naCz&pK2ei5P?>vvO-k*8 z#2w@1N%*2n7Tu{eERdQx$MmUdL11Jg`4xJv8)O1`c{%J}#4VQ{v-SVU1%x9R0g;5_ z3s}Gtb0-F#;9^h;TmkcQMnI{#yFOt_%5U+tGVGLKne0;u35n80+bnO$G&EbkIf6vm ze|u+hz#JL_;BuDDWtKbz9r?-(a2fmD8MwJSa=0)u#%S&+5e2$BN#ouUdvhoe-a6V@ z!}!ni{~WEa6o%9z!qtFj8H(cyR8W5LNtbDi<{l2w((=+O-ByFJX&;Df8B<{;H7rH%VTcsxbp)A%uvDyqh3wTp=pxRCL$*%r!m02^h-=k zY}9x%-5Bs*iF3$Co^Q4828&Lu&wEFljSqC%6QVh$*#d>1~4 z7WS`n^z zP32M{u+<2n#X<&D$$*3f$yZcv8~dKQ2m)y`40wPrMiHS+=>dD&v^0kc;PbS}R|dW( z$KO;+e|^ix1Du@|+35Smpsc6`Q`mTb!`Ed3*4UV__MTelPO~|MRm?psY$oDoW@gd{ zJ4GNv$-jR<(i*nScC>~#Y;{;~7em>OLy-r0kZps3XXAy1&Ym7(#PVsxow%a-@O3f7 zx|n4(KIa#}(-q5YO3nn1gX;0w*_H~%c=js?TO*WU+-bhwV}8(MA@BRsC-hhjo2R6; z)@^F&5!!)idj?2jK6J6PM^0f{-ULhB{qQZ9Unj$wBvt!5y#hKRpCCWfl&yr0sH>Tc z-G)zUDa3gXX|M)QGL)ILTaD@OE5AT`e!vW9tTVQ-j?4)7-X{K)W5Kt1$cXb?c2+;7 zY2U-{(EQrMyX3>WSa`$9Np!2~D0_Yl1mX(p5OqY_9snw_6#xm{$^>?I3eV(L`^6|6 zZrJ%F*)oTpF=iLHFD=RJ`L9f@b{tSe80tJhD`Mj0>?2#LXGY!%i< zEIF8k-!&#_X5jsn>Jz-tqSE==whKv-=cDYQ#jk}7E^<1 z4^!C5rdd05>I0~6L>o~Y$2ww?RaWMV`xxViy$3NQOifLz&2|7ijc>7RYxwwyuTZ69 zj@TO*2k7xtt7tzzKZRa7)~|-i;P~?kY!$WQb5L`HgoctJ59STV2@by!Fh%UO`@6E2 zuO~3q0N|$_x@ASiNn*CgNGih4VgOg4Q(6 z_vbN~70)C(fSrF#SBs}@0T?h#0bN|B&zjEs^iq+0CAim@pKRP8?$9L+6z-tojCf;8 z3-|xL-jAI|p8BL)xwEEn9r`5J{B(ETrp?th$)$0mTS;+ug9QqKVEn7;5ggD&NZ zwbV^PD*lrJqeAtM-;er%0qhr0cvw6-){cv21S;TfRz8hdn>oEeO^cg!+C};l6MSBW z(p&d==VLKelhnLC+mEAb7>Hd+KzeNB^Q)1ggAso)OxuQLxFpi}+fjH)yAZ_?u}X_z z{krWC4G1C*F5_25NbPp!vML0iR1SQQ$Sf8?9vcU%GoF-AP&0c@L-h+dV8LJmQrl4%U6q{h0B|UOJc@EgB%zrspyz*6oDugKeiZA&c2^^V!wb<9(zF|@HkGp^nNukp?VWKyFk7#gubEmMHuvsieG_~tFODe`xE2s zZ4^x4*|9EM655xekY6GSu}yF^3XSuTT$b0rv1ve3?>~+Tq^;PUjq=r6bn$Fke)H4six*0faw znNbXECHS~O0dwUO7Z-PrHUMm#P$tMi8TH~M?wA2=uP;H{<%}{P*!Ei8qE%$2S2~Js zPHLxR^_f9y#Va{A)oXTk_8JD%l_Qkw*t5VaVg<_eb#s9)5&I|vn5dG_lme40nUgYQ z%dW8)=w5(m(4iOIIoo(bt;}v7)_+U~!Q}I-4`QHL4wx2*sH+dWT|3%A9ytmhR}*4- z`u3-%>4*uE&$s5ydlmyegQ*T6mBDq@dMCcA=fUhFsUevz)29Zg=e^6t26VNx5%Za* zJS6);I0&8&zV!-2f_!aAQeTd-Fi8_{W#)HS+qg3BZW?Vllt`5QI8*Rr)nfeGFV5^w zg6x>v3>~0%@(gcCOgq;4kXasVpv72Y^$N#ll91mV891APy5Pon5W=3o=yh_Xmb08? zgN~WmzZJe{Jr2$ygbYd)mQ@ zO0^Z5o?XLeaBWAROKAcS75I5(h_SSq!1wB$!w%!GL;dFFpdz?B6S+<~^GH_qJj7Ch z8rQ3<0jDWlWC5k;{!fR&q|y*838Vx~;Mv9rBj<+?Rco?R*OIiV5>I}E)8xPP?tw7s z-OEcfEl~@8v=`l^1_JSZCZoWs!*G9Ps`jNAa@dh38pFr`Z&6pPAeg@^;-JeJBO3c~-K?!9*8eF}ty4TU9=sO-n?mRkU~l^o zcE%k6t%Qo=Cl(+81ZG6mxU`2OtH7TE2zbCS0Y3i#B!VUWi$nZyo#BS+l5?WkV-gfO z%%hFBYfEu(|50mnzEBv{ZJ{`K$&*GP_~v{N@Ss4{ZGt)L@HaAEu7307Z}qL~wYPXo zyRAn%FfrEc8MNyH`V`*1-J1Coc=zDo)r^dcYBQ^DMP+>GxdD!Q3H02Yot?+S4PrbS z8XCN712G=FOnHeDPW8p^xNhR^;B2r22;^Qo>THPMFlI{murY@w-~~{{!Ao}wd}rW0 z`X86-zamBw*RA}poXhH8;=)-j8yD?$+Rt{0yYGlpvu&zoA$1yPn^ejzq=0BZW+NY! zUU9-hrgn37S7EyoC^|in;VkM-m9M3v1+K^}nfs+RiXGNmIKedW_l0)Q5aYYWT6k%9 zC@6V)3xK=WaG7)4-GEUR#HIm)7#jJViq-*tk^~p>BViU`@mvbHCwPM!022HGh8WCk z_59l#urKQy_y@kjfYC{M2b>ezgQNMOGcFni@TH?H{SWQs%sou1tK|><2-WSPf(5CX z?^``Q@#@sY`_&bpt7l&cs?!~C?g+it*dA%Sui8QUfU4-NAaMi@O$oV#bm@a(av+OY z+AD9Gj$x~+_11sK&K5xim)2VAXyUHGIJ&m((Vcw|k)_`s*r##tUR583CO?bw*{;!M zf!y3&;0(?iA5z+vMD|%xTN^h1JcW$oKFg2htmjW*>ElQ*$$Ef2!R$N&$-ux=s-N}~ zT|yh#MZ#0+y0}c(O&{YAN0~P8@{CQM_B!}9;Fd)YD%*dtj$v-Dw#N z%Sbl;=}TZ402Je2J?%gMo^2<>Wuc1SJ^7Io^80D$w2 zm$x?r5tXEmVqxK6Ckd*nN1|$*I60_kTjzZxmD0Mo6O)oq3;STa-Kn*em=zQhWQZ!} z4UdUgT+CUENQ>b+7bSAf*|`MuXbL?RPBA>+8I_xm@cMjm_?Vpk4^FVIgm)7Q>}W~pu#qh9Pr|q8S=B7tABy`(DXQJiXA7cdE>p6_>(xEP{NYN#**|&70lF9h?&*K@dz3oGL{$8)wN3Y(<+WQD&&bScb*wvE6>Oh zsN>y~at7AWC&?Im_LLH-`}7*lRXJm>c=k23vh!sQc|?A)9x*JlPJT2sx5G!DHfOtZ z)XRlr{CM}rv-2M><7ZZ>Y|th~Q4QP@E(QjPtg_7r3a9C7U}!KBFl2@#o&dwC7rBd| zV)y_&(LZsN^ErYhx=ufL$4Y$a{^NT=6cEfx^SRxV;BK5BSXfIp&TG0E#r`)6UNFLd(kW*sLtxDll z>~4H&&U2sUdU1I=w*5qD-S>0W>$aa4pH@F$E#NkMwc=0qK-b+Mn7Lehh2^2EIztrA zBIRPw07|>#uy|pXGNy0qv>2_sZO+2;wUvfL+Mw_t^Iq)?{DdWWA4aBU;l-!aq5+vN zgF|$?|JsfD*|FH%SX0+?z6sQ~j|7Ox4&Y~Klx^B}|eRT(+fs)gTit@~C7KTSF z#8T2cFHXJB&XAEZJ!nw>f-Z|Ky2buaOl+yXy`8U0&(G5p@M*B0JOs>pMX0Qr!LyiI zTvS3JDgX*JI0h|?V5M~90-6+HNKW=9LFI(oM`BWP@{PiZzEhbZA|j4a&JGR^pcmhm zl#)5;AvCqi!g7dMYptuR`?Lq!`oPJLR5us!_gPl3iy=B+ho0Ci&M45zqI7A%LG%-~*&bx~1|@cN!9VXfY*Ypoe_H3!%$u^S z5ux4IDSC_S#6tk;mVBzfM)ADg%n?qlBxhgyNHVO8FO!o1mfN%7kH5;Vdq!lC!EzrtjOz+~ytwpfEaJ)=0bVDsClzgU zrP$yuS7kSEn#|nh9Qve#V#xnql=_Ex_Fr4m9@}9JQwuejnTfvktn?(L{RB8Mclz)L z1dkn8#?sMQ+d~g&zNR`Ur0s^|wu%zZ0vKcnt-mi{%>Uz%OesDev58T3C5HAVzYwc5LymABgqaIREaLQ(C(hY2VWyms zQ;|blKRoMW#}Q{il3O=h(^`O)X`<9O8y3Sxmc*j_j13UEG5tFg(bu>QT3*kgs@etk z4{JGX*Yvgw;q@G&#Zp;Y-JF97N%jJ9yDd6e{^1g%mCyWRQeA|_B8S_hLJtz+;-6wz zqX)8u;tx_yhs^nme3p|ggLMK$83du;mifZLx>?-;QC8!R5?_S{S@fw@ZnALkWoqut zDeCdrQJQouPYYM}Y);-6GaBfIaixBEMXD{`_)k`|e!pV2ML-l+d}O`#n;aZgHDXIH z7F8wVaV3?VhvljQUGos_#`fGu*YdWY?8aoPev|X+-~i0def)N_=2C0I3#Y|mzj!CB zJ$rUm0n%jwf}SM>iPl>~j*}yFj4T7RbCK_jd^J(Nl}Se3m*2}Y^V7mv%|FUK_54T& zgQtjtuS1uI6oaC$u<*wMa+`xZoAtER#CyRUw!m;-%FcQEsQsA7N?r?I&TeIh-aU+z z(9iCO|M4{qIzUEyef@XqTim$MeA?pRdWk&OwIDlUfsUG5ML%Z4o>n$@k)cO=3_a@+Wf|C$~Cr|A*x!(su;jS+fU z+icYC%vmQ79-p=(P}lTK>kb_EmAdWIIteXx)X(Y!Z$9|8S{kYPpbcNt+WoKfN15%$ z=IB`)d4H1b_li&R)Vlh~w%-=C$;GMvcp?3jE`IvS?ePUbA5dAuhW{iV{PD;@>6VwV zx6wN{f-@Vvj+Z}Izi`Btb}acrFZ-myZFwt=;<5NZB9iFjs84;bx0`4?mV2MuQNXp9 z)VJDqOm2KNrZgW;(q7umyejYaJPSVhLP!U`TVqTxx46asA$?*Wmsxw=>8X`Q67RXR z9voMmcDalew<&L<&nv}XfG(jb$5FQ?$DIQu?eA4x;y40Y08ycDf__Vk!^s%EnQ(FW8 zz+jracJ10du$!@Py43MAUdIlM_b^nn)+4b@V5t7nE0O-f^_OZf_9=3NIf)G6n;Oc> zp>gr?dYfzm2Q1s*v2}?%vV&fsi>Jd z)vA}R!c;n8gWt7fz@#=PL+d1}uF$uxD+m+gH?7&{+xIs;ifzOdc;8$1#k=8jEpIoA zjrc-!Qa6A9<8`x&WSXloH-!@z*fLdASGg`C_bl?2&<*Tr4ktMG{diJc z#op%6u_*MO|LM9%w>F}$2UPE!m!@3~v4QVl=9e$t2xZ4Zo~M(*s1Z1>zqbBOG@sAG z(iaR)8zwDHn~C#Srd<^h*VSzmvu>6E6dX|~D;Ze`Ke z(wz>x+H8@_*W_7lfn?LZ8({2nC)w#>H)AQqI94y>0tN}Tq{+)~ogvb_&3p7>h5qi{ z%{c4#E>8yq;`lRKFJBW6yjS4+Xb)&s1(KQ_DFcUCGX381*y16r++pEp;?EZhJ(WaF z-sOxD^Hw9D#s?68&d0$08N32~&llowUmZS)YId$hm{8I3E2T-uMOVq}sIF9X5dS7H z|Ksxh_a}147%UdhmqH1Cf8alfeVn^)+uIJIh=ebA51cSx?q;cJ$rp+s>}JZm@|u|* z>zbHI0+EyTIk(l+K5H-HgiEuT#)n!khf!Q`0C_PD>oSqWi3Dz5o}S^Zf^T9n%=XNE zrO#oTZt=%AJ8i7{a~M&wI^ksq#`MJ{3L9K;lhL2flDlZ)rW&UBlZ*H_zZ0l@!2YSX&^G%Q`eeWP zgAOv4MxI?`BE||qMRHm5h|fv40V2(W*;^`ub#?_|o=5vifF+Gr7A?8`lW^@yG=X$- z-H1`fx_Qb8$HIioopEoL2WPHgU3`u<)1C#lffHhgqekjkU{bBbMN5 z8C4snA^_I>46K_+H?{0Q?1$k}K;bc0kdw0-=GHD#$u|jq(QHTK%xg2<<~RwqMS&O1 zy>ylN`wMZCJrC5g@;W->eM+WxBO&{;O%9oF-_@r*PsW`~$?`?G##nXHiLtdpQX;33 zh@2?Cs=b{1wO~SGBK@^%qow$TyGNNx{pV(E_I8f6w@Td*F4?Xt-PUKH^RO9MT4t}M z?2JV3aEa$OmY#LH97!6JUj`OBX>P-V3~^uXPi#+wX(1+)?hWfX?USWcxl*Y9Emo^t!H;)pH(60jk>edF76-&z{vSl&-ah4!G-Ews#-4MjII!%?^eNz!#f$1vJ~Lb@d(O zV*S9j?s^vs2jV*;!dJkKwxYLZ|NKGguth~hTjuKU-cKUXQDUEQ!FMXnv2Lw|Lzj#a zssHDu;2brVE5fFy9ywkS-Dp_}1)VZ)>v9mN; z9?yK`iB6f>R)fgMnGe*r1IPsPZzgGYlVpT21_)Y?%mEWi^v5D(!8^Fi&bn1D z_S5)3c6VU!`!Y$`(nvc#;)xyD4z@-O3OASfTqbbaoY%z~-Y3vvW3;{N!8zWfeKu@y zSy_8{{nC@t6V$S!4~FO0K67TuU~EUZ%{LAGEGH48EG1@#6H!0Qh7f%?m8pA`n$B|+K5QsuSx4cg>5I|b-ur{pJeG@QeB}Xr*~o7iD8%(KbFvMn=}z69xw^H&V_EP4%Orqiix|htpCr zQ~ku=_U0jLPESZ)AFfi8l3HYs?IdaLAhl~4e*7^ia|NYmpN9KC<~i3>MvV%pHkDAV zKdJTI$A`dB*G2Z6hi3buY%k6inYXpT@Rn1Xs$~P;)*(+)->O{M_PvVDrxkL1 zzfJe2bN7acJj-m23^pcLWJfNp18!DmBYR2dj({dBa|Mi zS|)bLkAX#yYp}vQ3$JSH2h(pVrZXtK21e6%hWRU}Nv>-ITT%QhWPblOx0B2uhF%<9 zItHCU2btL90bRI%)B(r8j8(=^O-mh50;g>$~JN)WQV3#z&e@8yn@pc2N}1+3#2HUoX>5_R_OyJ{q=9 z^+WGSBU9TwuN{o#d31BP&1c&58+%-_Psul}FiYDuW^5g}V=5IZebQNW$;_pf7_#OdBVF>DX28}Zx`ASh7%q{Nb6_x2dtLzAJKI`@R~+5D09IulB3_CpC@8coM7u+Y4%ihg^k!F^?oDkBm{$y$HW!ib zHyiXYX%;Jh^?sOhynEotu3dQwm??3YnM`2AFOnRySl`z6c5`6>lljHbwA#J0>`7K^ zY-L|X5rI7leA(a<)}$|jYHP;KT;nG&-2Sz;cJ#Dm{{xAj!|}&ZtfPuCZ;3WdQd%Nw3yjF>aGbjvbV-5Wl#~H8Py_6hUHc3v|vN7 zT)NH#!sG9H=!gt5WM7IrBOQoh~%6A_+;h*zI7GN48ievHJ z=(XRag-t*3^(!K1mXP*oPs}^nuc7q;KW+7r={TwxGo<@7Jnw%!`a8JCT^a0&pW*EU zR!&TO7X#?nOQh? z*8iF;bJ==FpObk^>Gqg6MI%$i;6~&aJv}|N+-_~oH%-ULNPHmQv=gw}Sd^(T(Fid1 zFWbF5nVGdlvtH%><_(VNP_ZJQJf{!zNwxx7T&rnwxHm{9PGGX#T57Yy@^;Ur51Zu2 z(|;Ux#%7b#F{%h)t~$P)C#SYjT$MLYP@@vk?pVL(6yodIpYKMjtFU#xDE%ZiRN7v$ z#L{fD_&8;0+88H9=|v&|+jI*1URhfbXsx|+`d=g#$eMk~HK+sPM@9K8xL5>-q5Z*W zIX|>*YKgguVZG(O0zLI0#n8+av|m{g^DHCcQ4ZU>p|#_Cr4Zfvr!KI_6Mx0#-~Z%7 z3-0oz+az6*zP`S@fE1Mkc-oRcw%vTRdR#CW%&?c8By&i#AHKr(L2epSt+D_Lq}K>|c`XQQvl>eqEPblf&&V(1E4 zeLBe~ezw8Y@K3zuS4Gi}N5stng10mlb)RpJT5zpHTR@>LsGu;$`b6JI^;|F(8#={$ z1_o()dEE;E+WTW9(Z^-LMsTLNdAkybrdCwyI`x9vBk6n`NI06{$w6Vlf4Duq3tVU{ z^=A!X<>276QpWf}wlgu}QD_ejk8d0cr`?$(+trDOV6>4^TB-qvt4Oob7169IE_oYZ z=`sr#)KN`Yd70-ryVF-L5qEcX#)G}Bb=~X+t<2ix8eos9z+MFHk3lS0cm~gJ{gjKu zLA9pST3e?RVki2z_CWUwkVlG*nitgM3;Ps3l`*K=+CkT&hO~!RvR$x2^5=>#j6HEg z=r}Vb{K5W{1{rO7@0yDmQtr<<0|U>I^R>_7S%B1|?7a0g-Cj;np0f0stmt&-31sp8 z$R1*PZkE!QWBTwDWdCF%?;pAUIVHFoS5FufN*k7ESK~c(Jlw%l+j`IAg39A@`L$_i zdpHmyniuAAS$usCR*l>}01Pz+HMM?`JpBiQfFPs_vi^9SF-c6}S_^OR+qV{@GizGF zeSi`TEi=Z)$B(u@{7j<1EN5f$Y0bF%HM$9HUb^CrdP|qvVwA^2-h^2MWMmrn*AtX& zqtza*48|rVU4W?WoK!`C*cK8YMQ{v7R^xpw2KjHJiLJGe5dnB`QPwcPSZ+DcQWm-;Az?=-fwGi)VDvZ{=SBag z6W3<0dakrXV7NeZVEvC#@xN13WiG%_VtkXgtK+Ek2VN?g?y6a#iuW@S(|L#sJA`{e+CFJH>>_SQ9 z%deRo(j^`~v~*{M11^`nIxinzI^>j(&O>XZ0otz+H8r)Dk53Sp`}%xtYXhE4&H#VK zP9t6^13kS}Nr+>6ddZz*vxE0wyY2o_{CwMAUt(hH_V(RQQ!JBW=}9jGk(df}*(3>U z+{)0d7$&O))l}lbK*1c)zQ_YWb-=dCTmLzle`|0&+YjrNi~&UqNkZCowqe?qC;oWl7@uSSYQnvFTjpM5c4r%s%y z4RnX1ufrQL2?Pix9`>Fj_eC9^p4EIj)*FJ~fF1Ci5! zA$D%N7+R@~xcKJHo9#|qm_TE5b2o6c5P<{6)_S41n3xCKod!JLh-OQWuZ*EO6#rna z;>g*c5bVrBqIQ+F;fhl_cRiZ*JAr-}-(SdnaWJK!LEPVVWT(Ym_Y`lqoAU5gK#A&R zEEpWa%UD%F%y!*6sD|yoOkxd1S8Rg)J>1-VleSDhsS8J`zMA4FfD%-|u^Am461HG9 zz@1XF+m&||i>%o7H`=(4Gy8a`CH9OPPj1R)`Kg1Z>7!bSrEcm3bPP9n4kaxO$|lY< z%dcRebEf6QwGsk*-z4hWR;L7l^+OF;;<|H0Y6}@K{dm|-6=h%?E3e40k(!<&Zz{Bf zC)!&6sYAv8d~)9&%f}DntdG1y``OpA_J$K^bw2aN~$y4 z0XYZCR$PhmPKVJ*y4o;j#VQ(e_oxg{A0LHSthrVur>ekg&nIOAIbwTPhS@(CN^4qh zxMoF+=d zat>p||0h8`j=>(RFn$~p@wD&jMZnglg@i~NQtkLyINOR;Oia>X?a2q|YRAXJ+jhwS zxFKdAj7iuc_F|kD+kszR3Pf6DyNoei0kW+R(4r2lU>V6m#LJPQKp%g9f)$({;LS3qX7q%gOZeGymvF;RK;Pm`sGBR(yE!=<&|$o>31Brp z!`aO4jwmaC+sw0zy}Q(a8DM3*Gw+BB5?X-c^Ym}~4ZCiM;QN;xmB)~gZh+~j+`U>W z>e|GpXjCWRnzP56W!MunFmU~pv`Lh={>ps0vnZeJp(D0nG%l+L=7f-r)1KhVRG;O9 zq0?75_+}5b>@gqH8{Z#@!aSru87}0QgmP&7{i^A&rV}gEM zh5La%VFk8q7p*a-3l=JcMP(yoyHMSK0;N7_A$hVp zpWb*Dq|a2|`)ohfVE?m`Z9+b`e@L>1retW%t?cRw0###%IHhIj8HA7hnAL>G2*RS z6`=ADLLw2HVA-Dipe4#@ir*fb=^_-_m*98rEEi`~VcNO+zH7$2K*p)hfBM9aH;0=n zV357r2WLv6{arRi5+ECGs*KOQ_|kg`zTR+J&)a8(Ys0OW{4zP^rV2^&i}Fz>MRMnR zN4uX%HY#F@WSHl7{7;e{({nPoIeF5H|8@_H>xYDR+JOyZ>wJ&)VMRsvkoKOh>;9Z( zDtFICo8z)H4tg7(aqr@K%dp@56LSAa{{O{u2Ib@RgFCeH?(YQU@z}-$(^-Ry{bNthO5b??{0f8>ba!=4gKcB(kZi&jAfr+n>!Buh zBY}KHiywGMv}ECTDmx;2_SM!PUk0f7i+&=jsHiwhAdqCWG26WhWQ++wH?$I!JIwFv z=hvg-&!0#DpB4g@PqbYd(4MaYX1{&E@o-Eju+#>#l54>A_#oQ#a1Ch;hARW_4dwy@ z12<}zT_0)hG$`wCHvi=JzkSo~L7LQfU?<{e%ID?I=?nT!fa^edjlalfHq(~Z#vy%V z!4{z@=_S=kO^r^S9U>WIVtGP|PT72^7GBYA8VzIK1Y>!NYNmx0xa!c$j6)6j4^vD{ z8W-daTEue43GEj6J3gdFI4$&##4eV9J;S~>(LUIoRUD#4H2J}Sev;>(qBIba_X?Vu zKXHADDduOxZU8fCQ2v1>(;wpEuA6(vdK|-IgE;8zKs7&V`HJlA#9TF=I-5`|g~Qk5 zz;&UbLtchIL9@iJ2oJ`J_V@AW0>|}RTf}@g#vBwGIX9ONGK!L-l9JKh=7Q7y3TmT6 zf;lBKvt@w?NcEux*aj1AKMQ0*&!@(00je+umeceoSL`YRv8faQ5L)s&J3FJ-BDD#d zk0lmCk`pH&vfJ3mLJX1dK}G(vk#cy>zyec}pRaE>s9bIK77FJ4co${G&zj$~@~k=e zL;Arnb_+}90-y(B;1XQhAn`m)KDNIQ%>p5X@xJ26n@LSNI6*~r^2(-_La~6A2OxfW zgJpTZdab~@Z#S%>UYaAn?bv9;`O*BbLdm(Vw47os=X0EfCN5J|JDttEl$fSv`unj{ zLNOdaM(tl&`4^vXn84Kc#hfd)Cg;nSFSRz-f!&oVv_!KkH38>1wm$7+_gmrqJAA<5 zY`v%z-m-J&gufppEEC|hDFp?}H*VYj_^q8g1rZ5}abJ!eYjr%9W!R{S0alKhx_UG` z{F)t5TSY`f$hm;d>eh{YP@5DWPSf1I2vy?M-1Dm}Q z9|z_#nWc-$*HH|_-rYLB6=ypL%W(B6m%omu9bV=c;dp_xBl8#Ez_;Hq>i-yJpt^W! zXWZH6y;pZ{j{(?Sp3;FM#UnSso&J_~TF*4dZ50$EDX)R{E|D-YeUhq!|#Zu`Mq4Y{lt9SFarGENi z;-6=5R?S)3*u`qSNg|XoHvm^dYi;XwZHs-G+VxuECB^8p>ul%K681iTIgP}gbUBTA zb&^JNL0hZ2=Ef7%>$h>EtBQ^|Byxx-4LaMF9qOEJi_0j8sJh(^B{Ce!F;UGJ9C%w6 z&hGbjewqJ_qvuEncOf(>l#C1t`T+Ilpa)?p2#Sc9UA_D>MD>q^7bfGjzYZq;UY*ll zJ`UX7L@%^Kvx5AoL2v!ucbsBo{TIy}T(sCYI1=oo_p8zz<;O`J746O54Lwn|w$A_t zj(XUzGPCV-(azzR+)EGq8f8^i6mzeVyH;Pn!B}i!Tdsw`=+>DqUB9m;7xv_p0FjZk z<2&)y&ije>mqorm>VMD|xa|O(&&YL9$fz6K*6pS^LDi!;DF57}UpYX-NA*z+~WES~;9@~fd z$Rg_wVrO}p5*<0;XcVC7bSI6pfpAnFr(4zf`ZCl-?wyjWi+yHq$VSaQN-QG1SBk+I zb=_08ulsY#9{>;;W9EkWf1T9l>c?HZ2N?-Sq}+gIh4Kj4*ra#Zy*vY{>}RB{p;7S9 zqtGq(eBp<6IE7Fw;GS>TOy1bt4%U9xJ+_OYZ7Gf(L4VNsJU*;MV)m^FtZLUXO>@=Q zlQG{inx#vVI!V^3*(2Rg+tudSgyY(uaQVizp^H_M9wW&V%||}_mr7ISQ3E%haP4*| zI(o6+522xbIp-g6vqsE#ydO(3h-kKhW5>(a@2n6!|EG+Jzuw8uKe?T^IAC%YOmDe- z?b=6|9V<9k58Vw`Gq9Le1P2Fq#$ay#>Kz}$s-k*}EEfg{^eiA%7A$n-FQS>X_Ks}U zYFp{~S@z=kdty4B^w9*}{p$AZdAKrRqvD}Vjz_PP4EA?z_O5}sbrFTZ{&A0$`bQi7 zHfbqyubTKlPn-eT;KJ0X0E;AMoxkQr2&E?X~NAF!z}9#dvI^16gzerz&W3g zHeUik-Jj6cZ~wt@*giNid;$*+CO63--1V}Bo1A6X|6e!IdoZw7m>#KqKL3?>WXDg- zX@(j=ZrY!QahXeF!8&?9U0n?uU&3D>Gwhd@dyPw7GWiD?i9T&N?Zq@v7Y^nSLs?_# ziVbX^YKS3*^y)&l z@oB_DyK#TjJsVbP0r|AEcMXMR_I&s3g8@Q54`-P&lV5ANO;dU zDYAAEOV9I7+iFet(n(k_Q7RoLhsuNL|JMlR^8?_&aHlx@a8O)JD+clrngM${K3+YC za+L7(@tKAsj~f7KSub5H{dtT)!9%5jZ6!xCw@SomjCyClpRaq0KiUX3;o@*RCb1Ha zqS&cyqS6Oelvq`42f*~ea4J9$^EWV#WQ*%mJ zllb2MCjq`@fEx9(Ewu~Q4TClKHe zYQYutz+VI!@Mz>aZbbM$duk6b9NB z$}K{a~*rdV?B2VK%%hxD(HbkqIkBadTFs?p)kEG%>bv|=GQgpt@g zv*OUH3`}UimIvTkWdu-ZsC8f}o@yloONEl2K7G1hgl+vxP1q~9=iN`c$<-;_n6;c$ z82OW2%5MO~o-%VANnfWjwN?b@e&08O%G&peOV2k<;#Vn<2Xiv;U=jl%s*D~sU3irY zNC;X5`tI5_sGElB8{=C3E4EJnwVATdszPZ(b9OGJ+OE7=PecIUtwB4;1YCzafEEai z6kl0c{*%J=uZOdC4IBrTBqg z=c6(ws*fWWWF~D_?T4}Gsbd~ieTFq5016c!CH%d||pMhder;Hi5gV5Th zj`CEiboNI=Fko zXo*5W->8)0Xj9C9H$fZ4cuP{*y&XxWFu8DnkURxU{}uiqzl(<&%geK^)=M^^iPFrvBjOly zgjMv}dq!&Bh0pKh#KI-4PFBtgFab$z103s1#@<^4 z0S_$=p^~I0J8@`NXoDFdu!QEF^^N^FhaO2NU8i;p9~pS)AzGWa_c)$ydnQ`Q_wr>5 znE5l03m-uuN$7a7zSeTO(Hn%2)8R(7*}y{?&(7MsldQ(*tfpYbhS~fW@4rIhAAG8G z!3A!SBlgcXe%+3%#9}?_d1eaEH)_a$2IH6>Zh8-J-^@m=Do%+y*?_hONHrDY(Px05 zN1I(Qs5>!z;y#gHMq9(a6qv!LfM$$7e-n*4kxHZID1NZG{jdZ z>!2;|C$%)$V4NYkp>wOjoM~Lf!OA-OkGgs|VpAhdDfV<39+hO|43L|Jrle;QjGa$@ zVJH;qwI&CtDVpcb=DApgtSluAFj`&s&B^&ixe5LT4m|`#%-Xg6gf92J;K3E6;-MF% zq&pIpHQ{2ps~8!26u$Q4F8f`UxcBV>QoSf5LtnX17sM|ayt)m#B!L-wL+PR>u{&46 zIci_BKT-h4@3<*#$HjyfK6)+xN7`2hRF!XSkB9=Ih*A;)ih_cabfXBUbf=&oA}!q@ z7D}o}$3eQ0?hpl(lnyCDy1U_9+nKr7cRYAUzyD_7$l3cBE1vbN^$f&kR;ttxlM;Md zX4b4L&P(>jr(bO1_>{CC0JKc&vSZlRmwnF9VieV}q(#*c(bu1E>5i)Pzxl}Gzy^!i z(6bF`nzckNzT`=(%nU~{#volm^Og-#ZyZJQE)UJh$}6P1F5NkIKAr|j0x%g4z48kt zOdwYMtt9$~juUhd3AVM{Sb{3e+Ih~_yf3-CU|zRZ`*vlQ6S;cY*B5l)7+t10W8i-* zJ3DRckf><46HiIw$aR^zb|g`A@{;jMEJ?p~W4nHW)Q}6~>67qNw~mS3eMNekn7}`h zJz9{4RXq6PdZs$Y?oi&dYwm_VoXO2_C7~FmGCP;T2n~v@O>8lBt!rwHU(($mHK*L@ z!OG?qlEN{6yIwGqUc$!wJ0|#1y6}K2u7DF`7wOObIsS1BtVtU~jrySps+k4HGNhSO zFYV$36sLhnld`CK^j2vcGYQLPqm1^T)0#zJK?4U1dtBa8c4*E*8sa{5zSoTB=PS)D zl;+aTnwXlr>5yroX2X&r!u$)j|G$A!c^DK@L)cCc{L~r0Wo9uzeR}3{|{Ei9kBnlC{MmDwcqw;HHYPRWvR)Z+Za z2BSMBfBua6{ed;_P5d&q>>s9V~#L4>GA-hj*=0W*B^XLF% zB_}DeVjZ$#%Lo{s#-ewd|0G{q+I5;&ADvn`RxuyNjHr4(5vwAiGQtIj;@4Y*k$ZpP zHrNH?gyh6p653^e46__K7GkGHpTJ@KAIw|)0Yr=OA zNlHm2=Hy8u6};K6Mr0JzN9w&bKUq6jWnRzhAbTqlntgiqB28<})ttc~KjHrJ1e=q0 zW%aDXS13%IJJ-I3jVx9U)J!-)^s?{c@4;O6vEZ@!|9HHi7g`?&NYJAV&M~SzZM)lZ z{Cko-t<9D-q+5#43)bMf+^_UUgm`6&@rj_6i4^bJ_Z^fO)?B?DnUUH^x^epn<4K+_ zCii1x#FEu~Smd|!_-^N@KS*SB_?={CA7+I#72-uaA<@3t^ z=^t*K%X{klSLeYmo`}WM5M>v?$vU^!*2CX>lzAWM%vB9T;G#lW%nYq`xL@U2=B}!0 zJYZWx(&Sg6$`&j9GI!`$csUbG#6uL-Zlot|FHcZWg!f)qWUo1#avIYxVor_tCrh#C z8WDRT!QF^*mfYU!h#%xx9^-uZNO-l^K>UqvSBIi1TJe#KH$HhuJ-udgQ0RudMJgb; z5988hO=5ruS+v5HP=hUdaerhL-PG$sqEX5n6d(QNTvsxrt$NQ#+4C2dFTrNZ=1VO0 z3?Wpaq$K*l8_4KQs0~=yJ2R1xk&`RQec7l#z!@qf?n~*v{^BPXfi|wBq)&3`V#1#w zjV4oQ(?rZMl*7H)+O|-9h04jHd0cLt7p{tu&k0Y zG&~nuDs)ArFn;G{@Ubr=q0d|#riR2F>{iek9?VQNN-L-`J0%O;E<*!@7w^8@D<~zE zwI23YDJ*XIvB>lvJAdai3)s8D6&g4?`J&rQ?GrgEq5R*6kv_g8_ zd-;=OP!ry4300Zbo0;HC?kreLz`;yy| z^7FZm;NcZgld@~`P2Ksq!bEzH3yr1RbfVVfqBK~Te(zoi;tz1;Ip6<4 zxI^?t^@1r0s4YGHJgIqQ&*OQM*9`{Jgd6gW!2|cak?=XcX{C3{mWhUS6^u)>k^gZN zsl~Rt`nyo?@|gZoCzjT3!Ov!<$}j5#g5Js)1>COHdB*MNjfSlGuPn+AtaHt7RSFB1 z+lv>+fU=Sw{yH)7;*y3cqJ^Z0OxW+x{f)IXe2T9h7NxJbhQnd8@jF;MSm*1AbuJA5k{1j!z(gPpH|1&v z`tS-b$)0%eTo9HJ;<&N&e21fK6S^Tfdca2Dq^lIoCLi5@+WQU9@*;b2DX@1CZ1<2 zNLOw@5xil_Z;`e3j&xtiz6($O)s6gT)VC%KlQF2MFIH|Hf;ssOX8Tk2EXU!?x4jpBQm_ru>$l=f^Dmx(qKelrS7dI|ZGtpijhVu=$2Gt!Q*Ba>Y zFj4i7_sG1AGy6pn7)?>mRcBN$V%LD4Q98D*^L#Ys%8M*}UGe{f59kjFbvD$552bF3=8a+peUWip z?$I zkPcNE%ZS3c=}f?U{kK@x4D@zrgZpbof`Wy2KLzy(egd@2VOp>=;YG$XPg<7r14h`; zkaAjJrKiu+L2&g7ptJNax20b10MgZal>tRJx-mzwS6KA-Q=@$+)w^)K7_<2(4{l0b zyVe9sbrQWWYw^|_^4z@~+=at#iHYZPR@z}Pl^{lRdlB)b)6U82B_x#wK-gt$EK&@> z#&Evlpc~4!5&l{RFl88-ql8!e>hpPEH!c`Vpz6|jeb%x`}r$%BCj;HNhM$8>*?77(h8&kK-mZB58T8)gT3+@)u$N1 z5K(uK=6Q%RlrDJhaxX!XHLw-FI9;ee ze>8XHK`Ck;r)Y)JKz_KCcbfN8WW>%3o@Grn1(NJjvQ$N{HIWX@L0uEeZkmUC-UPJA z@ITgr_z99Z4hhN>)k=kOCKeVtbQ_IE6oVSf@&_8vKC+;v6aG-)7xy znSV`p$WKtCFRbOT`k^Wsj`N%+K1b2*k!<|*2m&LEnh27Zt7eBy_RfLu>oELQu5nmV zA~Z5Mc2%@u4+?IiJ9_#`Lww5l@vsYwgqe4scsz^4p1Tahs%5xsAhtCe6taL@pBP7( zV}0bVoI{lMh`#*qJ>@=3447k}7#X9sH>B)_PZIRTH@WoPVV9(4x57N=l_neHb4cHY zK}-N%SCDSOI8BxdD})JB+!VWkC46pai)piuc3YTgH%qT)o~$;XvinHib^CnwbBnP$ z{L9i8S!N}1tgbqq8~tR%`)(=BKtjSjOkPX=`j=1g=S1}H;>cFu9v(k$bo^siai!yN zjhBqGp~fH8)B8sIE}VJWZ5^AO+&EtA#Yrj6zQSt)MG3j&D312flih4SY zs)@0$b#Zp18tpz^_IRg-@BhkpUgX1F`1lz9N0j!Gl5*oeHa0c$^ogJ2#kVqz9POei zGU8fRoJJE{XVwyF)3oPDf`ly9w3gVFVeY$CHJhW4nDg;1#j-3`btZ3_6RN!{2T#fB zE`JIa`Q;~5BY-F4P$x$#vvB_pw`b=+HMLQ?%LoQSuI}A?ZBD^9$w-HD;Yre{q(%^? zP+Qe9LqCOOJhRKYguHE>FrI#5nHfWm!jnfbOw24{4E++9vt*UcNq)-IKtA1xSIw+E zji=BJyZugR0L}VRlY@ds=IaoB$M*L2Dqvi0Oc~DWP`w%_s7AjRv`>W{^)-s^bW~uW zs2qLyNd*#TaOCZG^%_D;zqSp;ceM~N33ZK4V&YK=kSz0&MVspDP6=6Eba2$>;-4{d zaFCyGwGxzkeSOrkEd&9I%g@}w#GcyT#HG{8#<--jU^A+=f4^S&7 z?dj!91nipz9t8WtQcD@D!`5?HYr)nNOT~!zM>y_WX`Q}%wE`6~*qIoR!EJLzpFd;z z4SvEg9Cb8GL;vHkhZuowR*bs8xW{Dl@)L)$ZL;6vR@+oX#BFr;Z&@v^toVEx@8UCT zN*QK8Ae6cl8^o$7y@FSh75=%qXmEVg=gM|NRcA;sQ;xNC0v@W*lD)7SWrw1ui5u0{<>y*F z_;l3QQ$TtNV-GnvPN?mJ-BS32Fkf{OpN8*ur@5^_@q{;^^;92f8%`XTB;RRwm*drqR1Kax9=H-whAzBoiJa{! z{bYJ)hwYnGnAqitGU#_!$Yn+@pMRJ1+M=oM>gpT4mIc);3l~CS!ox;Xo_m}X^v3Ml zhlQCC0oTg-DB3+G;-gjNr{pQ4ge*Q8_szn7Ddo5b0CNj7Z?*SJgdq_(S- z%F0o)5gdh@>w`~nBy=q0_3NvWf>@QPJ6X;%(x7n?=y&b!gNH*4#_oXt%P@LdBJ6@2 zX2?V4sEy~s>o|Tzi(Nzc;#p-TO)4V>pn&Re$Z3AnmhoDC*b5EM(A!X#^0fQRT%wJL z((ylX*spuwR)VXNg;J_8cH<-K7IKzL z^wzCgT1);%r3i_MuS`ElvRit#U$-eHttg-B2QPNs1&Uzf0B0hz%TExq!?_l-+ba^F#0#ir2XVK@r*_vkM^z1a>nGG zZd1EGtC7CN}ilv!%YojXWbSu2lIEj&t_QbwZS8paD}=%vj+op=BI{fx*7UOY(c~+x1@gl{mK)`wU%{H=55N(mzZbT zOjBw;t{A-pRQo|S&Bqk7Ru1dy20fXj7YR13S_)%dxXk`Wih?Ny3eT-$YTeJ!ABF|Z z$mAk6JXB9OL|VOyB;ma$?VILP5*OjILnP-~AY!#|QpgRtiX|XWFnJZC(tG?_qjzH- zM)uj2Sd?YQjCqR;?YPXOc&L^*xj>|y)TmBe9~yy^(H(}VmI}52L`GJ4I?#Xn&rJi% zML5Vbi&=Xp$-|>(6$#*l+kq$S4#9Dqm>93~^!Y1)0V^<4ulkZiKri`O+z?5Wi@0v9 ziP_T<<_IrEGqW;rYb%@TEJt54FkLRA-W>x0-fii)ZuIA#s@@RTg*<|&cEn4?pkc4e z{u86?Ck+c8tg^BnM+qYDan5GmB@>FK7Xreh`P>-XwF%X|pB zW_MAy79K<-B~mhCkeS^lBG9$#TRO{(CDeP&GLAXqgs88Gf1;up?bP{`8%uTzhJsxe zztT_Sa*Y^cD^6D>txfE%8f&^vH>*5Efm1tS;Yk)!+L+w4b7&z-NMnF zG4>|$ne`HoTF%%VQ)u#=AZ-^c$i4fvuIAh2W-6n(Bct)$Xn0ssVESb5d|gDeTv;@$ zJ;xOZqw1%Ijut8k3xTJv5E6Rb4NbYARcd9!zqPv~B&Ri^KcluU_xvv#O#BlM?(Y`5 zVynh!sSSXx@_cZk6KyF>9TJBDqW*N=12fRK*@j3B54CHUdAn1&aD&<&+k@T2(?`{;i(C)$FopG2R|9(Yh^x`8@ZYareiIbjjwsg@YvYfN zjXed?n_Yb;7_{L9skZhBDsOUV;=btw{VGyBxl4U(Abdv%lym*9Id>ti@s@bss+ARV z_WS1XLz;$V&jB|07V^E-ZM@^M!`(#sXKqGn6xep=txO(b_~;xY)RK4Snkkj9Al2HR3Lb#?V8#WpOsFw|^FVAfv73FJZw2bZWo4tG9L z#+lw_)z)>@W5^V(L!xH9_j+2A!niL0V*mN~X~n|lFP!tzTU0J11ilnZ6_#rBEEOwy z)?OPvUpEsimlu61qNj~8u$xzW^XaAOj!#ZZAu_q|UmjpP$kBfWeF%o`X#6Lry19Tz zS2;C0T{$TPa$WOAWh+Ka?d_aV&Kv4IAX<({rqeh}yLM1*y`_3z1ifMjX=yWNA*|@W zi#E8tK!R>)Aj1mcDsCqWTP-VXMO8xo#dbJo92tT*E+O)~9LyVJ)3nmzM&27|mJ&_H z6{b2nD0&4Fn*?=KH``{7y==E@tly~{9sB!s{N&>N^H~)X<=dE*!cK()(R+0u2Jm{xxPnK4Oe;4Ls)(yk+IXp$2SyF_8bv? zC=w|Wl7K61TM*BhXsVQ)t&yN#h)QoV9ef;@1rfr#YikO+{QY)!vh%3?wP@dIL*D4#cf`Cin);SJiYhC0VC~0E=k=Kv6YZ7ciiP{ z>J9e@$MO~BRZTQ?1hvWz-jL7wdg-GPLX;#Ld^#JgnJH6d3g|Zd=YfInO&v%OLOL7G z>XKZ)z7QYW?BCr93SZq>EfYc%5XE`N64M#{uUA6Jt#U0|Xd}8bE??$9$Me7cAxw!A zR$|AGfJ}UNyX?|A6dyesu6x$%R6dx5+!b=2=$L$VI9v6!>ddxPenlY{U(3kM3X+Yl z5{zGubH^50t1d9B>el0#kKy7@g zYoxw`&t~c=9m>fDd%fKpmhYbzo2N~V`gJED0)(VcIuyEVJJU8W1e1=2%>V9ZoU2~|xf zv@+#^Ht3QTqHxM~OHQP#LlfOgSx5|Y7QD-dPZYF)B#fk6dQ{o`UCN#7ECH0M+j=eIZh;dg7w$=QTB<<)Dc?Y9J%AwQ# z6XA1zSl&^-0-F6EfqRN$a#h(PRzVST& z2M->s)u@zSoB&HLQd?7_15<3x=LX&i8jYEyG>W4AmJ^n4ie~ga5%%Cyy2*;jT!;g^ znZ4|fbaCHvSf5hOIdgJLksQH59v`^u`ecG%*HA4*BZtN2B)4E|EDx(|WhReY(U;hP z__o?n@daJWJxT*V55#ZEg10$X$!<@_!NnyG>Vsu}n7J&5>JC+TZ#R&B8~{zyjU#Tn zaN@4!<^`ZJI&K#Z#mFb#<+Y>+%K$MNrDv{>Cm4aL+z2^Vt~4ztq{c+RK$0eN*)?Xz zOHt@#6uaEzNDq%gy4!7@I*V11bt@=c2;*~(nbtsOG|_zL|2Z6z@(BL0^!8}(bhNlH z9``axRM+0O6ubC3CMKqTn1p$x+P!MSq?!8v{Ht%P;)ZhuunC_kg-t+q8aV_h1Nk)Q zWS5--d2@{K&@LI6=$WLb=BLd*6MJ&@jh=$_R{c8}JfmsV3n9F*?565@7YvcK^@#s( z=E~S!ijccqJ+SEHY&2O;)sU2uQkPWGm8HMgTb7rX*Kht1T?lzwJF+OAk4$>{BaCk{ zP4Qew%blX)JxC{sy2uAx;O$*&=6lvs+2`#uztyK_aqnDd8&qqun{sNPz7WL5NU-q= z&!s!x!5r?}h@{D(Q~$zm{L^MFh(LALg}*w*nlvtB6qaI}f*+dA6dkqzC@SFUq}VnO9}#$Dy*6DYv~-_K!>7`<(= zTb2ARb!+1$5Im*1zv$O5Dx91yLWo18digsYA%)s+V?Ox)*}l=kwiQ{8g5)Gjnq#)> zAT+(l1DB8CSlQUVe@##Xb|p1fMMb81%tX+KuRb`Jg@CjP!(J`zkmooBQoi=0gRKD4fhV>Q3+DGLTbS^&`m88YcWWzvy#}Opc4Y6M4Q0T{e9Jlt`;%?RGj>0QPbl z(Whzq%=T2NKuE7CWdd*sPJ%6Vqlw}kX1kXv`SuFS36F%&+RJAYI=BE}>yIg+ROz8- zNn4Pzg6?ns^TPh)>bPklx588AY_(x`vAt#4DcNz0GLVgbk0m(gw$?K$NTR2wM>7$b zn#$O>oMv0uo~nV>qyMGiaq#n`eRLt{EJiue0d+q0!JCV@2#5;B3`QK6>o?5%t`rsA zUa1Luh%)sgC8}_3Ze;eK@D^YWdkV3j`GTg=)@r|c{`JL^UJ8QDKZCaHHY3f z4v8q6A&Iv#mikIxZ~J=u4ibr742k;$5svmIVg6`@<6!#kM*1d&tKI;#{>$@kPG!wM zlW{uj0^?hk{XdIrY`{bWcEPo`N2^MmHxK5_gwHzZLx72An05`F6)rw=FLB$6;sX>D zt8uGsAr_o{6o3Bd165vICB)%PCYyPVE8LMLOV6*nLRb1ez~Nk5eaDskYrrLrO4rMD$us6FiVA!otPf30)S`5p?$) z-ClfuYzs~e_~94KQ&<#wz{F;OfD3BJL3nk4dx38CpJ6kDN}5;EOd$gZstkf6vvZn6 zto(eBWn<-#uMj0Nr47bOMvPg`_#f=s-C5Nh&vit*8kfFhnOjT7QwB-CB6Ja+e-_Yj z59xLlw;=`UlynCLNk$ZFlLndpMx)l2UmYrJH zb>{gOzD~D4iI>4Rgn1Izd6NANyB}p-jsu`bb+6B#wg+YuBGUo14L+c)0Ut-oq~QMb z5$9mO_I0M11z-=4q;;cjQ7BR_Jr^24;_RLM?45X{Xs^$cuWcs)lJc^ zyZj-cq19zUfedzYZ?P=nh##X_tq(+QS%K2;-J+qI^dhy%&QLq;6-N}W$05&Xhn*Z` z1CMWI16sf!maQ##VP^bVC6d^==xvejp%h`=jAPmPGMi&QF;q5Mduy3 z!QvF|T;n7kt8wjPaE;HbvAd7+ckn~(uO6@^n)b&D zB7IyJ&32^t=gRj7vn0%4u!V`j^MS7GfkSE!ve4LZucGoU&uX^UGJNDN#B|K;-i`v% za>Cc-)KXOKZh(>T!*Qi&WFq%Ap#Pf$3u;e9#TUKis+>YaRc*@*MF99wCW<=qg zaVa@X#fwW)3wK3rW$0}vxrwbmll!TB!<_E)-pKEybkyc2olH~bzN63k^mmTE@W3b} zW-g{*iY^^`6&P4;Te=v&)>$n`?7BebdOT6l@HvCK5aY=$WQ;!_jM!t_l1e%Aqc>o= zgy|E`N8uQnTh;IcRk_}-KoB#rQyT^lrZQt6As|k=%P+~J67E^-+HJ^d=<_dYD$p*C zkN1j4FI(vE-dCfL!ji#pt2zB9^|v-+8(VTlrfX@xZi1^@MMcFoaLn%W=RRVp^$+4m zg=fn3EsHqrp+kc^o3p+$t{W|^Cf(V$W^~XTShpVHvFT~=5ld!-M`oAyWj_Ct4Q(DS z{)cVh>-Ao z$!cGyJ&fTMAi!RJNVP?c)YeMIWq6K!Q7>_@?g25qd8(avu1_b^icGN?q|jZMO^ifE z`j7^Szla=kQ8`1GXGjE|9*UO$;zo7Fu<9Im`O8e|BZ8KYKf2zF2y)R0Z#w| z_L-`}yOYJ-4xzq7(rgSPXWz62dKCQ-kTJbV$|4r^?P@^~K%UG0NSj zC#shUZBEIGTwx3U9h1Ov2rPowz9wQ7;vhDC=^mTUCcz#4`uP&y9nHTjx6lE|_@ckG zMj+vM%tI7V8Wy9PARyJhjB184eTyAmMP=TZM6KVJ;Lj*IH33m+ZzJldVv=pX#_;>X z66P?k0EuZl&( zOF6VLB?x?T_nWO;Rer~H)svx?qf4};GMjkePJj}K>#|ewp)5IkVj>lrZ}NVcFws!< zy-jX}?mPbJ6MnqcPmrka>CHH_k=6i_TiQ&6-TeCoWDTdJ9-w%O__RnvK56UAeB&%5 zBto4pDM>iJwReg7{jOKK?k??m=dPqRbzh`g308+xI`sxIq|%j)9ZDltq&)uj1O51j z*pjpeyftocG^=B%+P6BGO@BkP1mMMlgoH6mr!)KGWl%o8ApYTODyObah7?&*L+Ic} zuUuhql})fL%NesD6q?#vc)G>CYF!rNn6EuR8R~8?Xz1>E#SY1fFj1*%DxTn?J9SOS zexp*8$x=3l5r88@E@eF}ausHu!NH14scd`jJJ7tq_ul&E5=0c6ly}PXKYm{D++s|_l&Gvd-i7ZKGL@KB&>}}OM z(T!JtxNzq4a9#HSHU40{D7$SzKf&)z^2 zGX9U}M{ZidDQKbKig-xBfq&IqSStA9Stjk$o@%eB8ygO%WSu8Y{J700(tw@a{@+|b zimr{VC)L8RH%jeWa&&yl%BU_Pmpb0Cc3H<%O`B9y^l?**U(K?7G(d8DUq`UkghdjxG9@B7si&QS_^+isI7RK)+Ab&(hM;T#WdwClt$u&>h%fIi&Ir6cje35QFfDzLoyz&z*Z(W8d)}$f4#pmP3JO zUiYj|UbAX!G(x69m^Cy?_A1N&s-6pveK*WRDt=u1)U4e{r$UoZ(CcxNAkF zJSCe=(c(i$<^Dd$Om|LlKMx|Xq66hTC{oni)YsQ%XJ>zZb2a>;UX5%nSKtrs_2zM$ zup+CAD_lYd{abt}(b>Mq9^r#GYKJ22CxuDfLGrwe%o`zBrHuPj!Q}6 zARg9p2@`tZ;+yb0l~-yO)qN;3(iBC?hR;Q|W=?YtsVL0#0d#O>9TX7Dd^^>ZC7#)% z{=*@3<0s&@oVS}2+E7F`!6%F-h@_W@B;ftRj|M?Yl9&km5+ZSc)Sulwi)!Dt20)G75 zSHJSEGi$&M%Z7uEROd)i{R#c6JRVOXb~xWgP3o2Zt5H$y^-u4CNvsRxJ#NcVlPl9P zA`$H=VN_61jR*d2G!Wd6P3yD%PNsUl{5^Caam+^qd-PMwDkv;I;p_Vm<&`|>*1}j1 z_vpqE-|EBw8v7;-19P<$&HPgkgm@!K>C)y$tL3E_osPFdKvQ7_f?E}p>sV@r&iGUq z&DG4%s5>G`PP6fwPc${u49ua~bHnjXBt`zXKq40izxtp0G;&?K#SskWOs+NQ9D?5q*|t z5E^wVUue{_#b~m`WG>g&+t`YQ<=v+q3D-xko;7S&sS213tX{tMhoZr?reuMQC%<(x zZdxFuh@M8*3fo-}EU7hbQ&N#y6J-A}3H_8;K5yxcJXA*V{5-eq_?3PFx=&Z_KepyO ze2jx=y3Nz<>0s2y<>+odeP*BpLxjqQN8vd$(@KQWa@Z1>?X;@Qm@f^+v!rKc@LBPh zVF-{fu~Tj}P4R$(qg(HBKu-2vr2%?+(*N8GNej1+%O9hz?Yvpc*!Fc@l%Z$-{zVr?qX$D+VRxfyErdR+2bWSz?;wz2tE z6N^l`5{P!J`gbaC#`%ti0oaWYvhAESL`HflDjulj8Qww+4@`Qme7Vk6l}+jXd;U{W z65>UI#eziMV{764>)~s|+BgUj-U@>xY3gSN-@P&+ljU$&-jzKBCYC;mB z2j$$=1ppj5Sq4!>bVPhBRGxs^2hYW~jH+X0O<7yM>?f8BOmVHAHqx>cQt9h4UsRHl zl?ARIM7-olI#sQin&+iX{`l4Z<|hPKz*S+|%houRly9&0Q$b)mFlpuhJvDAusDWzp zf<`ro?_A}J^{j!a&mIVEJx~|LC#nG z>b%01%lDp6xB)lDL7~ZNLm2Qh6nKKl*p)>d?Zumi!n6eWl$lFT0X-coS%>JYw_kBw z3R~MrWdu410qtSahE}zMBJHzbh0{k0$T6&2Jo0Jm3w}5^|K?@?)8>~#PU4o7l(aS^ zv@^7}3=yz7PycI7TEBA0VXH3~Pm54($o5#&GQJxTa9X>!1wvvXh{n+TS3emsN5@T~ zReEM-UnGs~1F@DW$NtSe)dGuQqS4`rdaHys&?xh^L-S-`j=mLv-!krhxD_LvuOz;#uA%3c zm=Mw6YLGXfMEIk{lQ(8PmIQ$N&_&XjYof80zPntO=_+eck-puXeuM#YV&Lt=c`VmC zta;EB6NfU*{hYXWubaPI8}Fdn`9f87!?u?Q$WYa6ZrAoRQC8!an3&8%9pEA|)uQq- zF3G`^GbR~8{H&Wqkmr;eMVthYt24G`j^#Q3r}7HDrO+D+gh3g^Qxoi16)ehg`j9RP z=u3rDbzCSpPq3vZ;94Jq64Jto+r@T?{Gtd9aj%VVZ}YJaT`2+0ZQ&mJHdNjkW~vcw zmd4Wm(`^0dhv6qT`7vY0z1$sq!U_Nv?<{lOz26qIU%qs~KYfX3Hd%Yyu*Ye!-r47< zek9aEpe9ZL2#F7{SH2kEIs!}`!n=lsWMA;QZtNhW^?R4|?h2dK#U*TQf7~v`- zMF#D&U6L)^EGpwmAuLb-`u8jV=gbmBQ_p9$cvRp8UNPBBj(%grx58Whc9AQI_Rd0} zF5r~!dTV&U4|5Ky1Dk;XFB}>IoV!CG3FX-49C0vVk_l5|WhU6j&dsEyA5Co-mKV>- zg~k=+G)>qR11LMP)hXCm_*L_iZ1l!e6STbc|9RG)(*U4i@%56II4KbkQHH~EIhCRW z7VEMtpnAr#n@f4_z!0^Np;i%zZ2lruw*9&6Bx-v~U;IF7bTp02?5Yd%%1kdgGBm_b z;jJZHwpE}qnV3W`zU?*xk-H50#gR1x-oLt=_4)r?swU#Gi5V{vnJEaPbE`^eWJJYR z=N>qU7p1+J;d1P*KzQj$BOoxieXWXcvPy^L9gTat2$%WDhX9(5Dv5UNF?79{cnXTC z6_?%ShgR?@e!XuOu}sg2z+0%@%0XD{w57snPlORkfs;9ZByash zfJ`?&HZlIeQ)^UOI}GeBG)xfJh4Ei1gvR*6XVXx)h-O_WzIU3S_!aKa?ARO{8zI-z z?de*taEBAnfN}pOVku7mM39W*IypE}>M~3VR?Lad{W~^@P^Q|$9VBE)NKN&2-JRWa zZf`hISEb(&_6)Sc^jr5wYVkP2Z1j1A;90s}Z&qqitMY#i?X}40Neg~gV@KLG1J^~i zrrs8gnB@Xq`@eb$(vlq%>OwQlsMJ!l70Q>C0KGmDTq*T`i}o+A-UmF4GIUE+K(s9q~O6KvK{x*GIyf z1;+{LFm9ykkH8dK{(7L|Wfudqes&+vifd@}j5ugGs98JQS(tntGKu$DFmOS?NavJ0NL>TN zT99R+_^l)ik({tqH!?EXf{4Pp%(D3*2+MwJIzi|TI~6@vZ(6=Fmq9g~5FnH?mYr^L z*Udava@RF?*AwH3zx*FVaLAWt&JA z-ch=ev*x=yFSNqfNd4Ur@RNwlDLIQ|;7FUmS_70nX!(UN5x#FONd8tbPn(cpjn7+7vW*+1yR!1PlM4w-v7d1KA%MF{WeabFPM)Z zYL+1U1Q5t!ZBN%0qHWuTm`gWI$knAh3xOpW4holK&TY@Sx*{xNh(LF1QW;!KjEp?X zb~np>5qWDBipKVOHvNY89c4Sq`|@`Ewb%V&!PeZNu24AGvt+sc_NWU!SI<3>FiWRtjx^fPR^1kSlju z&=YXh=O_%In?VptmNi1Fu{0$+r- z<97*>6GO$w$hh4ggW{7_RV*Vv@m;C*W(9VbnZPy-b~ujhXxRTKv8e1aF0O~p`hy&V zcQK+Q$C+9-ZZDK;uVAeOBc#+=Xbd?Okxs(SH$dYgCjPpe)$;vC`}elC{3`+q^Pn~# z%@)ST(?N-O_Rs5EK~jp(cBd|APQc*4FcZ#ULJTEYS$v@Fk^8+wxonJI#k#a?8#`$a z#QVPfR~_E54`0>P)NJ$uYn2NToT-GUdbu#C%V}|K$oFXeHp0^1H(Gr0b#XC&-%4&? zrWR-%@ z(MD2Ns`LrJRHx0`v=z1oq~lU=Cb5>JR@obgogs4ScFlEXA7fv zmPM7SSqPqkdXAj}?b8oO{}h@ zv|KnpGBVPcW2A!gd_nmpIwj=-9A1*~dvU<=y&@;qG&yo~xc(Ud-N7(HqxKXq3+5Hj zqTFvcNzxa;NtVoD!u5#*>M3q|g@CXE*>c*4cA#U>gpt4Fh{>vB|QuI<#0=H|qg zk6h!=<2e{cs{^EJVdhJcYVN%eBX{TD=|DgnZMhMExFrYQu5d5=TxE2EW7skFN`pEDC3Bt;@?u6qtW zF$z5MTBc2|=r#QvY34>D#gpINxgVk3y?^rI!F?HJy?q?oc~j61Y+F2Z5+ib9dEcaG zc`f2yfg8UK6Z2f4cVu5L(~UwxU;c%wBRNouT5F9LB(m+b?zlgx75u5MZ>J{D*#*4k zv}H=V+sl_POP^?!pk|nzUF_{wgz|ZV1hEtrli@(t-xSG%^VXl=`h8f#Ov@Er2rQ3?wE%S@FoU?}_OvqQ%kKnA%OOO?n#^Ok`fW zQF9eb`UVP9*gwE7LhV2v&u>`^VJ&3nPHlBacwDm^b0hzvqzz{t!#)fi8YBkYkA-(Q zx$m9((dr_gTYu~OwnS}e9;Pime^Qt$q62}u&S27OI0IOB*u#wFl;Xqg58wC6K4E-5 zAtl-}z8@(puCTXPSFTvT>thHcmS#YaBGA(}4D$eBn73{*`3L6ZK(`HJT2dGRGkf^S zHp*m@Mvrc9r3`58Q_s@5ba=rxJX-x>{l7~?ez1;T{ebQ0Y|*Qo~~ax7Q9_u)Tl_$LF_+o9PJaH15dd;p0ii;7N(`o7+Lg}NVHbLy$xfzjPt(NDOVK9IgLXLLM%2!P%k*@?d|9*# z?g9X3-(5ej2Gru1eQW)HSXo;S*EOE8d~}uc&)=7063_L&kQ#80xBADwK+G|LYIxwP#(3V$?pnrf|7@c}!BZadq_X ztDSlJIo{*$5=UF*70S9ez-!n7$%i^)_e0MMz0d=zIKsxKq6#XNg9aV%rxs$tfLzk# z4fYVATg7{#S5Y^9ZMh0p97uU0qq?>W>UlcK#fvLRVJWT+uU2jymzEsD?{}5HK z`dBztV{IX@i~>QCd!S!KZR!J@(mx@HFE|Ktg21C-vw2=v^nU(KK0XivAs+%bytyp! zgpRdX3G|*>!GlRs4?+PV8WaJvKEDIMc5(Y~w&MnAjP~&GFpUe|_@cIb>_eXx*F}eo zHkipVUPg*^RxEA{bCA-pP(Tl$Oqi+Bhev`Z%>+)|*cIZl#h}!)pWWT{`&jzi|H7jS znAj4Ru^lU-BJUn^NmEQ!Ho8HtUSu?FShCZu>*&zkJ=9*GX4s}YDVLE8yr9b!r}5kW z*q3>X`nhvI?XG=Q)xnDz6?M0d_xW~cvRU@`J30znyKZ4EII{9ua>!D*0wceikiylK1_?%3gowBOUW?&U*e}FJf-Gfw{q;~1vcL&TPxK7IO?pC1hk_1m?%Em-($z=bYt*X6JFIa)(8@;p1c z75kj|!M`5s!~u*;moC-R)h&H=MV)YD+c^>t6qL~1EZ^4FwjO48`*sXR_IX;`sylc6 z=4S@f!a8oOO40?*Vt>O7czHoO%;4Ph6#y_b2qEp0=;8v8=z?yxsCl?EhjEaH%o&2kr1P7(_UU^O0J#3>SHchl6PJV z?QhN0t3BxSrhhGR3sGQ&`6a4KN=njPmRoD)P(3JXffnrp@j3sME{GszxvU>A9d5X2 zOC*yn5VY>ypGPYlmhkfBanju#vKfX87wR?~ZyFn?ctz5yF!_Cq@$x!sbbqW%pQS;+ z$Nl+pe3dv0BkyB^dSUPEPPl#G;NWO~)6&q;bgXYk`2ua$`A@nt^461b5_2t=!i~?% z$(RM!7rtmeL3u*=i1T%yptuhMO{5=8*8N`5Cu?M{-Yr=P=gS8`OpC0z%brUIU`gr5 zVvFJH+gqbT9glDb8WIFRs-qV_VRN=j``EE#`mL)a>k}$v^fXVal>cZ0Mof-Q7gLckh4iy3aoQ;EFQmobP<+i}!n9KmVBtg@iBfOJB5AJ>z2^KJTs2P`3k|V;qqnKr>Sa~I&U4C>eaq=PM&8~r* z;&jE$*-o3Xu^hV+{aCwPe!)!}8Fmk0|3;z9?i&l(TFn1(QMec{f>A>-E?E`Qoe&pC zH~s1VXzv4`oImk+Nx7b1Wbhw`1!5 zE0wwBlnB}Ehle9oQp2MaJ zE{Xv>RU`f5u&f5pYZozmsO~YT0lTGKd14k%&*Qixw46gF*XDP3`$IQ`?+d}T_qq>G zc=wEwb?)A9)fO{H2+7-}a;(tFYQL!5iHHlYgbps7+*F#a-$vSde0fo^A?r&cKgOP7$fvMTEA6CrnceT^!X zh3)PnFB&JpwFK1Cfo?|H=c#o2r6FxrbmIyELUWdA(v?y5_$EMUbF60bkX}*vTc9?gRg1HyZz2;iLt?kr%s?OKTsd=f*gL*a#o`EOUvvsj zhY;OifHm>~8v+<;TH3H1u|x_+s_={kV`vstuU4eAd=b_8tR|8GWKh(v(QR)mMnF zuw2X!Uv^dCMv9fP9jbhOaxd_suA^N+1~JX{N!=-IWh2O1L@SH^rg1B)O8sJY46jio zzEr)a?&H*2`}Fj3p5=lu9bnSA#S3l=Wk;NSjMEM=TYE@~ivnr0)OVX*5+;gsOiW?3 zI*aqT~2i$nVc(coo=b+)f zSM~_yi2Om_j?yd7m{Afn;_1bwkXdpd^O-Os9&7u2MWQ{ixNi`;G;nNZTehg_a+1=C zamrv*FuZI+;-&Tm1JS=jxoG#IYg^@=Q}mI?61>~1P*=rzKZQYu9 zH;U;#Njb%sJH5sbadyEX*DN@4ti9sjE?-z8{#hi4{?1Es&Ayq7SLhG#qPOcYT=|FE z4!=Ki4d6w;sd)+K;&A0%O2A5 z5`Otzhpz_F9xtyN?zzxNQuKGko(vKvd`FJk+Cvxf$hr+3kQi0=>x#EXLgypzt5y@s zP1MM~v8c~%j}bDDwyAi?{J5Cqh7H)#{g?z6&E!Ydn`-Yv^DQ`bjV;=qi>=8<-`~qp zlg}U?=c(s?I9go&7y{AXKXiF0JrwU^!0^KW{-%xo!F3ao1N|kRR=4Kd7?4JQ-sUit z0W}TH`}HrKon${w`fC9;hg}U>ysOK06!PH|q$e!lMZSJj4FU-$NTUr)&h*eoj zQhubRq->hurv6==$ZP64<{B<>!kg1tV^xEaok#+uSBd=6`&r+##{-}QoLx(+6qz~r zW6R5FZpO8Z7c7j&yA1SJ*j?;2}jI%jdU52HV7Rt9r+( z@|GR;T<(mCT*|xuU-T}14txGGG!w1>DKzELC1>Zu2Vu`a`DXw?AeI@x@ScOw@xk^L ztde8xrnaz$qgTCHUL_~EZag0c-F*AG6(q73q-t7F#5_}M?ctQHZ)`H2*LAv{{{D}e z#|w&*QkGy6o|c-r%}D+96(D>$u`sI(_V+Iy=65@=ja@z_!6oDAKg|;>z$NFKFYVgf zpUdlrumLEcg|(?B&>6ILQq7M8`QLgmLF3HEFL3Qa)R*UY4W|8tCT?pQG8bi?fc8Z`F;n>HIYQV_r*ssxYF*OfD z`7)0OI7*FST`ad$0R9zUF)PRAL8UTO=eoBO@bqGiLlAfEK{}iWkd5 zy!r%RMoDeYzjTv53$CE=VK|$b=!+G;uN<`MY0C`^xOw9xuj8OB39o`|iowaYCQGA3 z1^*4l(HpPXAbMVW``eQNa0~Z6UNd@S&LvgW#!L;w0qyqIj^W97F|(D7s^msMMQDia z&Y}_G(}C#~%V%EBzGo!5K}xhL0<+svioXIJ)WxTIJaYCbk)wRy7%7_kw9XqgXlan7Cv1MfW2YGPFGh| zm4b}U|rgdsHDqxE2W zabS3OxV~+6@xEB(km>G|)b&l6((3l0rUTb8;7r5KQ!Z)Ime4d15%C?rR#!(nJ&d_A z{X-1z5Dj>9=gF$MC4xBJ?p4J8shquWeUtJ=y4q|Z0xv#~OoEc-oUxj?k3whd_yfQ7 z)t3AOoHB>KHi^gU_n!`RQ&q z1bDkE@wrCC$X24l(-9v9`9TiaJCTzc=eaOWPfvNbw|a%k3rS|SU%Q03Ca)~k99Ew?S}u)fdCv|$oi+tvc+ z4`%>}&J`da9oXK60P0XTMlyaz)$*_U9*&L`?4)g&iPpA6+I9dI$&FRc3XkS@tLlTn z$mo$fbwDbsz?t`G`S3Ihkg90d}WQI)}wrJehHqlxa;)Qn(IL@F|(xt{knoQSI6WtD8V_gWEuh3 zeR*UWHn;@03tjiEGg1MO%vM=}Tc=U{i!~rCDhozaDdOQRt*x{I0toZ*vOx_KXhFfW zH!gpVO*dq47rwHoYEgTCdTI$(j6&u%k09Xibin&87eK(MAS3f0Jo2_c9e4piYGcKw z4+-)#YE>SDO`jS8Fa|sdv^-9lOzZMN!Al4i^0K7BV;U6WBjV%7>-`3Q_ZX8;y@C!k zZK_mGZlaOI-;Eb~B)6yJX52QY_u8z|IwFzAKFzdwupb)NmlNwTRWdiv>|oN`>mcHK zPEM(Zu(L}kJ(l4pa;1KDu=9o(s?M{KX=nr+kA|ANn)QBh^X3lAtIk*zgWwUO5waC&Z48 ziJMy)I4w)L#}sIoS07K###-1TU5)__aV=;<XHjfim z6sy%psZp3@0$&!O(#lh>RzRt^0R$y^hRJrP9v{fnggwJc7QrFIz&Gc0bxFpifx>R_ zNK{0`@X{*6@PpOw_0onl%zid9MXMPF-++ft(u{Rvf}3KdR<7a(_XUejrgHL`#SmeJ>)H7HsklYPmp2?<%l{emDw6!00~wRbTI z?>tF7*~-}d%G%8pzvfVX7$3-Vmr@ROorpQj<9K*@D>JMa|99NA9yICu_#7@xOL-MJpr|>pSWuszkU;Kh zLl_SV_j$l^#lwkAmdC3H&EwZuSt%)ngx@9PL|v!STN)VN2D1U8zvbbz+X|ft5LISo z=Is1@IH)}r7$mwWfH8A>jyF2Gx_aoAKQ;`Up(%|7uEErPf>C$n7@n9eGoQw)J@a zN@!?R7V{nEm`D~)3CC+aSUo+ls9kR%$i*kR%<=*bEK z>>x$|(9m9@FmJ(x(;OL5NV1wRRfP8B5}5pbw{B^Fyw@rad41`-u~z)`OjO@W`9gVI zr`z{0*HQYAUF}Od9=p{Dd60{ayYu`=R&CHam$zwnsy zbY@odmFX$2@kNXShR^a#ms{r+p= zn58>VjkI$|oIZ-@XJL^ksM+w3q*r~Q*r?r=FosF84~A`W0VE=Eoc-gjz~ONBOY}#; z>^xmA%E-uU#UhS`K@^ti^01s)RZ=n-1za4OCmOm*LBTkEcmjr*2#1g@k*LlS2#c+G zrU9Et9GvhXa7BHfZK57iV$t|8!SV5G?QX)t;^JY<*hySeR0b*rG*67W{Cgpxf!zyK zF7nN7qME)rg)zR+5`5Q0EMkjglzuVj-c4$q`3q?QKp zVuuuql-VU(ATy+79ZhOUP*AGmps*tsb<* zz2=9dR@1`cj?+OSPzyv-3m6ztEvVgz$*)?-zJu|IXo?NxzA?%eK4acyasUpvMxe z7Nfn;pT#ghbVpH$J7e8u1I$^$!PMS&$x{G2I0g(*VOF>mzgvTrBYHW7lgP3jF1=tdtM11 zCrK|*$5j+ZC5NB8rl}fu?L7H zXYz3o2*BzG{)OHoCtq>BaGI|KazWrx;@}<&vWYUFjtb*Zy$D6PS8I42+Yx<|Sg8n1 ztQLNYsBzjHV8hVJ}zoQ7+SGlOz7<3u2X{bq$?Lj@yHvypt*`CRi`ss2a& zC29nl40I*JtyLu5&}gTu%3wxn`?GwA@%~qmC+GSrdr6YTD`D@u&2BtQola@8-S0y> zfA65(>gzU#UvN;*729}-cW$OR$o$u~cr5|cgeFSJ?PxDvy0k#6x$lod#`6(OxXptp z&}cBs#V?V-H;hz1eEbpcJ|hf)dJN8DkTgCerrzpA?ZFPTp5Eb29hX|TB>J^IC9U;u zpMkjp3^**2hTCbLqN}s5t;mP7c4x!{bba?6Pm*hxqQy~^F1eujOvT7(K_#oy`chBN z2%rQGKqOi)IXU^L6Ys{26jWHzQc_FH$_iW1FF-`>fX1NmaZsYS@Hx12{+BP$z`%~s z0hrnZZPET@@lS-mZ-gZnhb;~=HqEo*QWaH8m-!ZVsxf1Eg9qcjwA?}6#Q3p6hHgsJ zbVPj2YN`20d{XX8myhP-x`&d1l7mmV2UQ;@x~DVd9%RjI+ljulCcm^;gdNoAM=+CK zc`Sz~t78Y7DcNpz$nV=cwMnhBh28Cp4NW=cadLCP-t@s=3Y`C;;Q8mj^s>PxREcTk zX6!&uecphn8!g~bx6S;)hDiGX;J@+R21HbP*zb8j*B{h9b00~quSenz>KmXkC+#oo zTJVoRCfeu0wVO9Tg0h_=YNT=-2>Z;Vq=!*BjCd}q#;Mn!EWz?P5LF&RT^}41-@eM1 zZ8vV=FHShDkMf>vTD#=2b=`hP;cmAsTZZ*7& zmxm`;_H#?i0!S&!FEKt95y7=P2x%osR;~vwXB)soRJ=1Z+_vSH$0}t(MV4YP?)Wa~ zn_&?!^n;3H28nsaqm`l7VUsoT-@U{o#=iw>p@7Hd-#S6HChaLOHUqAh*=@v+zCVm)I7PBcrrr z0^^zA_nuiWEN-amM_$W(n1J|hU7OeFc`nSFC9IT*Q@zF2&+2-4yWuZ(wb5N=Jg-qD z*3i!((%*-L*#Q@w=z%fUj2Xyup0cKMPh;C(pz4~HmGvCd>WEqO0|R?f z0RB0Mj2BM(ux70hOOG$GV+aQD054y60K)a0!41E;wKYqkDXFlBQVJ?6F{--&(v&)= z>7g!sF8sRxEkeRftLYZyrlql}cY4KJTec9u9trogD)+{~%?9e);KhJ%H#IiqYD?ig zl$6ugPY+yU_)&gXBlGf>AC(`Mp0X4IUBH-xn7Kk9$yPM^GT-e*Xq_1}GbckRkF3)e zX;`SLn={>*I{d(lY4-XFhHbkMb z)Lh+?fC|zmt}`_T2l_IqHi(94KRU&QTBlSJ3-%*MvI0fBDUvRDNYmqGT;RmP` zS{6F{W{UdcEOk|T#Z1tt9^E0A6`6}p*5D1uVFSPusRUL5&d#)JnavfLL}ei1j0lVV zc#h&~*~W7%j{ifN{GD9nKWk$A^q|=%g&^nfJrjrAW+FzXF%kW(&X)?xw8kqRXK+mB zxAFyLw$SrpTw?paWnepLmkj5EXyZ=EF1#To9dAO#m~e1iR?X1A<&9MD>x$?YPF@ef z-ERqLA@QdD55(qwfmYlD~2I23Ozp9;okAOqk#zE#b6A08kb9Y}E@^hJEYK*_3Ln)^|ntRS8OkY9@-WYA9VtCozKj36O zq0-^0{x7=^5#T-$l?43NVQ{!Y85DXrc5+0QsB#qxOBS@Lx|f5n&%(&u&_MnmuU471 znm0aIL!Gm<(>vvMe!8x9?Ih$D8F|4BtA(88fzGN$V*sta-QmRQu@I!LzD<+JkGoy8 z4OSX{G19UOa{vxs_!3Gf6mPB63IymDo7`IEl;rXo)n|+iiAQR)vn&RZi(xS=+DnBA ztrOo8Yn}67T{vdvsGGKi zbVDjVbT~T=0k8Rk2Q-|ViYTBIDs*$Rhi!>_pRxA1u!i(+WuWIiHy>koIrsTnQ7gH} z2&T)ZlAR_N`Cuqswe9{9?~Dbpx`(KdAtdkH^qUbE(`ly5)eghMc#QCe`s2AhNtU~k zOEbg9Br0Wk4sSS?>0rUf@(W7~RvkEj*QhnIy{hhc6svZ9x%X?Ct!KkCJVNzEHBzI4 zr9pZ$GBPqi$^qqv=d0)Vk~=>GejW~xcNx4Z6+&KKUgpZa&(B24UZ1Y8K?if@=PzY6 zZDizIs|bBsyRyV}z!>e)i7G#OPrOY`L?pF3T6?bBIc@bbEYrCtmeIFDD#Hiox=5pV zbuJ{e2M^{HW~yBT<@ax?0}FO>WSsX#C3r>cKvT|#E%MPXF7O;$!u8we?X3AncJz{UYKsUuhrk-bPnD=5oOJ|vDBR^(jku;=e3qQjA9rI~TTNF>;GA+|9KV2Q2 zg9)5b8v}>x;J6_(?xw>L%}S@CohjrSp;veLYG04ea>lxpWqfZ2xlH6*M)ra*wA0KY zCAD?Iwm4nc&x*3OZ9@5fv+X~#=Kz#F1C%~aEE&ma^(P$LqW|SviUj%va0K-95e4~4 z2F%KW;Q0Bl5!Njpv_>Ci7ZjwUwQKR{sM>$q z0vu>=vgp37B6luHR~HKQMB}EZDmEcgG9y0MaciDpN$1=Lho{C1s`Aou>5s+s`ntZ+ z*xAL_R(q8wu%cyRdhuYrf&QW`H!xiRFtNZrx@C1Q&7*Gw{s93nFr2zH{zmysa@)m= z!g*i8IqF1K&WWq#Pc~fp?PZ#LKPJVC?ZskWT#zDNhmSVUuaE&^uCo0}1S;+#W1>*K zf8V?qNM=z$6f(dal&{E9^KF;Rfetzp?u5rli>YQb3VFL4$S$G(Vr;|}0t%TtAr>e5 zj#~go_(Uv{t`|i1U9HnL`akE8YwsHpiX~9_Q$-AbuHe7isP+@O$LJ6oB(CsSE*o|& z4M2QYUez5*G#wg5bgkf!K`K>IhNn5B&uFqgVNK-6>SHE5KCVwRB??SVPFgNMjJ>m| zoWKB!#AUoHL#AU^%2q-22e{=Y+xXA;=D!`ECjq+8F3ewp>$ceK9!)UEr3l8tqHsvL zl0Xi5qW&8Y_usyA>_K7Co?V8G1c`tq^YzP;i&Sg*p0szQ$7dPXp`27SbY0c4CAs=X z8miVQ&wR7XOv%F#r-21j7!nWJJK>?!RJZ_?7NimOsNmrD%NZC zH`S$vE(QUBh-N`m(nJw?$&7m|^1f&GL>eSP>gjsubOb4H@3csMzL^3*v4W2JZZh7& z3WtlUgcP0DYega#Tude>9p~9!b_9g4!8*Jfcr?YL7&Y^&s(2!zqB=08S;+M_zZ|{O zML=A^oY6L_|F@wnQg`t{9ED2%xAE;_nvR`T!TjE*JfwnXSHXC$)G+o856t$ZfYLL= zK)hJ!^&Q)_A$v3aE!_BAM&76_;o~7?q7FX$miHUPx0dK+AH9giQS5g%C9{wm>l>k_ zY&PE6dTGJXq4M7h>5Mb}D|6&Pf7Vds1=#4(6xFQW1o~PX9}L#RAz*w;|53dIn1=6Z zkDxs)yLzGr&~okooSaizJ3c!JvNzD27Y6wr=zR78kZGyeD2K_jh|cpI#8Pow$)HXx zq^Rju7NimLZ(9Xlc_m+#gNJ7>^}AbUYJVAR$5kMFNzHi3{_uKZ^hmWgmJr^jf3apfD>i{?wypT#@IxJr8=RNUk;J(t&7B6izgP&3IwYx^{{K?AxcpFmZ zW(8f(@OXZis{68xo2y*5_=7O9vS5E=6_Zk6BITscViIO{!J^0#%4$JAFjZ%(3V>+Yk^REP{Es zWL4104%{h0#PPeSxa_V(Tqpm0HfN5@`M>CYgY%-{#CI{Wh^v*|n9L)tYZNG?mmU2> zz1)bKkViHTHB*q(SU!BK`%H!K8J{Wys(0lGuwRb+q%T;kW*XobO8Gnk891d z&#mQcW24iS;-_6i$bf@-yMwBZVxYUVYY1@cBdQvcY;<3duWu?{)YqevS1_(?Ws?a@ z36Hy$1)}Ycys2QnrpIWgJW|+D{^Tp$)7KX(bya5nB#J)c5r29oeqPwW|Jd4la1%9N zk+*dmf%>v-#c=>b)fes4N?UzBJ!;Te0`M!24A3J#Nol3rJU8tAvQ~%ked)c$?32#a z4LiNgJCKu>D2RP|e~~O`<{|rzyi%|_c?nq#PwK^Xy(f#tB`vLva0U*yD z7+?-1T@j$7LRpUeB@EsZX#;KT64~ zCJ3ZMSnL8eZ(3dpd(?yhL)yx^zxx}2btNG`WYl-@>62&S<{CTdPv5vaM5i{9Nx4h; zxPNGDWt$QM{X=+I^vla`#pnW^qOy-oUk~m!ygGAq|KXqi#_uK&fLwmFh0^&=sz|)Y zhdqF#CWlM|lrUMBri_dZRDij6-8c5KSFxA~#?SYs;x-toeDTSvo;xoPCuFeyyOZRi zkBk&hoI&g&VHwk&OSqS^%cZ&|z zEZuzL>%`$WO6#k0{m*^HPq*Xe0QAqk*%x?Z?(fMZ$;qFn8-vMss6+) ziOI4g1rd=Ew0nMSch_M^>fz5@aC`EZmejB?%FJnscIdy;uBA;`+07XX;^H1?jp0Tv5E#GCANmN9XTr{0nFv(g3F0UF)- zxUcsvqC3tSJViL2jk^DA(f_~yw?+>lK-We0y)pBz`Xw+(?M6GT^FmeziV?WfzVISw z`xgV05-H9i5?M5g7#T4JTlDtrTgyM$BHi`H;Ho?~aLZ#@*L5=Ngy=tf_>lUgkSjx5 zj{R%cMWZg{jMZjTDx+e+f-Z(b4-dkx zF~TeG@1;eNJsE8DaS{1`)J5JzlJsXz^B*{x1}Z!tArd|=?Lc7ws0t-B!01gswEox( zg(j6W!H|{K)GPpDPY?2%hzlJfqXQQ|K+$Up#GkpCo)9k~d0x43Y3+i;=^c>o+`RGq zq-U(|BHGb~GM>|b1DK0|9MDVA+$2Kv=@;=z_-%wqEKITP38xu3FQAEO^~v~7NwpYK za_vXFdf4BuVc=4%ZO-2K$@-yM(tb4UO`?T_q~r$A+sE}z4pf}FSrbQV4%yj|r6O6$ z=LtEW?Pljp!B_6(Aw_zFG-((XR^}A7x$5BJa``F~wS!7Ae%BGz9yP;=jR4-#vez#T z`h%;9xCwj?=W0Or63zYlpVMmjz+`fJR~HJ@pqA~R&_6PgjcOkPwx}*qJ&TYQUn_^D zp()EF0PQ5(ft|&&4&h~Txk-a*p=CRdu>?-Nnbrcxw$9_nkt;{dxMl8x0W{o2vf~hS zcv_=3CA&@I8x@Xe6)-01Z0xiji>6lm7SoT%7qVAb>O+gQx4%yf-f;HI&?RafK8^Lr zq}=aA&#z$Fllz$F_KuF`)VuAelJTtU?Mf={On1esO;Sk^35YmFM^RJjzv9i{*Ub;F zXj9qrqnHPy&9(Kb@&I#TDSRr8*lX)9KCP@`S|kpF=1$Qn*4?i$D!e1D+vpt;6_$N8 z#)j%L_e=_Ea^oES55DWqukqR))a9Mm-}|Nj+LgD+$nt5S!HDGZzI!P5}n) z#!NHh1rl#Xd$9>z)giOy0PNR}U=##1q~w}|O1Igl)SwziNg{(Aock`W*owGXg!Se7 z1S9f0UO7U1Dg=`hnSrl~+RM=0_0ncKz8h@JJ0Cwxe9tG+GA$>R<}b>bxi`cSZ_Wp= zZ$_GQB;M@ryP5UY)97FX8%oVW2S*ZG5PVmDoOE==q0(4SS`Zg^jhd3lcq*5z^F!Rd zu(s~?bIxbq??2zG|GL>-E~q^Ckn#N7IY52~%PFW^sOgbe2c&k3!v={1z+7zs)1Tp| zR~I93pYxnH=sGw+0iC1;*&!Hi?l9%3E=_m=%HagVxo!u@472iGcH$`oz_`5P(!qLm zM`=JY_`Lts?b>VcFT6~+pSX2UAD2y90rTNn5+NoC(+1uTvaSQ{_UfYhH!%Aa_1%5s zz|c5HaUZD@Z_broydV>wnWdmB7RFP>`*LMRQA)~_hxgS0oH!pIi3#+URY%Znfy@1_ z`w8|U_Yyrh0a-dV-Z#6+`7Bn^KmjN5j%U*D`S$d3WL@bQ;}w|;&v6ci!>^%ZLN>y59kDg|I_2L%)vJG(N#Qj-eE z;BtzJta?U}NMsD)*z~mW#ib~}B#J#hKWK?ei)w$#vTZVT|I8i&_g#L``72~ZrC4%x z<)j1zv8Du0$#&v~f%o@}vhZK#OTYZE`o?X0>w{fsio~0z?jxnJ-06%IdYYD|{eg+z zyye3XY7hOG&2QS7OklL~S|)%xc1H7SU>T$tJ(A4UNp1Ae=A=98e*UVQ^k24*Nd<)K z#Pkk*-g6Zo3AO{^rR?%@`<@&^FKH4^^HTmxkfAr{0oWRPbad2}30RQ^;vFWkfW8dt zNVmHU7{yBbX(aPUc++)leg)Z=UyNI7b!w7r#N#fU2%UI+F{C&Adu4m9b#%+;myRaN zQG-Tit9|8=2tWy-7VDsxnU&TwQ)EDZj~{VpirN3!DJ&v^G!^x9k~SyychxzdAc6nj z=KPe)5XPKKR3eEPbz-Lh@DLP%xE;c6cfY?f?EQN{Smb8ZkN{{!SCIfQPS!I~uMMW- z0b!or=>v^FRwuM>F}+Fz!;d`^{*RYw8Rbk2!Upxw#JytPeaIWTOh5zX%0UD+xI1b1 z31&x61Cp#;N{?ghR|;etLWW;dcDThl$bWgiYnMASS3U^x3iWL7@m3eb{@T3ShY3?o17 z63Pz=X9^aCl7Da1>$lssRUs!cUNn@ERgDKV->jWeH;3nDq}W;(Bq?>hhC$Nhs} z5!0kN25Cv%m2T7Dy%++tSox4pHzjg#ryo5(kRf;_5(O9}7q_wZY(D z72ujaoY)9F>qwYhT^!+2waqlHE{_CYBcwOw#y#D~I&7@dX!IXy9)sxFZT2c?G%Qnd zCrb#g$-bzRgtNKmSzL@P0}1aH_JDtI`_4vrd!FFz_$~2-+ws%Iqf-Fqf-DN!2@u;{ zrlWPp^w=#x20fy7x-)7qnDR8hb~=dDc&;m+!vr-w2?(Humr}2vy&ny`zzL1h&!Btd?vZ;~ZnxM0z1BRl5K{34mQYsCBFMt##C-La}Lx2P= zsop6yg2^c68&=3CiC|_36jt^9Q27a`_Sw`k%G-zj&U4#sC@g5%47}p>gz3l z^K$6^IL9RB5TeWrC?gtfrGcEzmY&*%5faH#BhP7iYFmC(m>jHCYl0?cC*X`*e}ui=(Z*y)ob} zLZOxAz!A9}?qmVp5dh)i;3NY;zmFg9>FMj!@bapnFqD8ri_LtDD|Y*jNUJ~wFInse zfbGCOSX<*YUNj}Q^4;XzH+lTHTwk(6nzawV%~2pu5wO~zXg+#w9KA%Rj_#~&s*Jx= zV74@hS*VgAaHOpw#E*XAmjmQ$mX83miQpZTmHKT^QX{^khyHlG9t;-ukB#N-?AUi; zNOxYr!BH?Y%mgO~iW`IV;XlUmeq4uCwWPY4a4`C43`&LrY-ALn84`h_C6@#`<4e`< zJkoBm_t6k*fqA74&=kJrCkiRfuF@V2s3zd(62c6xmXF3g>wo|EhrD4BRX#1_Wf9%g zMdv5N{Q;l+JvH#!lm$|g^7rL=<>jg#`3A2NhGI^A3WZN*Tm+dlUMSO&!QQBk4g_lN z3$R7LZj37vf{upo0x<}F^?IB`OH~I9hQ02-++e!zXTNX`t?8Z?N36VimC{oY1Pnxd z{kkjU>G4A7!YKz)%Ib zkpVLn8=V4)bYGrFw0(bAQt8l4v}@*Qtr7=1*kIU5LNsX6WR~ccKcUyAa092nXUB;( z8|go;7Vvusbr{{E7|qV}g(Gcc2j23@0T2ZE002>W1>1jZpY@6GYbsEO-67?f+H_eS z)e`i4&dSeuUSf|E6%CbCddvpm6T3mC@$0l%EAY$~`ycDA;e zQ0VS7cJt@YMk`|=8=pIyje9nHxjsK{EDMI{XlTY!{VkEMmcwN>I>SI~R)+o~;Ld}d zEe*1=XfS-0*xbLCxz=;LGch#fX-OY@HwZiCez`>oy4fM%3VoyfWQL|+q72C7daLXf z<)o#P)T`~4FC>xDTY@AQML`iPn(M}Eza$NYw!je3dprN9=g%1Sc$G!#LHIun(z;3m%Y5!r2F^;iRRJb^k057KeN$pG)60$aa0!>H(bnQ1 z`j{Q9m3F%N$>5R^bi#tw>1bX_k-3C*&uI7+b*T6Q=lUV>KVf)1%M$DPZ#MBi5XdqG zqTgG%n(7OY7J*oOdxAIyV1&Jzt2t7ND*qsV%-h!^j)y75IEfw2o8UBi@P|DYJhONIpOfmL1};zFHUH(MiCW&3n$4~+N91p5fh--x z={iL#irafI0#U3dHU_NJFkSK zlTG`ZmDt=Qa+NJz0x2MRm^_4I^sMK_5r@OUkA`i(W~Nj)4S)TDE|sUjLn(}Z>uP%) z$DRM_!=L@;!5r|yEN-{qOjeS+ZS^{?x*$lfv9T`{i^8vxcd2-4O4vhZsFPIWIV>t!3*t~q96oNEyx$d|H?o^NQ9N1A2?HD;r9k+ z-u^21Qo$RNS@FQ)UdrjOl>rxuk);6BdF;Q@HJOpNnepWAop2|?q+5R?Fn{)Q|K$mJ zX@kQtR^s2Oqc~`Lm>39#tH-UT8sF3rAocZt3JywRGjuqkda?Utsrwrg2;HD`$;KsE z&x(kRW5)40)PNh^G2xq9v~SAET4F;L8ufO=FCax{r$vclG4hKPPWNeN=$DQFyAFev zZuf1OdcI8CN+mV}8H0J)!L1S~bRu1NwWB-j|`0xK8%gTneBm3_&I4fI^Cjpup zrMQKobWSe)4WN*z09myzU=Qq_n3&L01OuMyt0&|20YH4`L?p`Q>X{$aFaz!lkkn}E zZi%d|e_K`J%aHi)g2RKr=mZ@1s{0C^)$INO!nlsD%gb3qWd$5ES;0z0h9jPq&vQ{4aOUJPhcJgQp^BOVm?RovqnXOC6JCW@vm*o))CppE1LUK z!rk^lg^wR!LKDvfe~cWj<*Ms7*qe0tP}QQPX@0g}C$I6hqc@QiRx2O(77Q-Auh#J; z#o|kah`*B+Ga2-1x5!oHaDF*fV=8Q_#La6iiXnLG)&b&)1=~jSUwZ5Rzg=D}^|aW< zE?u*eIR(m0+DBU%0%WByS5ONL0?-&bmPbOk!JH_@s&fz$o`^opN#2)wjuQI~59 zk1@kLktUVq*x1aXjXNOaD3sa5_^I4iCmXsrdzk00$2VJDV0h`a^RS(9k<>iMb`?aE zPnsfwgrNl-mZpOM3I9v9B$h|g(LLEK`9xu2y5QbznJ5@!%HL%ys7AK@i~jm=9NeF5 zhSzzNf=)xjpx8c2IL_NW91N*(qO_ENOv;k=Dr&0iaZ*)3^H2NLV0n|vu8t!cN3I3# zhTFQI&dM$FE(+&9O=2MxjlRWiUgPNHC(i`!(<#{^qr*=_;W$#x`;&g=yP|fs%`l{A zrb7+V$9xWZK8T+AnjKuiuLEmjtAOC6EGq9^J) zFGR7Rh8W02slxW1B$Ge2I$^uEOW%d^i3rF%!dT>1x%>JqwR!Y1BIi(MQ@&OsG;K5UZ z*6_822UcXW#{4r)z~9g3Y64jJr;@JW_`1{M)!pn}e}8{RP*BhTBcKJb0`5n9gKl*( zxPV*auo_S#_h;wkE?o_%im96YS$X;P;6;^@7+u7A_Rx#tCrjK%rnVnFte1&pf4ih) zzg^N+af-qn62|i8$avpr#nEitj(p0{{joul(UsKRu|E8#Z-#g4<=TZq0t;Lh)Qhy> zuc_#s?QFk=|KP9xM`r6y1qjlWNSm8RvmMYvAUjGScmUH%4(MuM0>;lRu3h5?M=cgd z!hpF+L{(Kassr>TU^5@iTn;rU)O?54-uI3d^6EtL7&NJ6wcK;TiXQ~-&Pu6YzxCUr zr|?umW5P2odFOlOkeVG8hdfyz86#Q9&W>FD60C28ygSW139cBvD^`2LQ2g@b`w2r9 z`I(<`#*P2U&HKYQGkF5#qD$9yovGsYU#-LQRI4GPvsl%;2%yl0A*jbePebF+Z8O^p zs)1vAxIhx4^2?dO)cV=v)_xUuj`I%5gKY?;=A`EeN@Mn`t!7`70LdK^T_>KStlP>% zs|L?b_laFhBtHrz8tcGzyo*lPvfL5V1Fxjaa=J#YWh%OC#h&?_|HpIsf(>M6D%azD zaywE*LWDs)0zIssp||h@mS{ew)-n02;bRWr;(;2+9HFrEL~ zN0*G}8Cc#>JCd=5VzB&_a5NO&_;Ms+^SyrpSWfmrYie2^_gt6e2hE$C>A7zKHmn&N zrYNGJp;4bOWvDJ>I`IN8wtTR;tG1Ogg80BPH=|q;K-C5_atZ%n&i~846QV@BeCBLc zx3gUE3PFvnf6-Zyml*`+003C6uArtiwp{V(=UsW3+7R7xkeQtynCC(f>=Q8Vi_q6S zbqXb5eL%N#)jj*e@hsIB#w92RtWs(Hn^)D4Tm+80Xgv-&WZa_cFsEnMR+FN7bM}NL z1Lqe$_iZNei?0Q-VHk)}-KfaO9WFrZQ{Kw$7QZ%T)BSn4#GqQ8-+Zj%kJh4!P8>_D z(;aG{+cBZpTm-`?PRq+#&^2xQ4$_COi= zg^4*H3pK|bHCevnqhzqOPeTWdp@!y0F4N(;Cr5GvpDGrYJBq7BI)CXw7^c6CPE(!O z{mvokvJ#-32n2E;3@-ZsYDx;4&%s2j`nKE|j3_$T^*jr=oYu4eeEJ2e;LATh=7i#2 z;ytil*+U-h#BvHpI`H;gvN?1g&bh5jxS1~ZGpy|*aSEl9h5S<*=RX~)OdgY&C>(T)5YIZ zDxcjum=pUH-IImuI(Z>9YaCVvlspsC2_OD+;Tn=Krr zz=_%I=m;D{Y$`~FsJ=2*&2E6eYy5Bl<&XHvYI%bDIn-=AO{w2;D>m(?6UwsORaNQ3 z_?0!i(*;-CddUm>Y4ze@!72wj4Gb3IzLol$wkuF+J%&=zRFyMc)~v9+GXa=uKEtF0 z#;2_wN+paD$qYcn3EHbq~gx61f!9(jDBw4))CjMrF(PRf@*FTukk zkes_5!NKdeWqVYwx&PciTBiP1=ljR0!uTyQX{_ILY;vw;(m6dC`PrcV?z`~gqf}A> ziz7%JxxI@bVEe3jx~;ieu?VI^3A!P>YOVmv9E=jqf^^mQy#}NWdqD5KX$ZhaB77LD zcfle#Z}>Iq?T-Ka5gG)ql$QNJ)~-4r%B*|uVy&wpijt_rL?`(4>`kv^YpqIZ9Hp4T$UOWwmrjt|nE}IZPN`4Gf8LvJfOZojf=1c$n`uAU` zM03j^MPGBTiYr*e{yxe1NczCz2b}h`b4gE2I7u7SiM@*54{_EidY(q$5~h4A@@oA< z>QlwV5BFpkg2~9*=I?NKuHoa2l%|gI93@ z8TQG=!i}}%D&6%;-35WIS(u0jW7!u4W-bvAEF|z(0)Z%l=i{HEJ-mGEhquXaws2@T zb2vGD*Rpo%`ro)Q><cJ}AUh4^UT~6l2-(r8ABS!1J%TZSgBy_vR$&3jm zJ)9Eyy~DdczqR`ZdH_%yL}$cemXKlJNE+u)Bib`^e52%3)7E;^TG4Z-otNI|u~*G5 zY&s%Y^m&u60{)Rb5kX8h;~DJsil;~0YxUYBSZx4?7p3-z$0|$D+bpa1}mHg=A+XDCP7eW zrpn_GWmX)S{EmIKbo~X6N~QZBM@2_q!REP8kM;ERc<)O%kFiv}hP%D`<6Yev?@$-5 zZm?)_(C5w6L|^`JupR)bmS>7X=!`g#kj9v!UQ#+&9gOb&rKkK4Hz2tV61Q~DmnO6h z02dGGtg`nwr>*?VL`wt)Sg#;cbxqRr{xp3~=ZJtEUP6UB$#-fWjZZ?y0l})rF)R-q zgF>t31SxTtG5*AI-=iA#9=3ILo+7)M0z=98>%(ApX?Q!uI**C5Mu(u$!ykHRH{_uQ zxyurW4c~?$*)1s524p5H=$h!eMt{`;APb-sFSVXQ2KR-$^xye=R&#x`U9O{JU0E|^ zRtab^M(Kor2(mgu3tz;@rT5Tf;}1?S-NFpISj@*Rt8ar(?-O&!PNbmqvwDr1-A#9d zpgX_(ui4VU0?2VsqIMUwOTcU>WHS8cf%b}Sb3y0K+}_^b{~5d`Wz!N!TI=!s_WkY? zlJE8ERvl)mJCM+ESFW)2+~pUQIN^ISSDT7>FUiPPhjGhA2{7DJq@`?^Q8?_-bTeKi zVOPv83?7blN@Xh@@$RAT9mn~{HEe(Yx2CRK8hxD)kI-Rd#PLe&Ra+~xxC6Q?zae84 zXZVQM7w*xnpCDzu#`E5}d&lkVJ|PpEUc#C(VF!nFDy5q$*?%cgSj+0&=x69T8l@>m z$bLY-5Xv>}SztGHM#fANQMyAzMv!Fsarvj!H{fTf6S ztu|FbU1MnlL|`dkLSp^zJQ1Or*lAiQbX2I7;oHxgJxlRO^ws9d0H=*IjwUM1T2NqJCYeC(Z`%%BItbY4jVi=EHakTKFH? zQ&}SRo2DjHG-}mBrNn{7ZKfpEsl=Rvku6e`6B9~ORl`k&&zX7{x2~S+;FPB5c`;pE zj2qyeQDthMuVKbtI}1B*#?KL8H+XNRi0JeFOOHcC$q<-FShqLdV&>p|JPb^sfpJwn z{{H@{LhV2R!{xh8rfQ3p=>27xeUbLmW*S4H(MSLMph>8ysGWZ$xK%<#-B01 zRAeMDXi#x!hVZIObVU34q&shFEvrb&CW2w6lb zp)K^#Zq3c!woug0vx8vzs)xkQ;|=!(7mpF}I^(+#r@z4z@F1JfOR-J#HLJv|{bJa1cr;o6jH-ZKU+x&ukG%O6NTnpIk82 zvQ%SmDunfE3k6ZGo3x``7ze)PDgXDG=$F58pU?)JWpVRWF!(AUaenkAXifF!%-Ej*cVtmJAL{?V zs0JgPvk4k6kDXt`QM@Po8e4^rOg;wcX)@p6P@9#f)@R3s|Mp|2*qGc^+CF&4of`QaWMqEm#1_qYB{3F#-ZW*y~2gD|U7!Pk1EQ5u4d5m5KrYFSg8 zx#_kQx;3YZ1*R!&DG(&Soo$KC-Hkla56KA4{1HetXvBZfSUyia{0g5i+$w+AvW7rP z-O{ydy|@CK6RinTyt51b;R5}Z=(>UpK?LSxEh4OnFc0sKx!L|>VF%HuAvnB^gH9#E z!j|tPA_c7C=GCF{*iU28mXuMp(l_Qi?t%zyY# z?y1K>7diNPxfvcN#W}lvguq#+i8t`u9pXNr7CSxpAt}X8ByS4E!^RU+S8$JE?lGs~ zTnogO$fb<0R)k8DHG|e!;-x41U7b?KuTv$PRTojzcq+ka${D}JJCBWT7rgjGTe)J#8w%xL;AgU z<}O1bi@}DO=(iX5>aCB~ZAA+@4*w=Hzy7RC#g-#)O6)jm`jnt;V`3hyZ@|`4gJ661 ztsk@2@{91umQFpc0wxzPUNr4`c@n7RH8g57{YCr`$+zJ|c0yUz+U{}^lW*|-NC8UN z#ErykDqYb2DFlva%P*EIui>V#1xZNNa7wrX6Wq~PY+Y?^wD{-^&8gB&9){~1#<>`4 zG%lp=+_#+<{9kiruPu*4{Wmh&j<(4O^4Ddi-7r*-YXI6kmaWL@c6?v;Eq8pjXX_h9 z#R`bq&+E^3o0>0rm(YU=JGWrr0MZXpo)TZ35clluRYqG|n3-Pk6f?^ZI#xAKnW0Xn zW^s!k9q#$@*47^aNi9nbtm8E^JJ&8oWi}DeuD{=h%iJ8##D*D0kL6)@9Tp+d1!DCn z@j>xa^JfRYT^7|x+&vB?{$bH2hh<#LKTwU~7tIuyqPhVmy)&&D_v-e-#J;W7R$AAw zedw9yWM==Nxdu*pI_&UGZXBi{=*}v+(@vE2FWT~s)2h`;(BhOaQPnH%DSP_pZ2ycK zKeV0E=H}*Vlfuy03ocV8x<1a+1&MWndv~t9x%+TOg2QT8UXsu!hmo1d>80BKV83v3 z{zi4m)RDpCI2px~x#yjI+J)C^ZHWnZ&03zS>ncUNHF(p{Gs>I_Tn-O%lKtCa*;dm1Pfs+s0P3K4~aU3EJfT_NDVt+KoH80upxnp9@`4 zQZ8dmNnzF>d7~?B;w)>&+vQ{K{p)$mTccE<<*;(1#=UjYRu-QeV!l;*n8xu#C;y#M z?NjXR=1;jfI})2-vQcj$*ax9)rs|GcYR4Mwh$sI0i=Mo^d1&F^$y4-!ElMO!WaIJ_#6) z;B5W=xiBRlt;95%9r>?!;h(nQ-<|2#%?H5U;td}yQX^2vEX=CnfN~!R%zl_&kJ_(o zY++HRHxsso?3u8xT4Zw*|HSyX9=5`3LrYcg+j4d$N|aYBqt}|b0m3X-vR}HaKywHC zltPKqP(LklEreW^XE?m+QB2H3UfEj}`b{ud<=QUQ>yu9dt>&}B&~BA!NHTkj^q05P zXH0B=>^8lJkij@|t!*+HV_dj4K`B{Grl^x|$3n51$?=_hu4+5cSQ?QP%w z@VXg6j<2DLcBIS_*CloXZZDv+r6;tK%oQ!Zl6u?Bg65Iy-mKe7o7D&zRi%D_9AqdS z7XN(k+_e$|peZoMrL*!rk{QplI~7O<1z>-GZe}T&4J!e!ER3>vwEc_U_2J(K*IGvf z9H-SGdCGGQz!W63hE9*vMnDBD&C3U{un=GY?5TNs&d62e3xYZ&CUj=4q_Czs?yh>S zfm52M4vk6c2Sxq!Bqc&0f9N`gdIJX$IG}?xS#;kn>l2o)2O^6c80=`>gz~1>Zni}W zJ`M7W61e5W5(8OaqpR!_-!s&YxzjSA@}4<)8F$G?Ut8#tkt;rT4l}0oD2Dv;e9x2b zTCeTPAHgi+}Du|6@-3CwD4hB7$K3!*WGC zO!uHJQpE%^CnWx&s85jBpBw5?1yT#CoF)YcKx=oCnOkpM=nnGWadf#@AH~TADZgz) z+Skk;p99c%f*iSP124&6zE8j1A3L6e=+)+K04Y#SZC*R>=vR%q6{3WJwwU6JBy}D> zoivwuxVBl*YvVFP!*l{ai<0r?lJ$cmHJlwzbn0ph$tSC># z4ryg;v%>3Uz%Ht4+0#uy5WL?Kza(xxvG|73pPFQ%^^H=PEeXGz-?~9ur-}Sd(&P7- zBhZ!0cADQW23`^-nz9J_7$M&Q&LAZtBNJlNvBr0FDkwhQ$S5+6l2(ycgfh9UasgXE z(b);n;|fab)n+Gu)@c7-4PreP4Xa2I>t}Tt!wpol2s3Cx?TO5A$v4mTJr4@B6J0nj zi>l`2=|$i~^Y2 zta3^r>WrVyA?tWC>h-p^$cM!XiWR9pUl!#DNOXJ>M$PHa1c;Teo9pl)eFlqWK6uPNdX-+ofbdPJs{bYTUI{e$sREjL`R-{~ec7^&P-P zG&Xrg8ets-|?IO^^vdtw0rM+lj2>?FCOpwqRvx5Ld!V!Kmna(NiqA(yJ$HVdRL?qy28R?7l}HIPP-tknc4CAC~kI zyYq|w>yLk1gKa~D58Z&oF!jbf+T8ID)yD3aVUI5Bm2M;{X^V8=Om%k6Ef`L?n*>yh zn>vgWRnINRZ1#$iB$UU@c^}PGp!_tb%1+~RKt5ebB<}j_<*Yw>)*WB{SD(<`6wE8w zPm#XP2@kC)TNThI9CA}z8YV?bTUl8>2?@b`tO<&bi#rv*J$^!H#HeLwg!; z{>*6M(`&sy7-IbX`Z5x^k25~nwWfCUDhB4=EK-f~#JzmUVP3Ye=_UHZl@< zTPtXL4+T$Ju6!KmQqm4{6FmYbT~^_Uzlu(znCj8RMZm&wI4t(T+r|^^25~3OA3J8Z zSK?;*&w$ZDsl;R7vwb^V-rqjZKe?ZTay{?qNj~Z`?!dEJ0e(l@D!x^Gqje>JYiSSz z<;-`D>_vy$qQ|$RIjung#wKbXZQclM!A3v@@b=QgOf^lo6r#`g74~EaD5)J!PtRPh z36=X;2JXMEEq^dX+7ql^y4hci0cA0c@z!UGi@Ex^ zwwiZ=mr9P#J-Ki?J}H*t*T=Bx#gm8Bg;3Avk&0iMIuC1V6z`4;d^@Z+8z>(|{r8b=emKNTd=*_`4I|73!(();>xh z!OqK6wp&!|ZI*c}Ew@imFR47$C0ebCyO1CmduOPpIR5-EDj6ayuG?Gn_YDGh=mn&t z?KY&sqPYi@*Kn*Bj$^nEY{RaS$BIcBu`j=Y%A@uised4iFugNfjK!EJHje_ z#Svw4X|xY!UsF@BnOH4~SZ_0MA(>TY2?NV4PN=}tu%4*eiv ze}W?oF@V&nsh8}fD|+iSG3ysyVAj+9b6;<1j(RB3Mh5hI#`saU2PA({?^>A#Jv67^C>E>`E&foO0nv$O3uG1jt~}E^ zvVwpmAip7Ygjp@4J~(7?+Xptwe6~&9A}FLTND2KkJ_e=KWpqn3D>K`1>eS(WDw!pJ z-=IWIkBmUE>BtKoD`93I9J3+xjiI!sd{%O_%o=}g@Y=Sfzx)9R2Y}6Qh$=IVpF07Z z;t{dsf)${L_La+(Z+}SvljTC&Ca^-}JjGQpl_8i&VF*A@49ll~AwhRaaRvI)?c=MG zJUmnDmF>&I^iWkcp{j6b2+arJ)GZYyg@yEsUv1}KEbiaIHvjfcxCdlb2#U-DhvVBP*GE>f+{!4wN?1*&$>sF3b>ZC*>#AJ zd^Xv}&f$+5K+mBi0cOISMt;S^x7g088(;=$gzm@Hu2D_2x+_k!WI}sbz}@wV3H-AA zzkRIjW%}PA-0L6@&^E_5Qtn1rtT2~M>Dsm9h&W5(!QWv#p;y^PZJoB)n#!%}fC4NxJajikjAPPed5B z`P%EQdv1NA)p721-hO{7;#XZQ@IEnrYs)@~DyQ+Zbcx%tg zPtF~b9Z~dujD0JBdiJ7D@SZ4#A?6j*fYXtA28#3y=XD0I+g}&|`mNu)nFRD4hYh|>~OOsw8=v^N5>~bipOSvDie!s;IT*uOVdD0VP zyP=kgF0(UKp>_}f5QTv&#(Ij0jf&dmR+^W0H>G~Zb4urSH?FXUxnr+i=$a4ormpVd zJlk5HwMSXcwUYUbVo+58%|o-cAH8ZhOpz-rLdMl>-$v%vVi^V(AU<$25E9rFBo;>ag0HOua<7KT_10uN0;U+jY-A zIjf8XM&|UL;>=wjH$H2KS~8q<2mZDW|xsP4B|E~utQl7s4x zq6;XkmtqUmEJ{HH=eJm$%1>W8YLDm%}#0B9@(RWzJ1NK-}Y66hAU+_zqL!6t`ZF(NZ$3%WdGv@{&(E7iybcJ_kPxH zp3u1evX?tet01d))`WtJ*I0!`{pAUe_<6t_;s_ddda!qBr^WF)48+0#7(=o7Fu1is z{YsqlLBTPz^cw;1`Y#9rW9)U}$mgosw4|bp=6LPNSjjHE0L=gvDiWVteN+f;H+j*= zIdZUbKP6M`8vSl2*iCQB1Wch#iQK2w`{;Ol&tuyk-ophYiC0iuVi54_{!|yQ!Ea zce}Nd)*+>8AcN=G^H&^SAd`lcFvu6*Uux~!$s+$QH+B8d?!B9owfAg@dJpUKqE}s9BujGt+{0E-oj-zO75?&n;b)6cwDg}BOEz!bXM5D zcPE2y8?ima7q%p2vicA+UIK1taKYSsFV3TqH}JyqjIO~pv~`?bV+2I8`4!b~A2T;n zBt;No{JmF)aQYSvH9BUTH{Xr|b!BnvQ_{@?2~u3EwG_^|U8jH4@$BBK0>$Xx=Z4*8 z!KExZJe0fKCZQwRF7C2o8QL2_$)8b&u9rX>^Kg5&;Um0EZ1Uo7#SRX#2rdJi2y2PA zX;XQkX}c-#DxrjKLe2@Ng@oAHp{LcimGOMl%0)x=L;kE`RDXEZQhJPE^qsNI%f%XH zdRIeFUUqama==jDcagakN?mhLx#QQi7-+mVr?z9muPgg^qxjd%5O}W3j$VC_t@P>l zH{6SYj_k^^t>vxKDo59_&%IYL+oDLj2Rh(}98Lv}-eKY9Fm6lQ*@v~D9V^;S({5V? z9X~n(If5X3u%3&A(6=$+Nl#s6>%)q3 z)F$@ox8S`sW|zfXrdzH)*i!>d_t(-Oe8>g=Gkdc#6jQvNp#9w!{A$}LoFSCDIg?%E z-RSM@w0UBiDt&BBAcyxvt$4oL+FWyv6tbJ_VjF=-HxaD%bzgz?&ev=fcF!hLz+{9c z>OmbgzVyR|8Pdi1w2VOUad%a{u~~L-;j~9Nl^*7 zFpxGzTdLabHJRfVTON4RQ!Dnv_vpb#kPQEPw?wBFsTxC<0sg>3vUxxbYHQP>HDchH z_-c(JlHv9ilu)cgGY}c=YPF0LH!rZ&9V#86@5whx*j~qKvONN(li*SR6HGWF;$c~ST$BmCO%pLulmr4(BP-cR3LKffd zalaD$59WIS@lWpu5UIqFFNr#UobxlN)+D~=60kPilgk{!rhOc7=Iartd5gH5oSZ6N zi^!MuGXpCKH{h_?%6m%B!u8@?4dKlGhzQ&aUoi~WV+Z~NsxsGn8-dW$I6+K!?K3G+ zg@T=`hYGM_wP*nedjuxHt6Xer_w&Q!^l{kAx2XeX=Lg+aYu=u@Q1;ksdh4{4v8Jud zM^S$5MHmxW>UHp~e#dHfl zq}40xD^ak~PnRDmf&}S|!EhS}U2Ea~YV1_;RnJ0~RLj*sRYN}ocik8$gH3j6C zOqqqs*R>&s*sgn{a{xwKJIX!f_>h}lydC6?{sxq>XV6m|Mu>6}fyU@5y^hy?<|gtO zFX9(Ux=<&vy&1UblB=fkQJO0Zb#^^Df8sEeAayQdQ`N^=`VwuuKlLbmUtH@1Sxic& zO1V9DW^;N5qgFZ>oJw%p$jA#u&l^T%cCyujwCHhOCN3NqnDEr)n1Mlv{Ww8rpsr(c zYH-sGPVPw7T4o8-qfxpPzr4Nh7DTsSSu)vC_aOMn@i34rC=L(*HGb29VZq2WQixaR z%9NvPk674+gnKdGanOqC=;)e?E=Xb6%*j=zsqh`NT2xLK~xE zn2;QR!Y85%Zv}RAbO->`g}2i&_jRJ!VzSzx4nAr>1U!l2u4D7T0|o>?iNUNSU88Ez zFYfyFOVZosX5OLx=6)#%`8SP8s73sqZIsCwe}FlYgvI!{X4ahO=Z=DhcnjUhzEV0_ znK#+eosHMdoxB{%^WzR9Rvf9pB@WwaO$<2C_(8tJ!*x=j`;P1TU9=AuCpybAq97iq zECMlK5chED-{5z1XmTDPhTcRsoLS7*P@w$EiS|J6d$#O=-XmQm=~E5&u+8rR$r2U( z{qa3$c>b*0#!Qp*rD2bnfcRx_?lkcp*9_|y4}_O#0x$Y6ppK3lk2p#Mez{#mw^9z z*Z#IBQ7Qlf^r71j=h7N>P7tMO;ue|2F7GMk2hQPA>&bX8v6%=m$3VGZ-Ct4O3nT8Y zdU~$RPG*$PMwko}N;&d=U6}8F3kV*e=W4dRtS25BE9Fn7<8X4iY-=-ByC-uV95=ul zv$Xo4pd8x1(P&>vj9QRE9UH`Vz>#XfgRId1~JSiIO9ghCdu) zW@tc&$xBQaLlDN^g#uX8*MN%xQrW>rR4za*f>fJ?ghZ{c?A5DR*;O!FVQLof=vJV{ z(b*6I8Y|FRxn4Ojy}Iq)1`x#&gmEm5w3~?LiNfjG>Lx|4sVnn8yjLfhUw{W^Gdies@r1azcq!cR zm5=FVv5H#@it#UA3|(s7`*EOwe~=2PALv0ZtY`-`*$mQ}bY}P=&=uNoez`KHG1kVis#k(*uXZnsp4@+<=55BAhbJ!icxz5{+4*>Ln>h=Lvi78#372!EZxq>_ zFOs1=b}T%vid2Qv>LiiD{_95{Kl(}I!Ud+}OH9sFs%6%-45v@$CgdrvQe{}Tua!1! zh%{}u48$((Tiy2O|KATsLihBz)M0z-V=F#oD?Sc&ngd7q`)!od)X{PGvJ0<*+5;8n zIa{y8WLig|pJH(+1Bi;K5P`Y%Mcpl11OoR3k|ZNU96h-V^;c;5O@(@)rCxmu zC!R-*JL+rj{nu9a-{A_3zuJf>jZ`>J|59;x&jD6p_RTJ%ww7lnbhipMRnPk! zCxP%zA3;9+q3_kkXiDyO+u%n3U?n0X1x<5fEyo$Yx;K2$>u2K|Xt%W)Y(Ybr*W5-g z{&|0KvBwz-CxIokL(?sr$p+)%cVp&G1zxZ-@|zClxw-yQJ%rG;7A6iw@^JMl+NYaR z|L}?bq9l2CUyE#z%sV1j(`VCVs8iR( z=i_YoGKqs7qDwZeXNmww5QRDaU`qmGnhSYwl?{-JOD@$lG??wTt=%oWx_u%eqhCkw z62`8YG2@ms-6;2^E=LXMi3$|mwzNG)+Kzcgx5_yA_@UJED}sSRm-dOjxYKIyMT~Kt zxYhRZ4cg{I1>9K`>KyuVLJyg`H8p!ZfBfVmB25&RNDqIz!xz$De+b`&o(>+Y3{Pjq zw!5+~n{h{Vm87LH<(bm-bI}FM$Jx?f0`cgE(iVin%YZrdq{( zFHJy?$N2Q%b5FH;`Jt*btC5PgHDpy!vi4Y&K|gL| zqD-)*GLHK&Sfoz?i%`FNX9$BXoJMFDmo8n3OG{%RWl<+W*zC+1ui}O@%1qS1wA{R= zV7Gied_Apqvq(LA@m&`>-%#?g)344_v#&t5`GGQ<7xrAaLV-NP)8rz_*p@PUWql2u ze)n6iv6<&jMKTtiI6H6SGymOd|2x0()hJB?Z4Z{WNDXijb@+7sNoeTuyJ_mdC15gG zoNa7svezCpe}OV-*c*oN&& zrpsQkgMN<0A((DucAs}If_nV~a?QVL0rb!JIpXNp`AKXS{iZtdR3B;RznetE*T}ZK zxl-;)nx|^MqsjiiJoV$dp5+|jn`$7FPe`u{Ezm&HWva%`|A=MjkEAq?! zH=7Ya?;5(~j&j;S;i+iU&6{l6<8u&pr%}M=5ftOO4(~A?A1+1`z@S|r4lo zJT87U&yL;x9W>syz0f8A_t8?Ve0s3$`09sy_W)FVfi|=^q7u1G-q4U39ZfOgPn7gQ z5|aatJ^YT*F)~8o#amG>g$8dbE}Xs^owY1i-8z{}r@tCk1I9*tKGf!p?a+7UVB6s_ zIXNU_doj*7-gdjgTWj{mtkxUqeR}_97RXFFQN~|60q%Kwjvt@ERcbd?j;QOv1U^1U!_ZuHCw!5`{!@=UF6%Q{1U6}Fs<&9Zs=~a483v8Bv z^Y{zS1e@Af$^JW{F1K&gM~ZOqnJ3@YU>9d+ByFOgtRt28ra!YZ^usLev&eN6{mzDJ# z%q=eNG}m&}1`pq9C|6<01&vXj3B%9t=nr>U>wWm>qlbA~2};?*JejG|78-n_3BIg; zLxzg}*{wfcb}Wbz*H>HS`%^pj_wCmo9m@lcX^P}yVO%*RzyTH9O(MizQgOWS<~!Z&cY2Rl6euQ))6P88wdb@LC>8*~Rv)+}N=i!aGM%Sx zTkCF&t38mgJtj;2d4&ZecOO4aiG86BT9+|2e)7U<()!B!p$Y?ikq=#+eV}S7sjmpC zk_kfjvJ*BoSIp!UsJe8?>0ijiohcI-R!~2FAW@47=h-wmdUX7sc>Dlh6D%Ha;WJwU z{*DOOVEqU}gwX<$q>`>Ks+R5`t(XXKJ@3?_{l!F}*oEYj?Q?N)aj!l;mc52KOd~=L z!wF}xKJ4f9^{eh3=UN0{!UESK%Wh^%yXy~AeI0i4rT&Z$+6ltD&0t_v**}-{G*4>s zs`Y-{rK}UasX=wM3Yw$(rOO3NTS3L9ffF7o+GVs#Uz<~BxC)Kyu?E8!#{>lN7 zMwlSY=sDuB@J`TR)%6bJ zHpDmsttMm^`V2xQmWxM3XNTBPf`CwGN?BXapB*V(`UH$gx2YDzKlbGPL87D#VXBFI zaf;OZjE8DWI-~1e5yV#SuisF45}J2koLWdAY2GAq7{IJW&-aNoE6QAfB6&S|QL5-v z>~&v?ZqYIyYR#@a|22~NYrDRX1i2Z_5Ve30YAHL$4?OAB!wU$=)K3b;7zP-)F5hQa ze#iNmkn2vS+=@}Oz5GEufUq2HuvJ6p?~iO9oJfwZ`A z`%A+w_WTpkZByfip>`Y4p|Eh<)WufSU|M+I$XE_jS|X{}6y1{4C(2`Nd`FnOMAMjf z>}6t~iup%?r>@j=pj@~5$JMZfbtCq5(WF0I(1iSD;K=O_i@@c^O}iQb_s=kB>Du&F zVeMiED=iqgnC>6b_!^;1T$PLkh5;6}9& zOE!eX3yf62^8M?xp?IY1r@PkE85bM-{{F0-i7|-T{qx(`P zkqG%!g?fNEtT_=mqD$0lGi_&60*+No_XD!f{{mf>awa^2I4XQlyE56J5e3Px^zd?6 zqf0c($ev2*3Zpx+z&%j{CDw;ylQ7|trr+Yk2BfE!699V2CZvY9=Yfn*4 z)Ea*O=HBe5^21f_YPzm1E5T|y6fkbhYGL$U5Jt!OrR_vVC)>ssR$12N*uv=xrd^u` zpM9dL>FB5Z;jO{@QugNG7n{21DNZfuq8Dr6NjC2B)_}{F1ybt^c8--8Oo9VM><;rS zW7<09KPDmcM?NvC4OO6$i74A5YQa0Fp~~KQYY+fjGyjs^MtmVpGl_sQ+$al@re@p%96!er3v&aV-KhmWrj=RV@jMa#TC1#3#}X7 zZfjrc&wT4;)1|Dyluy5YHh8jBELwXjg(%5?^Ty?W02y#Q(d>%C71L6kKhV$q5rzn89m5C zta36;&iM=5`TF)3U0VBi8Bt)U5#KF-lOVQmLYdFSl0}1qGlVsq{J<^Wv5cGPKaREh zOi%(UH7R6|vK@V3!d7nYCP@GFq>mY&)7I8D?Jvx27=waBy8mHnp`7+V*eelV*(>g) zBweWIr>OA@>g!_En>owP8Lj5GJ|=CH&=3S)8kRC=V!zWeC`Y7X?~Neq!^;W3cEuFh$WWvI2SpDc^y8Pmoly;QPJscm8zOfdBcAAJs+cp)NPn{wr2)@80 zV{N^BpEI{Ziz^IJxFM*8&e4Tk$h+F{VMH+5fF)9tFN8G`#)a`fcHud~^rMVTxc}TY znPDfkN+aCfbtC3K6%Pm~l-evR5bCs@iLmk`=z5$BE$f?plIP^I*sj(aDXv%XkZ)TR zyM(T}tBpLOt`pqI)rtec(f%F@OZUzcwP={@NFHPrJ*v9avqhblJ;B&=Ls1YeGDy## zKTg~-S$Y(=E?}{_0@&hn0sf8bmnZVGS(CR5;@dgg-~Qn4#%75}375psUUp7oIlIM%66Q1sVilHu(w8Br0%S{X?5-F3 z`tVaw(@ZRSzWO89_7w#A0=2c$UW*X0(mQyZ=S)g!YOiTxULIdY@m%f7Fn<{ZYK{^0 zn2_Z{p2lFu`%>36EQZnosh%XJXX)!heOu7Zb-fb5I01EG>MC)eF{goIM@G^4FImv2 zhWjoVCg9^E5HYP+?Y+#U$y2N`wN=&7B_TpHr!6^_FuZ7u_ z>kGD>yV&RNC|fN>qLE?}Ab~gNA4H8ya{dE8_aGI#^{x@^rLKz^rTB3|DBgCw?#VU5 z1#GPZwEnzb&}wsaq^j6)9wFKzks-N=gXJ+25@`mGaX7k=*d0MAIwEV6*|YWBO-NL1 zQfD(d0#}j#>f8`x`&H;WLqt1*qG`9@KbBRfc%3lBe3jLqRZuRk>QEMV5I;w-b_-BV*&wL(g1kDjb?NXAGl1Rf5>!5=Cq|_|Z zgI3+pAZmnx%xmW*Y3vLY5Ab2z2VCYS}+F~E3>47^cr)N0_9E@7Cv$O2aYg@ z1Yk9Xn#p>js*dxSs`uHn%3DpO=i7~BC6;u+{^6`+*7`0^AhVB<)GcyhePy4d$3ZX1 zlcrZ>GCtQyBw`LwWe5_fdfLmFZM;h|OSN=B$g@v6cY6W`T!)}*sPQE-#JjdfaUaekw8=AC@ z^nBTw1@mGK`~xTWge8-E=T^sB@_A9ZPqYM|=^Thq@lrjh;YE$$Pfz~~>sd<=ivw%v(t5miQ93yKi$ZNW2H{DAm?ubP# zJJOFxN>b);9uzt4dFFsU-IXV_u$&k2f8CPOdD( zJ7;}BC_A!;{}4q;0#o$%P zfPnUn&I%7|MND{l04AirNyZuV_IX5?%=PYr5$uL~`3}2`w01VG%U>?a9H}8L7l0@_*Z_{l6RLH3bX6PzT*sqXeCx zoo#(oM%){wvNkBGsda`Wi(YYtx%U-FeihW=1|x&xOG`JOk5q{xvY5yuw<_^F^1n-n zlL%No=jt%mL}+I`OH>Y`_cQWJ3w3R@2Gz+(_Sr8uvWJN&EDCB_$I7hL?Us@!=VS)3 z=jZ_j-DCIW_Qs?fY8p6U`RW!*WCP3I`sN~Y+<0{~zi*zaGaNM2o>kMjj{0Cdh^q{tQ!_xvns{2O}dhVOD`jKLoKv_K2@vabKgSjIERMHsbjMhxNJy zqV)18GuM#N_QL}*41(<;x~lPLG}@4F>~%_AD0@KBOsO!)x?gPH+5jdp0ykliD_XxP zT6v$zXI-a2I+)iXY2vN4x8BBFOgw_$P7(C?#eFWNl8I4(TOcuZ2(EjfX;b8 zM4qyQgH)kZ;Fg~_{~cvVCCyHv!L`oN_>9txK@$c)isN-4n~lWG-OA@*pbiV_y#6EM z{y}RU4Tp3xqaX^M4RYVh6|v{;VWA@O5J0Sj!g5tLZ4!h{dWD>4yu|&z#{-Yfh!B}3K(F*2EOzxwaum2QZyS4 z>O-l6TW{?v^KN-g$F@F&2HNV%u$O6xx-MtjN|vUR(le(69|?;v;5ss&7+}8N^;!s_+i~IX ziULOYPu~eU|JiHWoC7e8thsUFED{5WY%KiIN)_$8)UERYT%up5%MkM{$N-B`*FG|} zZvq^#>fT{7a+uI;(&sYY03a$YZ~^^MtD&f|=5;&%%OKBVbbyDSzlu4S65!qdOre?^ z8Si2VEsK4R=gZd@34k>K4SVzXvyJpIzI%e$g^bqaoGp_InKq~F2Vpo%$71cf!{B?i z=DG%};?9OvsE)X>%-xw2F0s&-lsD~~wZRWIkj&m$4cDKKh#*E=SXfP-W%ol6$ONI~ zE_F?^F;sheoZf9;O5Jx?HS+y~kNH2$X3!=&&`L<>QGruGO2f;}RkqLOW6*x*Li}Sv(3z}I7jQQHIGjVv z*Bxlv{+$uqpDqY@eFC>rTcM&aNK@l(MFPe+J$XeV|CHy=@o57i}rCmxU!06 znoT{IPmjyIvz`$vYUDd_F+=|CtzNW<7En=czoP8QBTkLy5n@_8__>@UL_3}%?#%2H zpUg>7qr%T(r^OGdvPK#><71= z5^=7nVrcK7snd4lx@*WOLA*EhtyMO>uaLeWUio|o6iGRwqx47s_wdfg}%)iv#z4lI$ zvFhzvvMP-rHs3dU=%b=CqW^rUec!q*P(tFgF|2yf_dMvvI7*tAB~Hs%j9Q z8`RiGeb{(B0K@Nwui*=2I!kpVyDDn%mb}bfafcd|&wjdFFB`MFf*L$#Dt^wM;C*lG={frWf2hS5=7H8;8x|4Ad;Mw4Bh z#wmJg?Zf2!E_joGy^~Gjh2>*EC6xg?uE)(DvrC2is5Z4s&Ac)=n?9F+)z|s0ix8>tJQrj2y4}4HS8d;Be?dv}M z(zrbZX}msmEQdztvHZw8W{9=92oCujnytX-=6D6}H@_w_C2!WT(DuIThBB@^)Rl9y zX#;g+(a(0uJR1s#hEl(@DaNcT7`zR^PcQ-^pCT4cTT~~z_PnyNFjC+$uYgAgjk)y} z&;;AE_P_!0Y5DoGERY^E8=U?T$Muy6AS~$&jWz+oGsWCU8w=7}57jl9U)m_}hHudO zLSA>>Xw_SKWsO50688g^qPy)Yh3H!i9VAu|!bfD2$`_p8J4BMJ(HeB`#XXry!B(~a zM!sC?tHEP820!$Rc3&z}civcH#63`pnIE zsr67W1~%LyLez0Bj+`o|mmL2ud-;xoKdK1c8Q_qMe#9*Sy(O$AZn9 z1x_EKV+5+o_RkYF$66OBjvM3LM@k11MQv>64YCA>>jz0wqo4M_JKKLv4PC@@t8u`} zwBh8vOSrU)Wak;$OjT;`>o;kC-_7;szjEhBh7pF%C1fu|rd~~l6)c40Uk}h|2ZQh+uX?nNw8 zA6qRLcqt)kT}rffvzBby%>LdB?o8wS1pZ%zjt_93v9PqrX=QZg;pwQtQBF?;l_(mv zCb!xz8ULbk>S*AlAt?nW_TgYxt_)45rLi0fwtvFl$T<-9Pz9kd)mLRMf7Kas@;3Zo>cUbqCR$&_;l>iNp?61ln>*Q6QW#k4q9@QC9w_R1II;x z3h9|wyC8B|oV~b^iLhPeX8){*pTMyP((P6Y)v+{FqQv+R(DRZ<5qlZb>7;f{`%-2c zBWRERMEXxa;%T6X6wI4xe4P)!(5eXZ>VBAeuB005dh+$|t#Krhwn@^>AoLniHZNbd;51_cHmlm#lQs$y2 zSj03f&21>D4U#(+Fb0&MJ78^o+((g*B3po`oc0{)g-IWMjw4a59~6=Ca-P)2UT}$& zuwS;iE@RraDi=X{xR%))@)Gmxg#5kB)C1J1I!;ObQy9VsQY3xQ4E5WSyVwr-$GE9! z$%~P9`b)N&zo;-gWnm1X6clu16(4-aWv5~V<*t)ZW;tfqKu%p_u=NXrT+N{LJ-tkPQp(oKd3m%Cq;msxFH`37YOiqz z|Cz^~xli~7_cOhYCrW73KiVJUXw<@U6^{P{X7w|HD2Vr_FxsY5D~fN}06|gPzMll( zcPRvUeShq8%GhHT5UuxdQ3itN#u&HCiNj8_DH+cg2>*}0uMW#9-@;V{6&XcAL_t9j zL|R%J1EoPyT0y!*qy_v&QE36`RJx@;8WaVjyGud3q?_-qjWFY!@tiy74EK-wJoBH= zjP7smU#xi7yWU0CL(H>OxJA$)wMuar8XB_X31D>)sV1?K%(v)JehY;$Bjb(ReIK5Y zyQrmFGGf#I&Hx`3@K3d$-A3Y#zC;1M(%63`_3+9L2X@B<{cqcqNdcclbkZ}Q@h$ZF zcyKmUzBGDalos zn@bba*jMXN_Da&(iZM4WqbWVXp>A_hsbxAo=nQYvs+^~ktskzPD3y$tzsk}0uj9r1 zep`ULgb}Viw^eV>r#d_Pi6|l7zxOFS3m{4GjdMYYVkUxa%Xv;0E}_ABf`~LYPs;!i z#l=eqtAF3Eh?q@1-nqsPlVrxDN!i&dO)bC149#P7IIS!eE9P|ExOg`8R&*%@tm-SC z>>Cjod>yLf81&{%kV4%{-2GxuvXpV|_6rXzsouy*??;ejdZ4NfiIe&|SdQ&UQy}K} zpIH}wxNC?gu7z8PkRo=N?N~f)KZ433LKW%P{f7~ij+ElfRiwFQmIN6i5s`|cP&i)& zkhw{Qd)kMX(x=B*yOKunpM2Qw=^-{$%mb*{NM>hzVxntIOiYtKsAxJss}yoL=j*REv@buef6@Dy3#wY4I06#k z{Dqw8Ok_-{t@`?Ncf30#NfT+f4aep1w>S^>kW^eSv=RC{QF^A17Rm@9f>S4>FlN!U8w z>cpweE8#yL(9R>Yqn$@$!?HjzZ>u2}X{OTZ%tVn=JQ!=T6Mg(H@GR2g;c1$a-4hwc zIgmX1IBa6Icw9|hr9Yf}Q>I{`K)@CH4@{QaI&Q!8sjewQOO0{)@|PdzExPQ-7;-Wr z*3IR5e-r`!A9)M^L%@v~9u(Q}PbqTFWKZADk7I$Rl^f(18C2`~AD$gf#h>qQK`_Ph zH{3=_<n3D^xF0`|?;q9B*bujRfnJYs2Ej1vNoaC#$PygbzqWcd90&#h)7^G3ZL- za=0XmnU>lJzxavHO=+N(&mpd7JS3!dV}UY5Lw89$nngYEW7LpkT~jz-nfqxna(k8G zUMWX@&)(hmEI~Bxyh-I%^k~SH4e0(ZlY&-VuptP;0DFlG&g45o5H)N!Vk1w1G0NwX|tGQ<5lE-o&)Y0&qblS1e>0n;R7v!tkOo z@;7l21H%{T>5a^LrpQhsxn+EtrmchOPcsw)e>9dc6e$#&VJLY~v z^=wDoT9Z0H9`DLm9|SZngZn*DIBjo=(CPTd_V@}@+Ahoaf%6x{oUo4WcWPJ9jqkbO zUt5sS6Vyu&O_OC%02`>mGkHPUXKDEAaA@tjo5Q~38UtJ4*EOGTr{L=8>k zc9G2Am?6A*`i15>Y3kS3yW)#%Z=H=LUP_lI$200H;@bpbCz}Hd1_s)(mZue~R+-z@ zJ#W)+w5M0id0W1efr#OzUy11doS0@d%sPqA0~ii+QH%HWZ;l}~vi^8ta=i>+z)9Nk zbk!gpG6}Zm;J9q|b;80l5n*qne@;4(-)h2M z|DBVd#3}Yvfr4Kdmo8mQy$CSC*z6<1-SH4j#NhB(pXsh2l8u$*JF%W=ZD#q@FJy-4 zFjzKwiu-SEHIh0O_44hQ0$0#5@@}$q;rXx7aq=+E{aTCsnS`pCH{NfKF`N_4Uox$V zL3=I);gVGO3FwvwS^j-OC~WBd0H|^8UA$^Hei`N#ah+IySj#m$--=!YDy^Ub9&hf< zCUA3&=6+e>deT2ZgE*`3{c{fvSj2?}<(-H3L%%_AuICPpH_=%=jixbHyU-L0!C!L%k@u7tA zBUXC;ax5*c^Ni?}FRhd;w*=)keo;TH)UsOt#)~=H$U$Kt33uK|E~~xeR`^5@l{s&72ojK zCmWB{+&i^A6u#AAsx|#3U(6O?iFpeAYPcJVDHyTcH-R<=l6~(osPU01s+tMC^7SnZ z2U*#Vh`(M~Sm>}kz5!wJXG&C1UZmOQkzPH$i9n2mO2%XZ6hkZ;i%cKAeK;g<`lvZ| zS`n(zX@GK`88fSXO9Lmh?9FST^8+PZZ8{R;ieVdHz-j166qD@ES}UE^JxgBOVSD2a z0ADlUxl#vtkTTIQPv0rTh8C>Un0j=pz9!&{G=Tv@wpTv~`a2Ry8LivMoqm+1y>gF& zNVamQJorQbXW8N4Rt6OgC}{#@WNek3{@SO}uFfz!Md0}9WWM9Ysv94yP3BCwn|6hC;v9N-wFC2y2oZ~8ctg*u$w{Q z#&W`*S-pEfm5qZvQyEi+LNN7$kY73Z$)uN{h-JMri#gqAx9l34J}U9_J|D1kyjifrcwOSg%tG7dvw{SI5s2 z9t6oQD{ir17-EGoAwnFU)S<`R4o}Kr%9cgADGj>6joy;EyuSQeQX{qdzG=Ssp}FTD zp`rwiKlTt(;SW>J)9%63DUQ6^HqGSfd4_|%7rlIbCPTh&l8NEWI2eYkr*elGFLEtb z??!J2lqi?5DvC_694MrEd*}VH?S*z$k3b~Ele2JC!b^Oyd%O<&YLCO(-7j`F%K;{UKVRZ@yvfM3snzJ$# ztOXr3X%7Y~w{+sEsHpgC3y^lI*VmrgKE!v0xF>=Sb^VPNX32lzPEY(|;^k!nKpU&$ zpGjJCn3klUWQm)9s=M!t&cs`ND9~e6z7BrHe+qM=e=${A_p)wMnjW7om@9!TA-EsMM{Xaw!D*rwze)4<(aP*{EGSxCgaQzvC!CVg?*eM1 z8yQM!pf2*64F`~@(fUJ?V$@b~Q*uI(U@jpL6#8DPJT+g{;(=iaN5e<+fIkbZly*40 zY8u1z+f2JGsV(PT{KUYWY>~&3o2e)FXkxb5J(tkWQag`pcg`6TSi~`^5u=Wnh={Y$ zF0)jMw6eh*YfsQVq#CLX?HZqSob?HJ3*VfNy#?wM<91yy_n=TrZ=rP1$#)O>J8xee z^oHb2eme*p+1i}DACpdU8g=5zfwiB za|EXctz(B5$~V}!oIz*e3;+>Y(O{fIQ&r(P?vJrKK1iIc+|>HchjiK#%p+P$BM6yeomiKUW~F}L1|(}Yc!nA71a;zXaN}d zcb^1O8nwIK-&=v5gmzv46|&jp?Xep94q$xZz-eR%M$UB6TK`%f#sSGs8|v%pBPYpt ztuIX#%+>mt0q{u6quQb48W_YiP-~ho^zZHuQcMs+}fG!8r(r+9Yb>G@ED<7)ggsowmrs~jDF!EIQx8CE9vu{oN zirRs2o~D2Y^6oHMy?pBDh0X{b9Ce44!MKD3ZuHmoG_CV<3p)*j_aYx-Jm^XLg2H}c zzkm4!1t#*{JlVHg#@XMhV(Pv&U0h6cbPVZc zZfgyhE*U0&OsF0A5oS`7Y{yvat)ND&p zn@Kjcr>g}l&!Il>$7%9^`1JqY*ug&+!~n6PrMS2J@t98g+q*OIQuee6M2~P09&OHr zd!}iXABkh-{(FPO|CRVLl7KKwD+AkTP`T#AWenNf1gQnHqFX(rFn@pv6v?@>o@xHq z*84xL$EBCZIgQE=#L*RrmN7$^t4R54aBy%2;RC0EL#7{5b^XaH`7d3_4=)jc=mcA& z#fZ_5fafA%M8 zY#2b79{W}>_?&DOClq{z}RJPo|Xg=B+(UvcDA(~0^w5NA*b1e}6-v>zJtgvf2a#6n zt?|Uj%7R63D=8_VA5{(h?XT{}A3j^4LfV+rj(uW8U3s#<^Z@@i)P#X@_FsJTGNvOR z|00(QtJ&u)Z|SqR=dK@raZ!+<ckVqlD9Kw_z#+j|I^v?rj&NT{p;`X*X#l`@#4xc zMHqVjkENx5_bX>{U-kXF|M$OMc0s0tnFVXTt$33LJFi<$hsD6aFxhDj%`r^@%vzn3 zbAyi9(23mY(GR1cg<%YbMZSH}&U%wv`J9lLA_|=F*;#4MPQ6Gro7t~}efB@KKY~(+ zkTD-(**%tBv$R@|W&t)-+w9qy$R>Cm*V-e=E5+~jPl6&g|99s-dA-X#$@@}dO$%mj zKxG?VB0>p!KbzRi`QL^~M(E-URR=bulu(KJWv)%_j7^KUg?)laJ*2a@t2&e?`_@%r z>YuIU;ft3XH`gt-yg@BU@16R=!0kQ!CV*u;tgPiZ{Apch=gWUQH_~*dlb#coZzv_A zv!D?irHnV9Xz`@;Z;xyESAQ-nLU6Gd=P%lapKifV4kb$c=u2ojuV>$t3hxig@)Emp z`X|5lvxWc5pAVj>6v+mMcet(W29NBGQp?|kdfBd+=@bmv1S!5~yIkNv8 z*@Ixa|39*_%DwicA9k$N$X2LfADy_m8=lAh_UFYeQ>lS+&*=|X?hOkL?#E&-{v?q2 zTXwUC3W7UxHL1dFe1ZjA-Dc4DIOZ8XdDyMSyMMuS{RQhPXo0g|F55P1XJqu4*qx6r zjyXMEUGTxnC`xhD`@zTl*QRsl_XYb1^=y_3lcz0IJio6Js)+HD#aMIZy$bK5T4^)} z1Tl=Q)pJlG^p|Bo>rH1&zZhC8f6%-3@aJ0*Q$!QPD2z8jV z`%*H%_#HUDBl4)dg`Ih2JO8|p5)n30H^?;$cE^?mWLdxa$cr6;nGCEm2bK>UCiz+v zQ@6=60*L9qIy%%CUQ*sz!X$~3c@4X}2%2!l>HD|n-P>E|#Xj3Y$n%qbR7U(Q%y2;C z8XttCIfBLg`IEbwmBL%b#XP2zZfTb1{PWNMt^X476rM|UXJS`qRzIw!oGb(r24e{b zdHaX|{I~8N@*L@_@LYmRmOEEtH*Ny*iU~Auuh}hHB>r>U^#dGjn+>`k`=X1B# zqUAV*>_K^ByS;C(kW>KMt!KC1bNe|cgF}$POQ>1w_P&W8gy&+Fnf%uq^PlaeAUh5v zDj_B%Z@2f23W7G;y;{3|jf1ef1?6QGcYWUo;JLn!({}wDOF&|j5TCf~`-Th86@Ayf z+t>IX^ZVV5|GzW8^|MoUD(s}vzugQ$twYseeEDjh6H|+9$NGvL{}6!OX?P=~4?&e!=Yf(~ zhI5zCLm2_M;YoVsd&v7w1pGZB{eKq$zk7UP z^!e`zQFHve5+}SS;P^Izpg`lxbJrSS(m~TZ)V^^x<-6NMNo#12E}fqnu0v$zBWD)`rXj z4>lV5Mq*)R1XnWDAqpjr4tSXst?1m{w1hbt^woVSZ%YmplKJo14kgA8(yJHbT%tZz zcmFVmpO=Q*>)toNcw#QfEd2PA_%SKI>b}-Qc~cF#$)5nPadKzNHZ>(#%DTt2C0Tl@ z^w0R81~BFBCP$f=on7M1#9odDepC@Xq+c3EtE-aRo8b12%3sx69rPX(%JbjvCyk2q z;7(AP!(uS6j&OZ_$p+1EXd&D6IbcA(LMWKgKulyB1(jO6Abxw>b2Rc@&suM5 zxo1KXbqvgC<~w8k8Kn|Q6nmIZH4dqSn2|LoG;Hx)+Y(2 zc)fJ$!g^A2h5ZD8terWD?m97KiqA9g_V`>G$M#wbV{6V7z;tq}QJ4xNhD^)$>&aa9Be80Wj6WTtS3DXf!ep}{3!z?Xq`V5@ zZ$r?3$4OSv??7%|`gVJC=OL&HN*p-zP&zv(7o>Wi@Yf?3;PYsrTDsI=x_SG^kbrE@r zcFKIuq}ehAdEYpK7&Kr&4>s%jdrY^(zk9*dg&LJLtg(ZEYYWS1LMie)Js67IP=8BokLcj$(PYCcM*&PGNX z*$Dpl~WXpw#^!g-;9602NgyLnA9+R%Z`L$oXa~H z9Nj2Nmn`R-$D`2cODUaNv}&+gt;;Way9KRf@B7H}xwEGLjL85!>c!=KwZ0!;({G27 z_9*KBP6e|U2ZFTN+dLAf_PCR^Ss@>ZG;exj8IbHmoViZ@j9;3g*ah`-)Up+vbb=Qm06Fl1e@ z7%_+lHbyIbIrkNgQf{I=s2+REoUxlo8yyNZRWC}VBkzAdE-S^CbxFO`2mi7a)rsNn zan20_WW*WXT+~csvnx|kJI#EUBo-0gAV?K!h(MRml1wySRW#~;<@4S0#(n3UC-u-~SWDXpXMPdrHCd5GB90d$@vg}Eq zmP}DhG_eu~8ls#~goeCQU`w0bqBwQE+NVr3x2J>5aeXEe_RoT6uiVF@T1Cjn*iPMW`^FG1^E^U7 z^DXREunDg^fcbo= zdwoja=|bfEkGpn1$6w+n!TY7OcserI)Frt?yjVw8@ ziF}ygth7B+Vlz#`*|wX4+1O(X_>2DPwlwzNo{)>z@y>~7ueQy6y^S*!j#&wB3N|h3 zn0Yz<-RcXTImC@|SBNyf_*_jLGbCSR=Y{eD;;%vp=23$AXb~&Eu-0NC&~ybf>U+ zwM@NR++%T(jo{tuqYc7gMoY=Mu~cvmw8f1+-giV8kC<^nMpg2A|22eP=5LVOaL7E0 zS#cGS^e*kzPe$ihT^@;eA|q9x*TOL*tVdM$w6y=RT8HM|aKv&IZY*}GaTR{$f-lAd zQsC+;B1TA5BjnHHo9a2SZEb5)W1C~d1HBg^WMDIFO$tj$m#%|08X0S*s6^M7h=$Q9 z%u<|c2++Ly?ZXisdSEsit0iQsC+9VvqCR$3niY1z;70Pfon2_d9-F~m{Pgy;^WiW! zPbn6(14-b0FWC>VhF4r1+*SmwSHUuKBuH#C$!bw>YB4Ha>zR(RKAF?1l6mUM7|dl( z5p9Bv)XxKvecF^YLya|MPwK}8hx^?*dxgn2MsDV3RSFrZIs62PkF!GAhR_vG<&8HY zLr)UwOL>hbe+}ZcFc4ZOKRNpT0D1QSzDo9fGdNSRa50h>62EKeM${eph$oCFQgg#K zZB#Z)_VwUfY-cm;|5!phmvw^+wAPy@v-{M2b+pWaZoi0aye(2U$nDI7KDq>3 z=ak%`pyT+`UEAUF>d^omE0fhxDaWA2Ub|^Fm>yH|c~#4kuQl<#P~#`iRVYkgegv}# zlP}H;7B2M$Q_v`|XWzD1!qZs3${zfCvGVuBjZ!DJhWRBjltQVolQ02JqMY3Ek^D6m z2gfnla8Za0O{Oes`Ifpt#0!6rvCsEcWbzUcp|$(UhJHVUBYwByuzHBPb2L)y%i-(fGA7+*upY282iJSNa;2f2Q9HN*N*ByR%!?j z&2qI|eqGR$zjfd^1Y>R3%Dr~CDKSkHg(qU3?bN006+a`EIS+bK4Y4jctS?s+_4mIY zlHll|gbpDlMe+QJbg@ooS8RHF_k&Cl!4fpMvh%^Tn%r7@aa#=Q0)0cFvLkTF;$rraL93r?^`z8q4qCjPYvpBtWJq5JX4LNp^V z;HxBZguY^Evb_()JbjQ9 z78_43cDQTA?w#NFpo$^4TqzaOw|V&I1zCCC0$K&EW&Fg9s<+d{$5BW^1Ykn04x zUA}5xx^~r}I?YnmyJ5yK#Y<{NygkS;MM?PWso~N&6&v+)FKk*7rGDdrnX*6$HOGyG zQ8;on^u#!twH!mXoLxp4nI(tlqO4ytO!@TZX7YBy49(&gSh%c~M0tg7kDp|W%io?0 zguqN+foAOfPvTv zq*ZI#7$rh8#F2s4^5D9#0l8frt-@upY2%MMUq5-BZ-RgiOgSiyg*Wz>x|)YNLT(Yg z@5tq|8AgT5+WS-Bj#-?$6v6oBDE7`t78F4xCDnMTj>6a`8XE_y1N~RNCQZWhqgyH1 z(gS7gVv6cE+F$nedsvi_m#58kL7N^onZeS=%5b>U%>6mYn7T}gnYKNiF%B}+aL9o< zUct+LT&1tFr5UWDYL4QVb$A9yZq56IuTzGam7NN-hfH<6krU>@RX6v;p0WH%$yIMZD#@mCWI#VQ)Q^JM zPRdBB;xGlx6}NG1W3NwV%LqY1be`>w;9B`yul0cY!%CXi<3oZOW6sR=ydn{dc_VL- zAHO>a@!12+yyQ>AfCzjZeVaco-lxw9-4qmlz9g##oEBQ^+F!SpmS zn=Zy|8e3Vdp1#413Cuze4|^DsAVD2wtvQy{UIUumO}Nw{(gFKw5{41Iq!dc&1wG#u z&T`zNapWk!C50L5Y=@F#xWApBo2XAO*I}6I!|uZMHDC{iuL#!8mh`M2t=O*_-3i&6 zU$Xv(!<^!2GE1yM^AVF64-pC_bFZb^!5P41iH6l z35hYF&6ZJ4!UguD3EXr0;HqX;2eVe$y)g1gWV)@k;^P9B+^;}ACj(ZJNDUez1%U1$zjGdutn%ioV9f>oRiOUyf1mdk7q(c@1I0l zM$o3D8(MuJI=fxd^&!~X=#%3NS>Wrn;EY<*CfTYah)?vH*YU|xe$4*OOf9537QQr! zqh2FPd}j)&s9MS_lFuWq?e^k2k)kT|2K*HXSS90%2d0G!3bv4i|{66`u^miDt7ueEiXYb@P)bZU3!C*G~Fk)u*mB!$X7}d*vJ5s;@A@k73 zk|vl?qQB8Os!QqUSz2V6R+>)KF?#W4ck=wIf`V8<5J-s9ZR5V|_-_C_Y&|S+vwp zU@z7Jb$#mQRf1!`9VwM9lk_THp$kIB&_GB~R^H<~NOSC!d*%E*u>Z1re75 zGaO7L4npx$9(4{*M+aQafr%-V5Y=;QmaZo4-MC0x5n4GYqmgs}?hxg*W^;g9{3Z4f zX=_G1)1?-qH_;NBB~6|nBu5mWy43i8VzTu}w#2#vAog=O#9Z zZHV+pbtN*{6L~A5>kndiMQF||$+cG9vl%jde&kG$1B?OHEpwPiE05VS<*KdrGGNUu z#dp^0hscSa32DAd$Bn>>^P7wEa?^kRD^6hB;A|BW6Qn`3FFwv|Y|i?#jw4AH>+qGi z>b@K4Nxajp>mUsw;bl!{SViKWq?nlGz$N)ua04NR3bb9N+%`feNh}1A_1ehpMgwU1 zil;lWJe_RBxs0Rl(cJMszd56ubELAm-7$S1PR888F;P?f}BD9KXCR{>-7{3(vfg~2V8r>DKFUHQtp1tNfo$7lW= zqM!X%jHNUebt|b=BqE(r(v7vuS}Sbwm8_%)hy#IQY}pkpx76lE_?_sJ=c_ws3+Mx_bMC9}NFIO4RyaSFfdQgOX{qq6y=jJ1?pX>O!f0dotV;;{SrRwYEdjl&D zY)ZDAdDW6XDV>`7r|*re^R95i2UT6j zywPf~j&vqhWFGl6 zIMC|-{h^HoD)hWzy2pa2JTRpvr(jvbIFnxKCb1Dnm7X?T`7Ll6|yII0tc01rK-Pz#;(QV~4{X zi|#w#U+mD01x!2T!Y!fn_-HYI9kj2mOoTC2Pa$=Md9jT9S`@g-A=|h-V-5SC@Yd@O zVkwzNX9ZCoPK1SqpvjyBq+v}uVPJDK$WH0_F|{nNugo?1Nbv{SIIx#diIu)O<+g2( z-s5n9Gkw^p9~l+7 z8&H8))L{m0S-zwL(38AYdF+HoDG$ZQ+EN{ToAPJ-X*-P+p%>aMld4` zN4IfG(e0P_JoI&CU#@~ua4eKTkS=8#Dm1v>Y8GQWw=+EG$+kD)8lWRH4?diTPiMPv zSsGx@W1}z5dk7;@_?Npw;8__$A5_w?tMR?b@&_G80c8QdbuOqhxR7Uh5T~T0Ax?U8 z&x_a0ICs#Gs*cI>j*kR~2%Kmd%ebDR2WDAxBRk#FKb$nweu7UybjdBe`Y$B{S*T=F z8f_XOg?Xi$GCxyJ&eUm{!~)sO^a;LJOWkO*!BceZxdTpk#AxHP?`4(xS6DNcg(Ex@ zFG2V-WF9so7VtF=HPBbk2eoy{efxijvk6aRraomJ;PsHkkj_HNn>r?7$*uGmZO3f4 z&np+2&I&@s=(+5ktb`~`<)1N)c{St?yo{G8h~u4$V)jsW=c>4URJS8tTgdU)>__?0 zWev&& zg%OvDW)*_)-8`fP+Gr(xRpxi(xXI~g88hwB;h(i0*%Ae?>PiN|#GQ_B1`Bu0vAuME zNokxZj?3m5@V3!Z*7lWegdaR5G1xo%@i@Lpc}|-9miJ`Kf2UY9>pTB$AAS^T9WSdf z7`VdD9^?)nh(I8-oUIg#^|#NeOL);!e4>-4qlVP?Mz9lFPNg{yA44{A}NI@JNUGE;cXT5MHW zq{IvF${(+n#KIm^h*%VR+05YdZs(~B)Grqf*iot}LxHdhhu>vDal@yomQUc4u$>B+o0Q>7+_Tvt1jIqpv4TZ1+2axUR9pRQ0iW z@3ot5CvVN(9!CXShZKpk5nu*oDr^f{X$k;uZ7UY~nCD%Bxe8G2T;A6r@iTsYcwbi0=r)C(>jK7ILZ zP46q%KybcUMwo?^dZqUz}S!!J(Fh*0g{?xrI-q2NkrZ}J90+Z zIyN)k^|7wIlh9GMGrERtC;|;l$@GDO`^Rfo_uo#=8ME0$2gu_pL*r4LEI^QB^Cbse zw_~lNSW1p>Z-Q6Odk9H;3}$m0BZ&w|2XPoH-zoqVSDvj};>&ACM3A~537ejD!-Htc zT43Wh!*EWm`w@){WF^jyGlG)23{q=@wN_hdTL+Ko9YOIfVXFnk-3I8_e6LVRtcOSh ze5$V^T}pvS+ye{g?c4K*Z?!o4l3j}A+>HmZCrQ^*F{`N$Hx}ArMSLzqgBh`O;UA)a z=`w4XQ16J9del!C-$0+$<>^V!TjL#6?swR{*y)D1Qd42K+v%a05*H9o{-`PRa zXaZ?;+$tM_XFG%&;j$nwBnJhrx`z;mMI(s3lhrL(U%D)&y4xzr!<1T6i{RgBh-POu zT3ix&w4z&FXt>pAQwGmMcS8M9gkkzon_#=ezFrY0ee;gz0S}~pQL?W_>4ulZ3OI1O zb=RlTNG8JlYTOx~%3sNPR_}qRH&*(&zBAz-DXJ?U8(=F&0J)zi|%lTx1?P2PV0b27 zAipBw><;Cs*jn!qy)-*-Ua&T>8n?w8!oFLqKfoY@WT?@mSbv5^AP>YM&K_}7M!A=l zOYoSYkSyn$X{0-cC;+%7cfW*G6JoLj!_v^obs|H~G$pADs?lW_F!IR(Z;GS)U9_N% zPIB{XA1TJYiQH?)R|J7E!(W7Sen@mr%P}f~tD#VD)n8wpp&9#>!JwRZQ7VnmS6^Sh zf|sd|{C%MB#qR{ggKPzYSyLjj-JaO$1>aiC5*5T7B*SuCN}HK)^!?eC9Axtr45O4z zdt)4}r3E1+9eofhpoz}Wq zJYn1-uRiu>SKlTFLLsSvC_00D;c}~jCo|5y!ZKeQFCu{dQ;?J*vFhB8;MO|~ZgW5W zq8Q8<~>-x%EJ>?qKtS6t*D| zq|6kCtIF`lcm0DEh5t43Mv!4xWEx^nsF~QuyLxeN-L?dz1(jW{Wv9n}WG_k-bB7$P zx-x*mO$cv=#4yQASy`FZr5{2}38?a_2n}ES&tiM1uoD5ms}Yj-ILZ9xk`&2YG_$F4 z1^WJ%5(J`@d*G7(e|cFmlxQE#8J2s(h>@K`vsEKPjUk!* z8O5xfeaK@t>i7@@fXcMToeu;^UH0SR@sQbDL_h`CvDv;s4afVV6)LF{yeKpk-YBdZHN1>Kh>-X1d>(}nDb2EHC0KM8p6Mn zTNjN~e0yo^;kGq4$ICV$6)@}K`R4-bcU?-&XN|z1h#@vG>`f(N1q2Txr+shwg`H!m zcI+}7!X^j+eKdW$plOc#4Ai#u0fJCfZ#ki2{*4x!T}E+q*~qxeox7I#2Uyq zA_yDqb%(>oYBY?9V(I}2(@_A=@7-0;H0FhPUl{m7DoDjPY!hL4-Tm+^i%)fS4rpaE zvjP7XshBwQy9(;?5cNkhrEJEXne!kAoAk|Lh@D zgE8NYklo)}CN`#5N~c7?BZLa1pIW$B*pEq@`sg$+op{b z8)j>eLWtB)R0j=|;?##l#@SPp)k%%NBjC!1EG_Yc}ID9^Xxmo+=_N zsJ&mK#?D)FlDJEe{*yteZ}x^JbH~kpJ(}RLm=3zZAx;LH5U3Ukbjw&4n_-$n_D#iv z4Vkqr%Ls@_NNqYzNnIgz-ZDZ<*t45`UA`F19Hj&;JGGL5V^^;wRSbIAe@!)M!#0by z4gY7SR!;0}9Zxi{1H@pcwPftkS+^{C^Yzr*Yo&`8ht>uBBY`maC@lyA6ayzd+FMaT^(58Vq#y5z%hLuq%IiSBS zMo!g!L{U_6D<&3{z?lV{ZLj<|1|zd1W7D62tdpkMpmRKD(i@4L=<$c+2aL|Ip=~hC zTRpaZ=O|17x6vT>vg>4=f@-Oz%i?%LLV&A1AU4^^A+Wcj0 zZIMU$v-0c+d^d#6m5GMQSqibM+*2c!s`_E(qVAZ%Q~uSmj*4ryXj42ER<_yE^KOCw z>_$d!r6))P|E5-x>Cx7w+YWJF`B?-`@$h5 z-ry-?W-$@D!zQacm;|XryhicFP0ZY;MlVT=HAzk0G>&(o}FJL#CEER`_5jtFk`0Kif|;@Y|sJ$O&Skm zPM?rx_EZSKD&ys<0s{{BfxH)}EmrDl^X6wBb4>g*fG=W>z(Qb&>@+Rjkf}bWkUj`& z%^^i%{84oj+_@||SF{}SCY^V{JG$)pTy5VCRUl+DklJ-UvC%vfD;#9Hebx=Ak0y4N z&&kdhDdaMYTbG+_nXnUF(Ke1AqMYmYksIfamlIzw@YSC9GZZPVP73PbpT==>^&T2|^)6}v zr+o{+i0&e<0@`ye4AMUS3?oB8r6`fQv*C=S!1^7@+f9+S)4 zxA#Y%AEAz|IoO00Em=c={GIgX;|-+I!;^niPp7bSNMs-i_Q2NzBc!2Q9pH6JHr}iP z?qLidb$p0qg!cH099j`9Uus;%_<+j&{b#p7+87l;TrE0#9J7Q_T64Q9S*f}1e}2aX zDC!#vgf=6zl-;ue_BtkBHa4aq@@96A+M;<_PGyfzbEJI75gy^+PW8?o>W6U<@&&@g zOFvS6+FU#c!lMqBGx$1%!$$|TetFwYaS2iIB7tPf#fq)cn3+*Y1h8^*ZzM#nxS+KO zr@Bcd2S+ma5M* zRlCQou6NCGaYdVFqe$BdGQsO8}^%iB$%9Zk+#w^ol?zo>o>qVVE&ZiKKmKjY!S@kIpofxpB%!&qQO) z%-aP(-o(1}|HTX25BfMZbm%*e7Xy&>Lrr3;E_0`mIBKo(^$vD)dJ z-n)m1$D9HheTCEsom$~ zlw^U45gZc`98x?#-6vrG^*nhX<{MVHN-lmteS=icVgaHBUS(c^O6Kg>05a5I9iyf;sN)ckW_4v-azf5>UeG1Am zE;wTHqP}S6*J+S7>VB9^uVU=TyHN^G-4AJ@DJzdjMS7t~cr1;YIBuF>`V5pOJiNz0sZ0a68oIq$uFoNQaxE|RxwVa^ z6pG&g@*NG?OBy=h_9#b_mm|;#!dZ=0>qX4MggyqC(z&Ca%_Ap=xuKSl!AwHbFh!@a zB08Y8$nM~#xH1!bIO5mu=kRtvHo}x{G_)NG)NO>}92qmcZ zKVt$7A!Ch}4WAMITJ+cbNb3d2PrGxwcVwIP&3Du}pquh?@ynjPcZ9n)#NRx+hEzzA zbUU=Q#{TWZrbaj2WK(Qix7rD741k?6O!lK0pnPRWrAwXcl#Elabhk@Fyi@`017t`{LWnAgDpw}z#d}?1*e8IH-*aAnNLJK zIU)*hPrz^h{&v1Ned5IdTcoZf^K`lBm}~q4Dob<^66sL>N=tY1jD?(%&%=+$ZoWd> z0GXhvc)hBaP2%gRzG%e1HQr9UnB3t(8ol>tC!>BJ&Jq6Q&!UmsLjhWi5Jz=oP68Rj zpC8CcoW@OYybz6%n)28uKfo}NnRwj0g*2eWGWUP}fG?mrs;*hRg^zVyzQug+)5}aa zDeT6CzQ~9`q`fI)5^7|dO!i;!G=yO|PilrS2@%5dbcwos==-sFtA~9=VEtPoVYz1X zGTE^{2^gJ4mnz=3Qfu{~#oWUb5HJ12WOUFRpwMY)cQOuJkl6RHmF1WXkdPBwyW4aL zUtB<(urwzI1>om{&1-m%rkoY`-on?mRZ&%2=9r4)=Z>>V}rAyD53Xszl}~&^u*~mWX^)xlv3z zmyOgGTR$LFUHzx(pBFaQrlo*#(~VU=EP$}HH8(2O%ZE3+Ws`@AjJc}Ed{8N_$?^ga)>a{ET>2#_4Z&Q^enI0Cv*xV&*J87PC3ut))(STa83tdfym^p*( z#ylyjjyM9_>8pHqJc6c#$gC|Fu6OC<{LpJaMdaLE{bxA=hSgcl1I#x`g6j|o zwg8}X)DmkvVn!Hlp-9JzO5dtN#+3P3|C(4wNrtr!0|DnoKXGwDQ)&h$vm$9;^TFZHL}co&%^ZC`;OXg6J~$(h?hJC zyIJ!AsTt;K?Rbu)S2?j9py#SN1BrYhIrlZINi~qKXh6J z@--K(Zmt6?rU+{Lf7*NVxSsRxeV9fwln4=}veQaC?II#7ZPFr2RBuVUHpxtuQWT}4 zMY}fACQ+oawC`!9v}%zymFm8p4f8cKCO-FP?#KQ6K7Rc-6Xo6O^?aUlo$H))UAo#? ze7Ri8mpChAvPL}duD4|8)ap~DR&|0ma~)viJzo=Z9Oy!(_TJtnda*lV*lf^#Zp~|u zzahjePTD%m4Ig)iw1-~ZU3n&JGkk)^xx0W1^UtZ24Q!7t*-8OCzPzBVZ)n2Q`_6`U zyiWz`zwj(Q3#;_6Yjcd<)n3X$$k*QYg+|e_YD_#Mgui( zBR%t*xy(W_>YSS#wOIlyVI0)|_Nu~?i`V_KFJD_jR+~82dmAl@j_F2I#>{2ZR`n%` z6YRXoUh4VP+jS#kPh_A=BTP{%Ly|3VntVI-ZEXueHqHL>Se)dkSIt!NVms$=b)xt6 zFhmg9U8w+Vgvbvz zy4L64JUzevM1%ZOU_A@W$D#u}FF@k&fYjCbvE*2(WEl1y!ZmZ4UeJvC-}EgWak$HUW!0B8Fv=`z!;bkN^J3gMuHkN8I!Tb&=o_ zTSKY1qmzM|kKI&}o1daUDaDv)W&I0{w#7FX)i8~F*e+O=_ImUmyP1C!3Q`*vwZTU4 z0er8F0!L{p?hg%(^*Ck-%v1^K!aVyU($CVEkkL)(g!=NM4_kEgZh@taFdMaE&Z(}6 zdT0MMX}w00UDwEh6-AFw(p%n1`GYvsSf~!Jw1g~$k6s>JB)@qj3wCV0o+YCqPU*j* z(END9VCnn?WM7?#BE(vi2mJoI+GMDJj!Z}g$|$W70gYc98?D)Ica&ev3eCaz@qV$l zT1g7gUAo6y(dD^aexmvt?Pm3uS!8-X5%oi*NK2JZ3pgFa_QTCfDXcFfjLJ)74F29HEsCC{yXc1cGkOZwJ1%tOt*Wt7#t{e!$p!6XEuncZ zR%rR=lZQ+j_z!BblLauBfjCPq!Zoniy!GIA?zyeTt^Hfhtl3;d>ukxKw5wm5pHZ|V zI>-`@1>=je+$BcZ`s;d5r;MB#FE&GNx}!gojKjzv`4E4j()(HH7v^tt7FiY1usWd* z_)S`MzUSsHE8*)1zj4A3)mU<3tyykTZA#431v{5}>~LdJt=J>}cjV!eeee)T=a^U7 zS&dqhlIl&yg-GWs7W^J}HW^baMHZ_w%ivi53P0H8mm7N4Rcg8WcCx7EK6;w-vjnoN zL-e+i&+k2Q-yyi6wUd<{rcH+)OH1ZnTp!>gzoq$FawGB1GepnEB-$iF-s2&Ez%9x# zZ84QC)-}K zuu)B0wCm2P8Es^7ss!Wlej|fjHM?9tY{TA_*#mS38LRVuL;od=4LeipR?f;_evay_ zM!P+=os!5ZQ37|$jv}GUllfW~7J+JR!b14LuT0~ZzhyNYuR?};|Kdg5vE9c-73kT% z{DiN94A)UA-iT#$DoI!;ch^~Hgb-0=sgw+Q8fRY`JzYFOOuID4MBfolduv<0 zDv07_5yhrbDbe#-jiJ)mNaz#OFva)5?1^-s(7h zOp}>L=x6$dZH=RvYa+GO^W#@~%mtqpJFsNCH;8VjGvifgFNa_xK6sZ?NZ1kI zy=tLKws?w_T;3?FMbXymvHZ9^W1#{z=HV=#0xY{Yx&l| z2)P?XFf(UNdky4>Hh6xd6d*z<(E5w!6SJMr`xu%?Is1UH66Rmhfh9f4viPryZlDa>&QH3>E!Jm zxB8V%XoJu~m-CMo|NbEky`aCa*N7EbAPnZ7yr+v>M^x?E)Ut*kd4?D4Y0+7I zdL%5M_=_kBgf5frxKMzDYmW@+(&C3R&ml(Vb%LRdVt6B(GjD#L2>yz~I~2d@LefG) zJH&?zRP4_Bq!Z(HVbn4HG=IxjGGair(2MT5T-^gQnPKX!OuOLsuB9h$7UreKQE_b^ zN`mAgD*kTl?%95BIiI>f+^gg@Als(g-?(hfZKl2y-hyrp@$#s&xCWcq^v~fJ=#{eX zk9Nh^VfgDO{*7$xCcH~=u=f8L~ zQflZ8akht~!Vb^oumFgEQfN#Cs=@2~mhNeUCbO`>!KTd~W5-6ra@Z@>;7eerK*!>!b{0YcR zhu!J0I~{hvu5PBo?sVAw<2+0g4wKBlG~s~9AT>=m5Fj;8I1nQ1|6syl8tqP_-7k>t zPgwu+!IRw_8Q5b(`a|HUpb9-r_t^o@o{>pf#AvM zsfNk;I6c*X$1pwBFgYgjf8SKYG|4bcGE9>U-yLb2CK;wlhCc$u^dtj`7}G7a>6Y4; z+1megB!koG0p}(E(FkwQoEB@fF71+qfdQCYY;K>tL?_a@mNd#-pr z$7$^wonpa1;>gthacItBA6OUpVVgcIx(54VuCqkEm?+y1Eufk793!h-#9^7Jx!S@R zw*<>PEteugLRQl)<6DD`wWEg63T%g5w8F1H)A=d)(0zGZ|CsOZzMb6+SpUEwBkgTx^45Y1=CAv`Taj6ZvdN z&xB%OmMm2zlC76?htXt9f04hhm;U*^IMr_`G7qF_bDLm_`=JvfHGknLqU;O74-Pj2 zEH>`N+L%IEa;yLogc&(}AqGSqzlAF4xq*}@+Dmok;fLMCq^cCVS+#@9?4%b) z-gk$b1OKEf0a-$w)art~oGrHDT2kX=XDoJHgd^~%ADTv+h>G2$)Gw53d?U2?OD290 zj_OEV2Sv=znz#-f2wzr6*YnV^FWf{7lC)ilv){hB z%BaB?JM*WXLm}03to!||A>wN0GWd9FF5@E%-Z%+#WU8yXr&=+ z*2+gOa-o7;1^yAJ4?Z7&MG_h3s(K z&3ydsRuM?}loA2N*vhi*7f_q?=!RCcMb5Hgp~OrH1}19lb`7Hk$x%#OIq(g^`N?Q1iFCJU-u`=}JedbblJZN`^^md6%Gk~qL( zRkf1E?D@)en22j{XbKd4j#zPn=TU!2B9Sp8@}AvM*^Ohc#0rpy@ONTw4s!(b1Nme! zxjt`n{&PgIkii;3TwA-`4+XKuVpL>nvXpGM7V?t4>mp|A0^GiSh!1jKlk0)EATh(O zCI;)+_{(+X>pr?Q90oq~MB0M-As5u+s=q3Xs0B@$^aMgWqZGQ{H?*TfzE@cksRZei zYIyzetf%~sehnvF*NkQg)*CBJi8eM7OYAhVBaWnM##W4CbqDLuLCrNzY_5`*!V z!zP_aaWiCeL*cv-0cB&30Vqs|sdovaBM6ovwD?yKJ35NCQu;NdrEo%^$@ zob_dX;kY@A)-g;RB`O3=kSdMyZJP2UA}stTwUSdkcTt2wVDgMry)U2};AI6VgXEES z@PBd2);#jjP7WKj+W^AG)SmqJEeHCuFUs&~PXvs<-f&>=Bsf zm7Y8*93^m)nQnZ@aU$j84aSLXDCHWjKFjl+PhNC`I?BYi?1qi_j|mM{XNougb6H8<=!Tx zZopx@HYaQ1r)Y1eu8DHvuBqPGSY}03NK2rrbmeHiU%6WC(jhFG23WzTBoQvxp;$;= zh27N1SoPv|_@Bx_KguL+BDfpwaT5vFY>H~Ctqo^%{4BPYt;lhC6s&$hJ$Of_`Xc7j z`--!cFsz2?Tk%r4PHJ8&x5)V5_(NOwZK|kR%oQ1#b#4XC2APlJqj7VWo%2m?oo9Vc z^!l+of|Twpt>S57qqY18IAa3YvU#5^Pq-U#_*+UhU;n-aN2v0dU)k^dywU&9v0?IU z!s!YQOa)YK77!oX87ScpOsR~s>^`=&Rk78n4!WC8)bt-a6AJn2>Y`MmtFXn*UtY0u z-^BP>69N)XPp|t=wy~L#8o<^v|x;;Mh?It_D z>7PiCOT@rRkSs^<4>;-;2um^s2rXPD8nnWJOth5yCn1<6(~$0<2ko$^TbYL6iyWKo zoS~1j5nuj2{PO#GeXrxRZm%ELm29l`(!>1B6(9X>!5CND@(!`P%s6&;XHY?;JJ-S* z?jO2vCeBsS>STcK_fWp7ts$exFd;s6&IjdMeF#49D;>CQ(bNcEuRiB=Z8mo~80pI#Yq2cJ-v_pC#oFt3iop zFHatU(;E#wF)$54SAVg^W!>+*C3Oqx-=w%7*qJnceOCYRa58NVf{Yq3mc4&omb-#{ z%M~3W6zmEuFfqqBj+{hoj0pAy>*sp=_T1Jw1p|&O2<9mdtHcIETyIe0j&OfAga>zM zY+DK4sS})`THJqml7D@KT$xn)jBk=OMZWX*K8L}i+1PWB@Q!}9(cx3?GgF8^=0J|< z`dIjc+7ik<>iJshgyDz9k8Sc;qTE0fR(pYJx7Q>a>p@V)g{a7=!Nbu2oBsZ4qsU@e z@FmfMga9x6D77T9=zSSl7%K_5tzBx_ai8_XNTch>K?G}$Zu$d@WbLe8RNF04MbtTOW}r)aj$ye3vX$sN|4$8ns>OF~{2brdy!FFeX%DUz0>(Bpdb?UDbj^%9X8> z4L9Y+>MW6?nl&?P-@)&tJnR<0m zPJ2%kr2ZBXeM~p=DEl`!mu^^WPyE0hIH9n@LAbQRya@wwYiTt@lNlZ)LY`LUm~!&b}O}D;~&P zKPq+u(UNV1q|)upF#|&=cmrsQZpRtGX|xzpR2@9|<~-C}ROy!OdZGZ~B!yZOUA#zn zT90$YgjGe_V6WDR_iTzB(oE&Sp&d1-M>B&+($zw_q8C2ZYMRQXe9oL*8585C5P^Jq zR-B_ixU7d^k0mrgc2-|yNK{dlb5bOFeci*gP?0g)x<;^^r)?PEJ{$JTzylw5ijA~| zq|S(~9@+Col;Ga}?mE$Vn_r(%hny6Jv(S$Fop}-r>~hG7OUV%Ef$+&nB8?>Lpi+s_ zVVUOfcp1yA+!6fQ&}xQr2Dc_A_o|DGF-b()h1%yee-tuP1hWEmK!xg=h;qg zW(@Y~N%Q5xdq^cJkdT%dyZ|+yu!9Zx&BslQr;I5A^JcD8bkhsW^ZcK%_|G?7DrAZ5 zlkG&7ljoYjVV*VlO(;YP1)PbVl5YnGV_1-D%vi(LmW{&N=1l9$_?`Nu;?%57M1&EIHF^|Q z>Sv!|ze_DVV=_AVu`wAs>lvXMl62%|oMB^Hv6qEMNI(E%o z`*oqM2}S4RO65(OGzBA2L-8Fp*-5>_b|_RayFZ~!NiG|wiKYF(@ya-DnTrimzEb)P z%ueC@2@7ez=pOweGM)S}L7~(EQ1Q^Zpt$ntaYWmd&8}Kqu#fiFG^gRi?f+F5@ws66 z4Ao`tkYDns8Hg7kiPb9U*J!(mV^kB6AkXHk)t3DT0WDOI=RxS+N8D~u%z<%qMQn&p z#%T}foA+i(sj^JKs`VmKHB(aDt9m{v2|_ebbZM)I)%*k*CGHI~qKqb9@HpEW-+l6P z>f?}+#F*K$N8pU52hpB9<{4fs4>m~$DQ$iQZLWonYzH_t&xndLv6bJ5>#B}|wecMD ziAdYj71?HGOn8G|G;9ZyX(sh{iQ6DAjzbBVTB9nN^%`{>`(4OR~MG zI>#aKvty{c&U`E7z|)^7_lRD4_oM6SfkeM*uYSSQJXBCIjetoazp&|d(){Oy-k};K zLFEGhFv;yb^^ZcJ);}OJBoo%hP{EJXR)sXF2L)!@hn?qHZ8s1N+53pHW)5S0ffISu za`|vpavDvCe4hE}X_U`ACv^lZPnPZ5nN4I5OPljrVN|)0fJUPCg*V+rLGgKwZ0A^zEt0XqC&!FJ_$AfEm2IlG7?S__o=E1<#^Wei- za9AgadqZrRNke)s-p5YcqngRb4`|wiJQPAe&+C)#d8eB^k3}R4M8gfc^bZ3yu=~}J z>u54jiT!DONH^U^wYv#%`nLUjS$lbkywh3skMUJqxDkb8CKqrjwh-kfb@f82_{y-) zdR6~al?!xcJb1nLpZ@euhg~XU0kKeRNfkmMCALi|<49^32vPJi9n-ZGp~Vb*=l8I9 zA1DSbstdhq7Zf6kjrEE7WhZb^FN!mP)5EzsO9UUI?B#Je_{cWfU9?+0aagT8a_LOP z#baSXH$hNk5hsR-AP?fUO0=|Gx?RRc6+Gr}ebE9yHY;#;q+=H1(nvX>-9J7$)Wb#) zM$2i$Z|vpO&0)zwp!I~r~$FOw>=`TZ1n zRD*me2}{B^J=RTh9vHfH<nHG{tTk9mC2vL~%`=9Q z579bf9K;aTG|C|3GL-E0I31Q@J~>l_sKRN%$bow3HPiQXH$9axOsk z#bseQ9^{#Gt=Yt39W ze!k00DN-~K?!(cSpIaBizv)|#okD&qg7&y_Bm|>|+9Wa7Uut`x99&%ElV8|kuBX2X zDJA);9{D6sTM}A(9@2?SQFU-7FzrgJ6e>F@=w3>{Qjy$W5WT{vwN;zMTpSLAr0*SV zm`Jax<4GJRso$oPp|cWIXWoG8?l;JjS8Z+sDq6Hu=NvqRY(frXX;AJ`i;x?NGPBVKX_BS`bOKp|s}tUQG$Va#nX} z|MM!NiS?VnqE4_}`xpUM=^4>`-jnbNHz8OdP=bz0>DoGP=?E0RR}9?;RLk&N69sL1 zkR1a_zcG!nz>hG5h9}x|<%IRED^dw@+G!OP<*RJCsXZo5j4RV$azr1AGtP5vH+hHJ z%-d!T`=L%$C zEWhZ>IQ4D$@<81bUc=JajZ|Mso3^D}eR#3{8K650c^~OYmHad@$<;=^#1W1twW=TW z)i*o5Jr!0u@%`V>KOlLdME;F^Z&>TE`x*`8bWvCA{(D4$WT$e2kqVToB^F$yvp1EoW1BQ=(Jyw9@m0sPrF@L67w zrY@Zrl?X|AO+inz_}4)lvJpLsZM9%L zY$&d(MfifQrK+(cEoa_pXLf)1<(lH&?MyhD-{Py8a-vR`gvvzXc@kGsZ&fuW)WL~b)PH=0SdF?KEiQjc27txgvf@BQ^*dU#dxUgA8$xTTTKQN# zo`D|Jkli53D_&x+T%_dfJ!tv<^%K)#8OCyT3AlIEvX-eUHfQ35F_V5oD1A_r@w?cR zfYG|mjTcsvG@~llA}ZepG+MeUHbX+wwxT-4gpK{C0X!~v(W0w8Ilw|?33XikgK9KJ zME1T3;nyvyP>RNP*PBB~H7B~4*`R>s9~i~0ZpkbtF;`Btgh|hbH?@Ta#7l-OQOf(- z={Pvb+Ic6b4mlLsnrx{cZKybq&3SU~;AR7Vhjk_n<;Mhi=+>&g$IsXIQ>95u!x=R1 zx$JjB_RXnw2O%VoQ^4sIjvICo;-&1evHR~FG%pdJooZa+sbr<`?)juXq0a@HMA z|Dm8Zn?Ys;wP63jUhZ-i+ex{<=3#Fwm;Hk0DU(`Cz zUZkfLu#1=US0cIT*HOO;eH}&$NfXD#)TH^7{VaG){>@4PBbP(#vd?)!g*6n2k9Z2; zcZCLU+<-aF@hI-MoZ`6+4nx|p4k}uE$hl)ud3mU%~`iXw?uWmIil*+gEF7R?GisRxps(WDReNQHMokL2=s4ggV-%1c6 zqetV6JDuBEu7riH$YfY?{Z3eqDb2OK>XNiwQ}=|V^p?<5mHSv1*8ULRa4krR<`hZP ze1)BnA?_}y;q~Kmu4Lx0zE-NMRxXkS>lvLU(2z=CCQ*wWE~vB!b{HKreM%=>#>As) zY67oaoNV{V&3U7v@VtDfsci6sWF1l!^VfMx?HDyhJ=)H*1qIf<|~92^NFE|1KoN~cu3 zt)P0$(0}t-d%{XeN#8y;Zs+%0aKMr2vPF;G4aEAL4Gt1;t?{L5RPrgu@5&6a)H3vS z$^+z@F7)HHO-w9?!W18I0goNbRq~ED#I=~@?5s7?Z#=N-^1G8|IrS60mI%__v!3

{Vn9mvw>hK8O-FYJIIs9TV zn`76y{4L>6S_J5imE|;~!Z{j_>ld}pMl#6TIEz`ymmixR4OuFfxn|o(m}-pZFyp-Z zNY&5n z;w18o!~n%093)elJM2lNRC-NG~=1wge~G@nn8QgarAgj2wppFnSgN*l=4_uyd7 z1=A#`y#!D%w~L=^nst2~t)mP4Oqp`(VZtlZvR}xuo%OaT_u)b5RV>_t17F!Yxc(W_ zq_LSEJBKIFq!A##S&jpE`?D8D5qjPdQ={@3Zk^omn{D+#BXyUKk42K<9DClT3|3O{ zQUzKWJSe_R?$r_Ncz)Pe_V{L~bMb==cFbZEKT zhROX{8Heoph37*&bh~A#gCtLbbMYSh2;S3!DNvgyp~<*&N<>{=%CodF4qEp2w2^hv z9IEZnmL}R*OZAcBGK*>0r^u;-D50zA8V+7CG_k2{7LOB4QkvJu!w06V_0MwbpPVCR zKGk{=`gcv>HV^F8IEZ;Dg>$x;q1cHl?>7wpToTkCOnu2bCt{6*KRb@yvdv0LdZ6W> zkUaNw75=Dq6G7QYVsb#NV+HzIhuzx7-rCDwY@63l)Y5>NT2MY}o~GfZ_ZbU`TtI_x z#u%Me0~Uq$dy5!km$}L0)oT8_aF+er`8a2*JLiSE5=+g4UWXiz1TJVvlkjI#_Q2HK zm_E(9S0)=>qdSGi;7j-=`5|L?txP^D_~D30xBTj@f43lCrOeU3)S2LaHsM?~@1288 z=aRAI$Y4Rw*awPi1|-TmulH!2|7^+a7%(q7=uROkJ>sx5aE-DUTdURzb(76Z(dxMT zql+vLb|x2f zPEt!Sxz-A9F#NF(7A7_i3iYzB4SuCR&iB?{N$zn74q@G}RKZzgFgLGRweA!!~-701|)SBI#P@y^>7_0pAb@6Z}*=Fry4LnV&0`$C02=~Y(?OKKk@-0jjddygF zeV05q8y8!ggq!k`Q-{)d*0{J}x4xndO|mTxg`5yGG|=-yJM-Y^RPb1FPfp?B!LIa} zH3>r>-38Q;k-YJZ;am*%Q^6#`&>BtJ?cd|(AP@%uqNM=! zwuVi35h^ce_xO_~1h|kP43$W`=~$S`z@lZ5I(ZCKQr4=#Q7_SPIW|`};^xa)r&aL5 z1>Sg$o>&k4IajaOz|FJ%MU>*0nLw_(37LeczIQfZOD(4EsAa{m6@1FYj&suBon;2! z!wR2+a6tMmF2Fo=&jUar*9Cy3U}$kAF1ANo zNcZmihkZ%>l6=0KuL1g(D*4>X+}zqH&oj`|<2$!Xj(xW3YBMNeHI>ahy!&$FTD^7X zRF)$!YBE@XMw018z>MwhJ<$X?*BB|PxMG;LZ=mPyk7F7W0?irt5M zqg=#7bJ2pSi%tRQ*itA>k9(-JT$L7a!Yf+3L-z8Oj|@Akx*Q>T&vyxmnktoL1teOI zNCy8#_g!x}Zt_6@LtAP3afKT#MVcDqwwmz`MGs@#%sW-jNlQqN0t1c@HU!bm-8d{} zzOxsQm2O-{&!p{1OL_|3$)J#ueo;T)WJ^pfCQ+5s-mhCV4F{?(UC<6JBM!VD-sBx5 z2e!catvg!Wwb`5S)f0IEp^%GWoM^ME7;ZFs##}*e4ukqHV{3UaC3B#j?}w#Yyu-#$ z>|BFWn})8g>dpDRbvE()E2(zTQ!B+O64>%h85Q*S4nNf`aZH;kRn{H{K;zhZ6+g~PLTZAyd|7JvdfW=b>*WFNTri(WJ z`M>_}-}-fMvG0;yY4pFUn1j;bilotnSMz@v$GB&GAbvale|*O!J-p>;*8czIAHm@K zZ%AneJ_C;|%Afy{XbK}R6{>uui>#y9)9zB%zKM%%P4L)gtrCf=w{~%rn z8n(fK*MGah`2Q-fhNpl_uY+rMmj7Sw;y?U=&e_3Ar!F~++y7uG3GURR1%!KP`Vd_F zM>XpMcyho++5cZgnisT54*&Nl`d=?cP4IQ{R`bQV|Cb2M{;#5IU6km*c$LUBe-&N- zm>kY{DB2X@S$DfAGg$>!6#&sHc*0#C^BHQy3GQ!2%}eqjM4KpZmW6Sfjg=}T`dsF3 zVNLT7QGfp_l%WY@XbzwSRH!ZE`b}f}X@QtbWv_WwTQ1FdkYhbV=sb$I$0@b&W57Jfx}x}?+3$}+Ugu&w2^)Gv2<1z+yEMHre61*89-_`b~T zVp?rZZY+zs)+ow$8VXE9&C^NzK}a~4DL$kd_y%$h{7?fcOI{e2Itpgn$o^X0NBM%a=w)5(BMs!;@M;F zxQ(zRW5m#8;oI}ha?Q)7ps0v0Zn%0 zPMBu1XQyM3xceAg%`D$WJ;@*iSSf!zLp_nJGX0x#4(Nv}NL$Cf ze{5N8U1)R2DIBrAJtMa;x3C}|D>1R%`%%+Y1X-t_j@%?qkd;%o&7|G85%>ou!(mm-e0?$6JE{93p-{Y587SSFuSFr75tvT@%bQV?d(ke7kz zqL5=_+2u)Johs)(=_lA}*__3s4D)zek};D#tgu-j@^yc_fpYgp&Fm$(W!2juIH^#= zGAsBwE8pp8MU^!yW?J8iWKWs!ayV`>tcNg4tmkdGW!2I$a(;%Lhwn3HF8AGBPr0Za zUZmXQlzi(jEx53L_`}rK*qeG>F8F5!Ps=&}y^#rtMYM;!gh4=!5s~fp9vuqej};{Y z{o=J6;YvmzBXJT6u(l&GSh~MV0^}ORfaQ2uHuy&#TX(&2Ge~@Uy2?XtJ}Tbnp0MYo=gi3~ZtKrl zLY$^nyDgMf+p)Jjf3ksA{e|`;GRU8+npJ!r@xe6H73mLkT7zus#|6f(0OOB$G6yr) zXX==5m>F#f%ZwzU_BOE9ElkVKkp9ljWRrx z@x`P%nx!dqRw-rE-J}Q&&s{0b36Zrukm@?T6N@&)yR!yDGj2*3YkR-C9iYSq`%ldm znrmOqz!nObeDkETp z@nK=~G>8k7aKTN93}#HSK{j`~J%W%%=6+FrRirJps+&zHk`RzsHW{gF`?OXq^HaG( z`|#Lhbu5&JI!+Xt|LK0U>nwP2IMKJ~gPk+{1Zh&ik+g>2T6xukBv0W0tsS!2g;z~3 zLCOf*Jyh6IKknlq=}J8-U}Xllhzq^EqB%A-S^xsE@AHjgwrC8mWh1X);cZYU_CX z3+)i^?DYIEPvL=iTsezWL0X0ui&19=$t~nvZO8=dt#0arDM`j-Sa)bAqG5B-WMhVg zgNtfKl}NlGr3OCf>TG6E=3vwVH^LGSRDp6#kb#&P()7waM59bWGWS;Sq6_$ z&7xUY4lLr2Vh)U!?li2Xx`T50WicuE69`>O;{co#eM~5KqkX1muT*^d#wroaNmsA3!_()~Wbk{-t`QaTqSCfuh3?Xj$aC|o3C z9`g^tb*2F$^ba3_tpqqqeqcVC<%p0S_^f(ii%qEz7xg#{Zvu%)Les0VUIeyb)X?CFlTX{dpP~QS|;ET;_f+BH^)W! zV2H4e76KX+HY`VsPGVj)*~ehJ-($(+@xG^--qbZ}7;|NxL_wfCp8W)&kD;;)>&ihh zA5Z$tJ3kwiM6(XVRjY0&NslCDFD{fKvl@d{sz6Dh0h$g`K*z}?(z@NRd`2CLy_LmZ zL=zW=JtT)YlKT4~-jIIq6!;E*L)$6V-5uDt_XII`i;*AX1SNpCgW(gGgr!RNj$h4M zoPN<$le9j$(`Tu6KO3#k6sa(y^mym!(b%QYn8Ef9KR`b{qmgT~vbmhb>|u0P$^n;1 zG=W4cjQUOIG-U%Sx+YBA>yDS%2R}>vAJ>s+t_I7~;^_bs=Hz=q5n-f6Hae`N6)HEH ziqxrHQ_|!M-EQ{o*4fXwI(w?~?6<4u4H1CQJi1hOSyc8`lMoT|4>0Ve_{@c;(X&K3 zB$eC?|A}eA86s-PZ}%*!n-FGTAoD_@NQ?FL_dem1fq%NR|NC9u*T+?ha4C}G0smXD za_hcKfDiD6oPy%(pQo>X44V(WwCsT`TxaK$n`eFQ>+1X;suaN5`s22J!Iqt+8Xd5& z>okgZlK<6{e_YT3esn9|s5%(J#&1T72lgA0G71Rf$(NysPyE?Ic?ZM(dbbiYt?jo< zk8ruF_xo8^Bm(oV$B)emDKr|g^BeQjFyy~OvGu_+62>*1>f?b>1cBeXC~fIpqfIzr z5MTp|SJ*1k@D7iBAQ*vbRvV{vo|HET&#ohnmYn7FamqRFH>HehLOXad@mg=*0 zLSI40rka<8@Lr1ZL>XM-Y%;rOsijDJ9NAC7NuPjWx_Qt2-Mt)auBDBYn4BhaU6|P* zZFHDO#AZqziX?rd6!it`U+h&whrdCUPH#MTpJHT$%r2L_iL~ju(3e?t>xS2H<5r2P z)iLfSy$MzVOaAAq(|kRk#n>8kvu>|AZB5@$ak6>*#j7c~dX7;Gz9HvJ=nxFg1wxKUr!zNH`@dlVuSL z<%D)^XF+5N4!)dQyslBwg50+AHGMPTBw*eXE-^*v-JZTiqs6U>ZekrF&*4GSa5ajb<)tn_@M&FRyH?j=2k#~>ADf<;Yb#g_+LB8L>r7zHxGNF7~+1yX?F zzveek2(2IQ@QYQ^U)aV+7N2ZN)QZNP6ho6ei?Pt0PTL`0kpSq-N#X1$Y6(fqQ5jNY z)*a7~mC~NFqHRL5jG&}LcB9iP?mE+Og=vOWl@7HLObSYDC?d<(t9w*yoYmG|rR@kK z53`Z$FqNKKsY&VS&C)0W?sm>~H<^3c+QeADcB*9RG%x(rRBp+1{!J-FJ_Ly4WE71j^q%72U<+0l$f{gt&x&m+Mf@* z;8fR_;ndQa%NboKeu^{cO&P;1$<~?~nv&w!UP?(a*BZsLEsIZ?SZ!9SuAaB_gB+_) zMx@)9@+#B3wd9`Rt1CU@p|irQyzR!9z_%8)BdKrIPT zt=2u?V<#VNC_~D!14Zgrpvjc}dXS8V{8#Mk7yxUSat?7S%L;D3AsAm0aaVfrt5e)x zam+0Wj){U)x%msb%zQ?rECGt$fNpFA4wY+iqX^G;8 zeQFu8cXcL>yhxfB*w$-ad<-y=^jJ7xsdt!R{im!o^U=I189HmGkL)+& zafTIwqLhXMM@&0vpxAQ(p1v7#jp)0st5ZV2%%w)rtU?0eKOLuO01*gDZPCxw%cqQOjNzS?)% zv-Sho$c^6OR0={Bg);~Z%1ExdtgliCIwX9rT?5|e2~iu}?WoGARNHU2je8nX44+3L zOth-vwOBrH^nk8Q6T5V4IoNeob7V!|U1|g<1Akx0^A4_~S+ZAyBlf{HmQNye43|ICvTz(y}?Y zqTdTW)y^Ft%U$!=YCMVU`k&XKP0q7o+wGgZj=}YoM?Iu@E7qKtboZ9LoDG7{P* z?5<^nxi>rA_SeYtXpc;Vm?{v&pKcMT|7gCwWGB1fxZE!s;lfdE;%t@*-4PLe00I+Z zIm@l9fsGs03xGj_eYMEP7z8Ny&<}@<9?&EqU{)a_LLg;6?L`GG4BkcePqTzLRuY9I z7OPf;BYY1xU3+=-ur0kSIL^-1$sXHQx530NVN*Gz8x!pF>J_34;rC)(IH56`y1DQ~ zja#i(|FKpyjFYmiHCE_0BNLJEOtna|TMHd}uF0K*IC&0lJyr|q@E58|YO8@J@6vot zk}hR-B8wUw@1Ba;#yvrX4P~I6g8QQ^1F7Vvz%B5!^sMeo(W9a+A!%9dr*8a`myA9y z(C!L=A$Cs#?{k|6a8}@3^;Iqag)h^#{o_1cLYJ1ohb&kRy15`_hJzO)z-lIg*nF&8 zUW7xrTvC$%v5cDz&iYcbqnN2*Wm7_vYH>bq7REY2p7{R8a!i>3Ok5Ea zxOZiwJ2rYd27ofX%b6}=ajx?;qeif{1`^}sXtk;UyYn%V`>8RlVqV*)?vV@r%mg!v z--bUFN(}ye&_*wNHPK{^OnjlJ~XxP$JX zD^-fd?2N`jO8q!+__!IV?FJ@o;6!uETGyH&XSGRKsJMw^U{Dd4i+7^nB8;cA9Rp6! z$qL=_4ZcOo%2!eeqbSlwyTlydRX4I0_yFh#l$o{lDD--Kp)qQfSmGgxa}rXsqoOF#CIa>&(uiPuX#HL8>PMt5g5%xN=^ zw*|Vn=lBc$zab6=EI4(J;SI+PF74Qc3jzY!^tEbcyp?-vbOB`SvKQYzJNX+q#vXsNVX7B}U_k8$k1UTgEyq(-}zvW=`24PTI< z(;tQ9Qe!e((Wy|D+yh%z-4D(CsVBK)^l92TrldfVXD9n7(I;fd2(2crNT4n-Ci!=w z-Y^|5TCHl)XbGkBpPT(TA%x7qg7T6J$(7f9mF=i?e3@Zr)}d)hcltP~TpslbFN^cP zImyw;5Fn#ic>^Jpzu*U84cWV&{0l>1fm@jC-)M-B94vA2K?Lmmg=vP(aH>Jh6`H2H z%wh>19GyYT6o<^Wr7ns5Vw*_PCF+RzF(G^Zu1WNN`wT`>-`oHTf2B@=5a^oizs}JW zU@NuOiuLJBj>iuXOl6?uz|?mO26|QYgz)u$?_WidA6^+H8kX}}{u5jyl7P<{*`!X8 z;$NruFR27T?62hL|M4BLrhJ9wKLytxpu`iv%aPJK+cDS|aV9vxVSEEn*TrXlCB&R_ z#F=`kT~KsqA~ktgCNCp*kGdAue3428MB~8H%eH+{I{B;fR!!V*hhnJYD%;R8r=bzc zOKJAyt4oq!GL0=^*1wt?^rtw#oQjfBWLeBs!&_Tv@3kqfkh}g;!(J=DG?U-GVQyV1H)Oal z(9wIB3V%w5(icz7Ys*Mxl4U6)-KeHLlh+6z_i&A9gejEQLejghIWnUfr!mhN8cJiB z95#8~Y}TvoYwT=f{rYy!AO3mW9pCFJBA8U%2i;WoTzYo_;d ztJ_mKnc05X$?q$ZA?mupyWcW0N|S@@KaJ6VpLbGZ`OED6y+_2EAFl}H1m{vnKQ-Bc z=@0MCX28zU%9Cz8GZitC^^5C7XCfO~CG`=IqB@<)+AwXKDXu#zL0N`0Tvm2@$JfbC z^U0*LtL__j*PaH?qwxqS=PZrCS^ZIhXUECZ3oEuhPTVWV7))T zB7iYW=l?fTZ-D@;R#?UU0{HO={c5iYkK)2XDGURS48NgX^Rdt+<)PC(VI=>_>!}O3 z!?U_NdjA~IE$y8EwU zg9X)q?2Prz^tZkoLO_7u)KrT>rt6R%62n%@nN+U*VG;3D#5#z3t0(g+U1(qlDH0wm+9dt)g$MW_Ut+KNGFHsU-wov zieQdwUmERhzAdq9h|TiI zLk+-;vt(y4xNwK6*D(7b51SQ*_p!JiYg4eE=z-ie%IG6kQwD97Kjd)jiC;*~2XGTr zDN!L+;zD|w@^3snGs@5)oX>yg5zU-T1R{C$LP{Xfu;2r$xKg3mqfG@HBm~6(7{gex zaNJY@w1vT=>c5y(rf47LYb_{=!(o#$I$v*4kpVehKBoT|(JP}fioB7gI98(iO`s^> z08=!@8Wv(QpH3?b@I00T8Va-|uEm{0f8$`XChWrJfZw`-F(w=_ea9!RKpdw)s8=~&9goYwvT^(EC-^@qIlI}`M}Z#v6^apQ^#C$J|(-> z{0XXK4N-iM%rCVHHl0ar=oGa|hdk3zlImTpo<;47g;O#Lp2f1H*H5L^kx|tBW zW9mpzN|FmWUXu5GM2>tTjh{bS)VxhCzWX^f_pIq|PBjaKOb}V3-q&XXZPKr$3^oZD{TkKEPKJ>y?WJp3W-uxWbS1@S zC_kOTNXJ$ZVuqGMYHssw8`K00D03oobMn9(v{PnB&jQ?@i^IIyk-7JNnEo6LFG5m` zk4l~w!ofbWuqdODc$LJjW%U>yU?>Rl@^9>mZ5BH=sFunboTd&@Z*>jif?;`uFpzJ% zQAn?=yUax-Kc`riOD@QLz`rzr6(M_M%z74r>E3ce?EV3bxFim9pwd#qBqYJd^%4wv zFJsDBPq#w4V;(`~IQzyFQ#~zI@*@g$)$w-MxPD7WEpvS&c=;7EwfW1!QsZaX8eAbj z?@~q>ml&1i(uy#i7>Nx3GvW^Hi1`N=huL}D!{?LlgS1)G^iQ_Jyxug!QSl~^hi-Q;g!rkK)Wly|NU8%&1 zDaBdsCX*2Lo=8y6ce&?A)7dDP0qU_j?i0D zZ$!R-yqwq5ZQPEDU<9I#bu_E zk@eEZymxxe!|sWvf2so%KN-cXE)ITmaFfe$Ra|EVy8Oj~^w25v9IlTPN+tbK<sq)svS6ig{Z7?G|Dg+-?u>qPbeTFQLt$m6|}b{J0(+apiSX#r`4ZzLlPgf zjvdZndWlSn;E7iDo=)D{0?v)Lz7nFs!9qd>@(6!<^_16uxk+-Vi6Ct#l2#D5uQAa% zUP4fW#Zzl7m3dxCr#Z6I{JA#VTO&{1=ZHO0&L)L@Ww86lnWx#y%doAe9XFkKO4uyE z57dwkMdIh4jB1w?t(G0+p60cJUrMQERaH13u!N`5hv# zZjyeJW^Kb027iW=Q1X61k(`(@k<4iaU7~4apIILCHyq|-8uJK!R$jT)9^imJeaj&O?=AWIbG$1~5>OZYko2?a;l%OmS zNi2;oH51cGWt$+YmhA4waSTTd@(xdF3L2gDk50A;Rp}I|kB*#GF6;#As+xu#kDs5S*u) zC*ip$l^b7v^e&fpDnV_ACyCaA{hlt5Gin^O_xhX4{qkwjd_zf$0;|8zs$b{=_nO#QaW&uC?5k^o$YwW+fsJT zuJ6c!=7X;gt_=&So~78_rO%Oio6FG2sxyEUilcVGA^VdE~Z}{i5*EN$~NvrQZVOjzX?<-TU$DJ zU~}diIr&ZCXI{M6J@RO5oCR)+i?c_EJr&W9CDaatvP!|?-^ggMFVroXIH0U&AMTi0 z9h_vfz2!kbx8Z%#>+R{h{e7^$99x>q6U$A;PBMTyFx~gKtr@v+FwZEP@(_0Sdzl}v zW!}4WQtqJLOVoBkxeVdNNQB_P`;0RClPbh=czuoEowMY)mv8{OSnm6S+n*FF8HikDiPql&B^!%Um{i!yL~K zHR&X!h3lbD%Du3YEYt6{L`3<)&*e@lZ1?NKDoUTcDRm8gK$z4`CN^W35l$r%O#P_B zGe2jhc~f!qDNJ&K2d&;kiDP_y&yjmQ7Fp({d!bvifTwC`+C!|(H$8Ghx+c3+5f5*Y ztJb{%y%>B4hDBM4*Ddo(hGjd|t@gt9P6w)+o%C*hMdnBqqRa3RP-&(~U?ZkiMVwkIVSZou1H3KBov@72AX5IJM3&nGH2B(-s~4wx#OIEe!pbg~vQ51J{hF#0mE;5@`@|isDBN z1kdgCf8zT8fiS2Ljo}?3IntS;|M>PLGQ>5@IQ;PDqK5+;J|STdFfNt&_dB?k;u+Z6 zqr`GtnQd?T&)fDNu>ZFSS0*X!0JtSr6P}$`ruy%RzTXpI2kNhM5-#tjb?=qhHS`iX zsg0PTgSzAPWHH2G!~|ZI!UW-`6gqcGk_fk*RCNvK_V1CiHG0Sey_^`tJjUs5e;s{& zKlkGw*)1hL09O}u{`{mWQu;@bG85|+}H4)o?G6Fzmv_c12v=pY|NQ75 z09_jDTUbZ(rZleD|IFb3I>;piM4{2v$Kzt@|2{G}JWGZ}MRK9)N@0!sa;^CJn2Y(2 zk`NGfAU5RK6*_f!g+ktb7G{L;WKye+p!9RF0i{;aP$tz@FaI zQEi*p89OZlJ8cE6v3GarLGes(}=@sJeG;C&xiaJ6boXC zgC8#4D`0R-A%Y;|c?Tg94Sj~ff(8!?y|lt-E;uj*$|6>@vcJ4EGg`iE7Nhok(M+zn zl)Kv&>ug+EVhy#pEUcY2kg(g|`AO=$j)$??f_V~=1>1SRxzkc?#!_6!` zeuh8NAeRA`>(9`lq`h^mARqg*bolD5BCcAjEO0;Ay_6Utv{1J7DXC$TF`7y~5mnFw zb-cUK%C5lolbT;3wYvIKtb)9+$Y@%Qc&d4y5H{{KY(KfwoS!uN<_@JJT__x))?9G% zo+<@57uQPpEKB2*6YK6u+DSIkh<70QnD}Tb9RsZIa<<_gJ=#3lKT%n#gz(Ee$OVcg1SFG^oa=oF4+ z+rpj?aR5q9!y$cMy&aNLDzp@)wzw1XEf4a<>^4S>GQ*<{1}e9?>q<;nqDsY`=O)W= zbc*MJa)=xZS1;9A7{lnXm;<*F2_~^6y%)VyRi9!L{18MVuEc)F#R74Jzn`4~BU7yP zDLeK&-iuK+ko;(;oA{8wt-gwWmvRs4-y;@$WW;W}`u6j0f2U9J7TkN|dHp>kU zwWbu)XlLw4ptb-3R!FqQgdIaOW-sVkJYDnmU?pV#U&g&p-BCQ=O;cz&af364#~9E7jLj6Nd&Qp?OJoU%66?yjr8=H>>i z1z&hN;$Eh@S8-0F{vf{iQ3@hs~wne!`M~y+G8Ims^WD-5|m>41x=Ye9OA+fzLLDr3Gic3ve^AmCO_8vGt-Qm zsOy9+7f%Kq1+UEaJ;2S#yqG7q-E2nCa|4tp)K=g_gTVR=r^EMe7y4!t$QSJ9=#K2u z12*HJF?g)$PM{fvBbbjO*1ZZz>;c4aSHPC%QG9yC8zBX9eW#2vANp0zBU9`b%keJz89gCt-EE`a%4WV zT=cvJ@86*iPCaU4R;n6ouL|(;Zft8+rIbtCT+vqT7}f~aEZ|PYyzS4=Ck0Q6!zHq3 zJ#W*Tg=0fs=h`k>cD8)N^bL>(vg(Cua}=aE&F{atk?t^VkFh6f|4nZZ;Nu6iKWJlG zYw`4l=3cVX{Mv=!I3~+;c3F*2wHCn{PA(MM7?_;54)3S$)Ms zUE>}rocgdbF&4{SscPlcSH74=EkSj1NDl)sFiZZe-QSaH*7uEf3o7Akz3gKJYFZ^w z0^HJRuAXCeY?s^7dZzX+xCQfgbMeA&7AqlC`xdh*9tQ zuGtTVC{!-UF@+K!fmPwFvBv;tbUi3LYGQP;#n#3r2Vyqrhuhc4Z#ve3yT>ghI!)=W zjd~0z)@QFH7TARJQw2HKO}%csKk@xjbbL7ScYT=4MUXT^fp>{@{5RZ!0Gj%8_P8|Wd0lX^v6vC9#4YnIq9O@p(3R6?^ zC~Zd3Ax7ZZ>+XchIqX*AfFiHAtn>LEET~K-W->U*WoL5rh9J7Nq~M&T!m%BB|NO^_ z>%~mC{*TX;EC}Q%u#hrQA6lB}NRUXtNQ2-D*ZY2x2v^}I~vyX0bo4rqO!_b7w7u-C%sPe*@)QL7V^;Bq)U?ERJgQb z1{>eFHZdt%8NP>|f>=Qaq_Sq5X`%=FR@3FfO?op-1=&$3k|Y2sgfuTFcf@fQB~E%p zMgdvN^n4=-+$8P~xogOjBi%tJID#O`ZwxV0DX-6e zoiZnMA|}T0;P-KmyrG(emP=9z#fn!{hL=Fg)QzO#mUrv+?!!)NxP~%^LGCor$VbR z))ITL)34r^#=|%$gC-!`|E-h9={;;;D-*IvIT9BTrXDhT)5WYquTrvc#r*q{tfSDP zP#U3e6ge+d%bijgP0Z0Onz@&`l$MJ4jksT`%l|qRiw7+U`9f>La9AosC>_9?4AZ9N zsw=d7pX}6SOfkT5C;Le~UR+H%p zsP-A`QXT`;5$YLR0;yz_ogm8bq(YdoD&`J?Ym<@~oJ0&5TJquQ>B)l0LMc!s7c9CI z1=p^Qf%!F)0gLwNF~H>2X(l?A0}x%D&W`3PFFB_5a6>b+z$@wSIFA@G(eG~NkoxlC z;=q0e{=^uaJ9cOfGd;|C1T-`t@8h<|9oXM4q&cn#wu1Rc7BvI`N=ge;dN81n%l?rz1XBiug zKTiZs{SL2IZDkEjUv)hE96`hsNngk)qtQm9(PTi+LQJ!)gCsBvK|ROii;Q#e^E3)B zVn9HZIKDm4w1HmQb!0H$DJD4GV$JyC&lqsM8^KNBIItp&cehQDM!&u{^M72H6HttQ z5A&042^o`ulmp}Uesr+9lfVcs=#s0l$SRI4T{Bx_uGV!HC_9eGsw4Qm+7}BI;;yu# z*>9_I-C`%p>+tcciW`7TT(}~M(hDV~l*0y>+>3Mo1M;&*-tGXG`(_1X!z;iy`uPiK zx=ZwfmiC|2*=QbHroe#^oV+pUKb_VMiEA66PmIF4z&b7j=*Mp^UpId|_|wTuQ>D)} zPwR~$g15CM_fW01CxW;x?y^~%4_ zTgrTo9&?HKpnnK{;V!!;-CyEr3c8dPwBAu(O-p=<-eo>uF0WWvf0VV$Gg#Qy7r_>X zPLmx_SkK=Mp6mm{_^}^>TL@g)Z2N_i9*l-Jhu)8#J6vYBU^;Zr;d=rUTdNUic3zpH z=Cd5TmT{~ji05iCB8~?YXuC)TtqC25QJ9_Ho%)fUbSZI6Rjeo+4VX(PM%K|3jM`yF z;z`=`sxgz^dOP&8lx!%NOOI_oiqedbpcOD%?}JQ0u>2xhglrAJ{p6*@^3r*h0ZzZ{ zZ%a|gFAQ@h=n*>~?aMF)~aTH556jeX{Vso()yScNWhljl-vum1ViZ8ZG{1K`kJ zQ8-qyGCaZciQC9|gu@Kf&>hF`Yicn*Pxi#f=d=Ap2Lm?pBy5GmlS`IWB`ydA8~%nV zQu=T>sfLD54I{$PB$^=_@B>ZABT@zfa_WL9iO<0|F1@ugNL^l!O#*}*Rv9aby#ZQi zc~Ee>zt0RAql#9YoH(1YUaII%qO+ZE>uq6Ol_dY;6 zCeCS!Iz-hTMtDih{Q>-qk~Nc=^ZfT0$`UnPCR)n9_&h!Qeg!**$H3TUYmNm7H;wNn zA<-ByWkCGEZPp>YxUs?H=g^}?i>lm7acym}4t`8YWwb7DeG$x_(o~8>~Djws|JCe0szJNQ;_ zBAR*?lm+TRRFloQjj>7KcIL4o@gNEM)m1l6?1#$_HW{SeL|FS^CK+V6-kt<2ldLm* zEe-OD@$2`hGtoUo1GQx?VIS@sU1#Fn21jsAl}D$}ksJ6FJZv4JbObA>q(iQfpDO3u zlmw4EOY&{CPOfNhGWre|dm_0DF>ko}wK163X|8xM5!Y{rxc!hGTOrmAMVQaCzIUs? zz1nuk9sIwY^cU%{X$hpQ+~xTf7VWMx8(jGs*1)RCFkZdvwexhSUiGiTLQfU?m*`?Y zKV{CAOP5@+~KjTAXsUW&_AMnuj|jT17*{Ndq{%ypdcW>)b||PFEDy z18JDH^N5FC@ye{lwt!CiZ|%tw>__Svj16h`%1SHI2rkP5e#XD-8m-FLo&1>( z7RcB7V%pZ+){o&&a#pBP0(ZNp| z{%<)G9LkkCf5-$4n|-Y}iYsrl K3;}A?#Dw<`TkFq39gboj+rj_nh3Q2P}DlKWH zF73N5&f!C8uEeHZCjRxq%K+&h3W+UubYT`XguA1#kPa};Cc=gBBaRv=NemRocsLMe zNNWYJ^sm3A&`Sb8j(epnEqT{Jc(rIKJ+ZARnhl2S(td9NW?&hIQ7zn@f3Eci$>6<8 zpnf`}AT}OJwgXO9z%C+AJOf#czCl0OgU7pUDf@^bcA@3-F7_*;my>EeH zIHc;LY_tzpARV9N_G9&J%VXupETI)mX&KrbD)1)tuu;BaRhvN_# zzoS?u4_8_Y9pf_uX6b35K*}wsXNL^Pq5jdj{x79juj@0#Ix6Q6W%)~7`5!{`=U-zV z?mpthX8e0e|KoEL7=UVOM)e2b+^16iA8-8i)NfD}5Xwiaxro>Uu2v5Q*w*{Wl!?JO zDS07!IddzWEIxK-@MZQx{QV>FO*5CbgZbJw28%47Hf5seq9EnobHCUpTSs8nH#_dT2Pb2J7u{IP~v+xf=1zW{O&-O zEz~XuC3ZN9xX(Ksz{|W` zZFT;BIFR3psACQJrHSp)M)|f=thh#;%66kw4QI5#qP&c3%S#u5-8zn7^R_JLL_u?Y<}qdk+>r+Y?V z!lys(P+>tchsuuZ2uoRHuYKkrc^8gewxUoltg5Wa)nu4%&sj$xT?T7`-?0Na__(Bc zSl>$U75zn`>*e<>wE9fTYGBe(`@6<$ZP_@m3e#1 zNQQm>D7;9%ZE)~#5AU4s+qzW3#7{kSr3kzb zL}0z*q08KgiG4Ff{ceOFT&W^#**XR{f!f<6rKa}?qpHMU=7%D#>S<<@tf~4=+Mr^o zJ_u8WSE&58K^Zvpy_Wbx8%;4hu?e9qc3x+}%0Ew)cfVxJi!{}KEh0e9WL*sc<$JGd1o;j3SdYLrb9t=(s0&Di14Ep} zm89_)SF4V6Om3v!#`lxXY?cf35QE(#s<^-D!@*d55=tvS|CLpTjg{F+iSr(a`F;`{ zli86#rKNL*Ej&}paeSR!gnUZ~z`XBy_=S!kx%m%_BD+FRHOT$%m@6W}06 z?Xqd#1uH0FQm7J#hAw+&XJ>n-xXxh{0E({0Tz%&v4AKv;A6MIYHXfA_RBBJtGxhbI zs*gs0uCHx@I$wqUtLY@B7 z9U30o>M9Ww^XaaZeN8tqxx4G1X^=#raDSam+joQvmHJuP+pBC~&q#ewm$=@lL=AOe z{Au}lR|eK!61^@|UF0E@_Tn$d$G%OQY+$4^aVQ!f)*~`|($_ZG!qp>d^$65sCde-a z$M@_yqrlf=Cbvd(nm@)+mdi$TnuFOK+)r zo;H!`;m*jVa`6{-1&&cB5BDKaP>xy}+7g=FS?Uzw7( zwlOs8#NB*=Y)3d3ZWjQDJWgL8>H02QXq}U<3#V#XD&_F%dhMQlQI!YR*l6h~;46$t zT?=3#9~1_p%bwHWENoi$MK>5ixi>TiPj3deZAIO0kNJ?FQ5RH5|BeX?g|`2u3@sH3 zq1l}DLWTjDVGl{osWb|MCv`p2s(d_{%%m8(i=!;FcyM{grHfMr=np-Pq@vA7!}}+W z3f~(9SblIxgVJT2(zptJZW_5b@Rwz^aZ{s28@}H-NTF}ncy!d9wXn@iePrYtcz@F^k4{W!%A38tUZzZlsmyQhNdxF{eiqlhvNQlYR< zs3oDrdLtY03-f#G#u~>;g$xB}klJ|-BhEL0R6|s+1=Bb>cBZfJ4mQqA^2`Iv^bgae zdwUwgm0`l2kstLKfnhQi5g7&i!DRM9$05)2WUC|tTztf%sa2qwpI9*^@COTX3J zR2)7voI`FocQ_-UCG8!1vx~j8$hOEwdl3)%N(SM!*ZmHw9#5OLeqOp=HSG5I2oayA z(9|u%A@}Yrlv}yTVMp@0Q^b_rPLSjeLk-7iYp4M0vf5^Ax+-MnH!Y(a$fk(n#;s0wy4MyBJabb82S)(N6Vf-@ zw4MBK^m~8wt7Q7(NB~5pO0be>s|g7F_5hMjWN19!V1DX%LEnI%MP{!G?2=5p)mz2b z;3rkleG52{CIVb3hYCH@>&zVLtO=mgz+0Wqm*p7o#`CmkIJ?R@hG|vL^A1IgM|&y> z)3`sKITdXnjSbQ0%QKM@#fq$pt)_7ae!Wf1O}N-G3|}erfkmiu$jcd9=qll5TEJz* zG0Tp_`Wf)@U;+>RJlUn|H?W6eKDqS=*XeF)sEZW`%|upai81SM6qp-Gcaom?Ld5Aw zN~Us^(yy6mur=ez?Rw|Eo6c^xn;0w|GbDQ^+EnGVt`?u1C-418?}H|YI5~t`l(FM6 z81}t|>wBz^h}wrTwq{oGO@!~MaB{3^3SuU3!8@3ru?SGbhmg_;IZ}LlC<{x%@buPp zyi736{N;$E&x^6t>V&bBUY4I?k0dK=>pSFU5y$9+*4u_KfEOkw{ZKj7OC&28^XITos{wFhYMU!(18qgA<^aWRrIBoB z_ymirgT4Hf7o*d(9`re$@*$rxPuVV|1jk8fg&2#NHBt`btNWkF#l5Z049bfen=XHn zlCUj$&%GaN9lr9uQpxII86XF^3W$H^V}6p6uv=o!t@|0sY`5&62QOfUXmJQN{oEli1b}>KZkiwm3u3K)o6dniGCD9{hkdWNvGA$!&T?&vP*&!! z^_PAI0f{5VKm+E0)Nng=^PW?MyxUlLYSS~O&dZJ)d7!h6(YsFFMeCRSBlhbg7zQZ} z;)<(Z^~n=fp~7>r;Bttt_dYHEoG`T1PepULeaEzjenjzvlai3v*tL%dp6(mf4k1&T zSO2-33)LdCO6#%%INn*x{JgzfqgU=Vyg@OlZd^vJa_jw^6Npm6^=kq|G@v{9zBJ#a z!T=Jrk+iDlm+}0S1a0@*DVadP^ax0VfuZS8_Othbu7WWkI*F7mDJmB`9xcZGI@_O? zHNMj{)B8Sghx(tvO`vGo))TpBD4W2;WA?-REa6~OcLB?l$NFC=C~wD@r*BZ)uFakC zC}X~IR7ly6B?muOftRpnd~R%>-R715 zEbcsMFK4`2O)ab2^Yzzkg8i^Uv50{PR*}{Jggm=!M!ruswU**Ghx)=3EsE(*un4kA zyqA#Jv3as;Ou|O(L@&{ekw{*F0m#@3$sZSRUjwq)R~c|N5o@-y(qN|Vg*dc6$=GT7 zLS)DK10FUsR#xS#(>B$k<+{3Jdo-GG%dIAOaUki%zqQxIEbuOjkus>X{$hw0`F!g? zMrGKBJsSHfXDiU%LQeGtC%1)!Lq&YlyVm@~~w0gO;Mod)%CoSsE|+L*)#g-{YwjIUGc zzVFJ%mtX(DMH|yHC9;9i02xadVe>0Z;?>|lD6}@sHlb^(U@D=et~7J;D@EaH2lj8r z3H;elOEQu$&Ph2ah^xE(6_6Mdn#eaS-~fsH8kp|AO^?-Y=_b#T8txP%E8J`-zP*f& zQz$UvKx;QSMI)yERlExI;DulvdtACBH}wY>g+JtsuI`g=Y8{$wCj%{w1?$n^nM)Nt zbJ2XNH_f(x`~QoT-u+e4o$Y9{E&Cdxftn+iKbGLjownc#1(NEGjEO2Wn*X)S;)u zvo_W>M+?rt_hXIDZxEC)7lDK-FYpP7`QYy*ZOB?G?(#DmAtXX|#~UvI1qM`G1jc#xUK9_SRBY5C1R429HqFMwNQ1_u zA)UiY#~HF>NX_zxbD~HaZebe~48SFc_LrzwSp(60jc&#XHE@M%Z{?qWE{=;BtnFQ6 zcV(WnV1sds&h3Wnd%pNCWJ9J@&73C700dUp_FfqwG_YO`^j@y++`|{97eBA5i zy_mtl47OttzZb#yldm1_ptLp(WXi99Bt6fG-w?lA$9U}@f{o$I$v;VK{+Ua7BQ>B$ zas^YAdBP1N`ipJ-D=PuWg5XFJmSOW)uSFCT`O&6zT`u{CNw%0u(5$wN*PLvJT>|rx zu`Yo<$$RRi*Ks7jzxCp=1}O_8smp+cCY~ES;>4x_mOn*koBv~vKRPLY@E}_pZzP)36uKVXf7eLPbc|9DNa2T`cet!gj zyYD>G&ppN@*oUALKxTY`dTvP1%|5T}vhzv#ygskZKGG}`2gg!#;RaxH1#1^cXM}j%^RBo5v4BI+eO#lfJ1VU4QE66VE;GnRpUF5NFhIQS(5l>uW}uJ2 z8{@q04kurqls>XC>gRo)+PB_6hYcExeGnFqV;=?_o6Jf4H&oUZ66&p+yCgta@V{ZU zP#}n(pee6J0}ryjtgsR!;5uOR_$~oW&?1wk*0&qXf!je;s;H-8RbVKGIpt>6BL7F z2L`dvk>`(9vd=*%uGDu>A^PKSJ?7Fp;{tkarB>kaw$n(UPaq_#ly;WV{Y>aCb($vo zS^1_5qgaFiKFDx+1}z($7SQWOV+>=%NIxzqAvdy}wPh(uAO_Wb65-XdE{k^HE26D~ zaD^H)Zik~B?#|}fTH1v)3hY5R_M8jAr@EFp>}`;e3>hvPx#yzN-PZsL0~A8EIG+q^ z-2KC+3-aTcCxVaTo^mfV{ zkO7sn^vovT)dc_X&K{>MzAwT#X;aft_r|rAX2#wkzgv;d#rz|wlFgFK4#^KJ*5V%Z z$cmbp^2b{n;1DzWFh3;QF`N>HnUg}SNf))rzXL_nJ=QeUwH>7jwNCg2LwDq!;WW`7 zQmTGgIr%Ul>tCI}y)ipa;|Iqbw!h6ow3weHV+8Y)7X--Q+5Y&WpEaxrR1jl@FDJ>U zV@B97L`v0n3fnGJl9&?dfq-zsY`lL|V*96aZlg)#F11h#N|;=w>kwh+m{T3b`?a*@Px-cT z@{Da6JQ4Qu-(>Glg0SYM72Q5}Zv1JUos(oIOQETp>e}go0m7kHj4?zxu39oPqCAGL zAuMmjEZR7um<1E%)bk zba72w`pGJAiuooQ`FV#VjOqh{GovfZ8tpNJhQvx-oF@z5O%fY)=ZCLc)<>T6ag(&m z8#zJ_<>nSgTs!{CV~)a=-5np%QT_(UFZV+*o}Hq!l^*R8EsPXu z9THG}=|EhN%=`=klLdX_d#+g##Kw&FL!c>qA1ZnR1vxUB1N?tyeKEFNUAuQ#<9j41`9XDy5On5Mj{fx0U5&WNlyT7Oc;i;5#43 zj5Pw^;JE!|`HU6`#Jq#jT={lftT+toIgA7%>=0*{TbnNtvyo-gCb-=mjI_2TZB?OC z7jE20`R$Nt^kyQ&+?lqcfe+a;>Qz?%CvC;07P@+Eyy3;G-(vXPAPbm`;niC*?snFw{ODJs^hatqN9$I6I3aenng-WmdlF3(#Q6{!`4vY zhxzZdmCjrFM=oZ^fco!IhZ)+-@I+#Z?PeWT%oMn9w%KjPaqfSk3(zwh`2NcCRC>MD zVOm1d<91jTiZ-mR9N~XXShw?IiJlsMG<0JyS5QZVoKxprw6~0cGs&l=9Deeu zCQ<5*3FqRO4aZCc3q}sj%vpn%;q%Td4KE8Z^%UxEpJsi3x$B6SsAUP_l?kK|T)QTR zNRJj_iz=UnqQ!xi^ne0k7yUQG?1sdlrJH+vc*$u!dm{iJ0N@U z4o@MnX-+oYz_}yEBB47U`QjD zEGYm>2kh+`{T-#$a&yRl3TT8c&`MWQ*4~=8Q4wT%B7pQ{4lq{O_oyOJ(8Ee6r1`!% z(EKJ4UZT60aD#vo01M?GYA)@{4rJy0WmVp(|CpCfV(-90C(iZsmsJm(@Rve7u$oC@ zDh4f44|)a62b!^GPs+BAlG}+IRmNRr=q2?-X*VuQ@g;aRw`gQqY-7!AEV0%j zgohV&y2`80V8~&L)wAL(P{w;>=jq!5O)zh@0Zi@L5%9Kal9*>s3GfAV-!YI~D!68s z;-w3gcfOL*atL_w`U5;z^n>2lQ*AVbw38Ivjt{xwyLj3se)`psT$#JRVaPqECEQF= zg_~!w;y)#qWV;I3kJ7pgLpA70wExIkAh!NO+_;@UV>w?aA=b3V4BOqqAyGF__-5F? zhqZaX!Ou>0gFmm;UrX8`I}h#4l4=c51(`>ldquM-*o7v z5whOb_@14s*TkO-U4Wn%-a(QpjwJw)od9O0V~q8sj#-cOj7rOW;l%ygQ!5SW{OU*# zwWYQ{%SM!ztIO%rW)I_IJT*N>l59@N+U%v2L5W5Gf+35If?AgsZ z*dRwBRaHe^j;zs?%53vd4k>06aY37D=HzQ@X6eU2j9)O5IiSO7k5@m?AX3xDXp1SjO;$=4={ehf34TiS_8DgbB zJ1p`S{+VFoe*By7MEFz(?z7Vm^IfFiGsCbB2MTE*V1{cyV1RIPV5p~!&@+dEC`2ZB zoMEk7eS@vM4`{`dX=>5Gso{iP=8Jq3G+7Lvi1?}f-NtA_VoS zrmBpPV(l$>cmJ|z3Ze=^`vwA%fOhEI?Y7PBjn@xEU7rWxt~i43kFgRPmz|d}9Ac*I zQA`Ax;@O1?%D}4EVDpRC%pEy_uWbhKpfb1qPbwRB4Mg`;iJSXhyiq+JoaTb+U9b?I7}T+Dyb$z2RV+L3{nf-XY530K*|fq zz8H?6dZ^f=?Y+{&d-T_i8Bnw&N1t$~CEF5m;N?|`;S)XvpZb(>NoD>9C@Ho~npBY7 zUO4T8^AHZLVljLR6}(J>ad`AxDesM;LIfbg8RHKg8_AT+0RXgG=dO!ugYLyFsd(ZZ5o^Ia&uPgghh`@z)n$$Im`lJr~<|rcH(4GvPTuL z9$nxt)GGMaRGueA)7%Hv^WF@9UIQkrTmoi6#Fm|e8Nvuw7}tv`TIFD05G%r>=q^yF zZ*Lg<7EY)}YiEU~a`@q`D9&S+7mAoXFGD$xFgXehx}OR=>EE(hdEvDkRS7bbCw+;d z{7a_$zOn6bhLHpAPs(u9ltA#$ShSAF7eSlPVdoS!2y#9C<7<}CFc;um^K?4SIq45J1f&6DXkh_!5Hr@8QZMll@T!N~3`IsNqXu>d=2%Bm)A8O^WQ4dw}fIQb4#&YyZnoWfuzL+5R1XH+z7f@cJ*RN zbgk@^0#^YjPsXv{MCp!w32y^wL@2qr1v9Xo3Ejs;prM?TAK;8e>NG$>BFJD>!rOne zlYMz8{?VX%WRZ@+1t*Z#YSRTo=REPjpbq#DqYDyNp^%~{hk|*CpeDosQENsNyxZU1 z?oc4g{Ia{&$p+i&vx{mJMbIS9Vi>AZVpWjMmqm7P%W*#ikSK+=>y_v)2DdX<#MlGl z1dsqu<;p1H+AXmWdoXzTF@nVX*hFxG+xbWUG@Ae%XY_A-0DL%*>TIv4hh_iMf#rsj z9dzy1O#wFNwZ3ItpQm%y)C*nE<9@HVR4Q>N?pwfXch7@sA+S_tzez!Cboo9G&#DjP z9Pt>8)Y}n=020HuPyp_!0R;<4e*Y_Rs7V57?6rZ%8vRFu`34J;`1F3p1o)h64xzqAvM3T43qHT+=)TXo$t|ENnD)$TnIY ztwUC1UG4WJ?G&I&CT@}!t{XT4_Ad6(=3_2+Ulu5mpErfs*Nde3YS(f^=Qh6p-|MI0 z_ltpf`m61>WFT-gWTwp5@DI}2+CHL`$VU|f@Bm}H5gkn;z+PJ zMa#(%ivvTFeWvY|^ySW$!))km0BCT`iG92%ws6&EXn8Q)#Ma)vW1f080(wNI^!0xO zj*FS8$6ff_RUZF>2mNxkB-7Z7QAb~e%r4K!aMNGuzncOhck8^rorh6k?X1%!^hy~7Q}uY7#k!#RKbf1)lZvtPjMRsVzxfee&FED z$0^+fvfS5P-!-kyl8#OqpSsmA6JL~9?Od8Plv6em6Zl$)8X zH9C-Zhqqngi6*A4iT}VjDt9Nm4?-0&U zhlQ;jE({vlab2>XRqi`2>kis0GMJUMoHakPj=v$2^#FD@B!9ad@Id@S$ zu8CE-St%)BL++VKLgv<=iRFZl@S|(<(-S@=f@!GK3$;#Ll}S?Jhqgl3q^|c5Z<&Xs zQvp~}-CUi_NK~ zO0FTqZO5?pjFuWU#LNpQRnD3DFojFU@K^9Rxw^b;lY@*;Ra={bG54xgW7+pY(2~H) zhWA7Khb9u-_H33^qQMa#gy&kdPAHKwG<@1VK0em_ftO?@J-IPYd$YP}9z?yk?DB|! z@Yu9${?p#+4;(M|nuW3aBO)T=lgoAf62ABSPX7nBtO_g*&$@apwFM-{;*Cin{o-lV z(o*lu$?D1MrFmkWMf&jb4ggX+f2d?>enb$g*oYGAgz%8<7y638ZqA~uz40rgIK)G9 z5AVPd>vcZ-}(Q04GMSs|GWlmT|d1BQ~X11En(u^A_NG?p%NeZGmJ1GEm8l99lNER*#n&4-C2-u=y^SP`ZR*TNbk^I@1A2O?B&@i=7>+tFnpbwo|20}bXm5@A*(`UDKb0FAgHLE zX`};N4wzw>mVaDkHq=WQ>qmS0n-wgConTB^MSQkzi}LC7^m2MKV$kPbwa$ZksJOEI zBLtw~LRKdbnYJII4Y$eY^=jCdD6_|qvM;0UCQk6P(iH^S=ga3}e>_F@Gz7b)8iu4h z4WvMLkMqGLwpnugwq;bm%AuaT*y}y>f{^}>WNxtv1qQ1W3N4IBGS5&<2|kXk8Jcfu zG)mD8g5Ot5)03Jk{-2J64iXIaDnclE#)*rS0ka|QI^W3bWk{|mTKXmaKGCFO1olI& zOch2otnK3_=1tIs5+uO&Wnj}->cE3=l^rtTjMH>t)s&tFXJ@el8 zpAXAul+iJj6mZt6$BUN8|Pd-rm}9+M>} zrEBJ|!r(joHxwu>hjKi?1RML#B6;hXYmv5k2&p}I=B+l&X$uhy>fU)Ve0*i64veby zU)=|_%zmd0S!-H%d%`M>1RGmWl0Z8||I3UJG;kC%K1oAw=#`I6ivV8lZ))zF{wFne zf%mr;;d2tjfw9zVUNGkvVR+};f&K0HHFC7Jm{FI{zd$@u+C_NOjSI0!vR;Tfkm8F~EK8DFzlt8hr9Gpv_F{S@7ce9kITikB9EVWJk%Kcsg;m zmy3h@hi~2kGBCiF&|d!m#4M__08%B{LFH(_r?1oP& zOgt{yc;9J4b$h(szOx{}P5%hH$tOL|K~2=%fdkZki$UCHY1v1BijTkH=U4|b{zQ4F zKZe(0b4Sw%e|{#@I@dI_{GBy~NbvIy2E_kxDjenFk_W-QT*PLt{y%*R(@7_drTp;I z;H4(?wz28#W&6cA*rG;>2MJ-{qc(cj11*(Px&0L?4k?a~eMhmkJRWYvMH}}-L7oN2 zq#hXvjGE)cP{HuYLFyOqL%R+9@&C6}(+L#RqhNe?N8~!#0e>&9EN^AFrRso9DblNr zo4-U_lLM>p+^wM*VOIUOXW@Y&ESB7aR&|dGY{he>*$VZ#F-vTTg@v$(k8$)kQgXhZq6lTk+`viaZJ+2Tn+3WB=l=#62o7>jWkTIYL=}`nc z1(EC`Yz9Jrkd-<@)5RRg+EC!Z7`4Ny2EcU$2@#RB!^B|b6OW}>6ho^-y46CfWIzRF zAQ6$ZNsH1_s$^6SE8n>LUv-~OvmdT>P9{Js>^OX;JY6q;{y3a~JExSMu6oWbepoo% z??~C@g2t^hOf4^s*<=~rwbVq_Ab*O2H1c+5xGz=5N*xRMkv)R#xE2UsR5r4Nj~HX< zc_xqMbk6W9*%I7o#H@ztMfOR3{MiKe@94NoJ$tp8c}e-`rzaB$yQGp_#N3`b-@|IUDn;9@TBIzUv1gL` z?u=qxTM)okPKx0c47+JzT@wM3(Y@TGk%(kYx*dej?K{~QyA=HM?$+16J3seJ?>~6w zILaa~t1|gS8pBi|!l6u z`qE&Eg3SHhR#SB0Nt%xey!6ra1e3+KCGbePnV;$A>kc%q-%83TIdld~SR%J)u&>jd zv;yu)+)Q*cR!R&phHs^$x#|#cD8OkzWSjCj5<1>ya0;5O_oYhpjf*LKzZ{-u=u46M zf{~{I(K=;#;|43_x*USuQBF_-%h~%nkD_cj_EXjeCIuz+%l%P3FbacVwq=3Ka{i?w zW(~v!=FumL?Cg%EIxR#B!tPX z3wSAP~n$2Y<4xONmB|*G`&I<+K&K_FSLnP~s!=Ub(*W)f?Geq%$Yb zed)LQUN$K=y>hKF5BQO~`=&h|aFTx?B?NTig?&|AOgZ|MHj|&+^t=vC7NoTmzoK{x z)qR1VN{UF~*vdm#*!Qy)H)Bye?ytQ!*lMiCwi>gsZQG5L9ouGO+vvC3bKZN;x!=z`JA3WLx#k*k%rVin zDXvJ|@2j5&)%Q}F5`xq;4&KZ!1t5z1mb=*KiL8+p++W_dRdUCQ`U@RMPGz3wu%N;V zEEDmh0NnBffjq93s%Q!JmFTkUSxG`(&A&fZI&-o&PMNLgH8EE-5CB|gJz2hk&&Rpe zKs(?K=dVyhD2NSToD&xSP8J0$xzWrnuI|i<7InT{<8~C(Vf;^{fdl zj@rY9*P^bh!IhjpFLTWSkI<~Pj!#f7MJrvY9uF$phH2vSHqV%3{4WGSP2C_THn?Tg zR8GLKlQkIZw*xI#k{1`=o~?ZK2-oc@`f+yJdW0tEu~{Kwb}rGH1*;JDamnK&8K6%##c5Dxbf1NV3JjG?sYorq!LOT8>4{vv4bCOtrdf>J( zJ`<>l{r3@f>7ZHSPe9WU+v9>t&6mcBb2(2(Wh{ENFs>m_!bV@Dga#I^jjRyoz8?R! z1XNHY0xB9hB%G}6Jx6vwMLs`2+l@v3WQ^&_;I#YrqmFnp+C3Wo`=#dhiye>>R$VVq zc`62KPkl2KL!==483pF!!S441)9)K5Dv|j$8gkE(&B?>)Dhbd(_8j>Wmw6&u6&$T~ zX5=%l;@sn(;&8k4JIAJTE(8qYbGvQziP?wGU4Etux|ZVI@8@PdN@<*lDm{GL<%UP}DZHLsz-fBz9r%$SMATZFA06`HV>DEN4r`u%3czDL>g z>w0(S9#M&t~r9lgow z&MaA`6+6RlkH6gk{^Pm;=r_a7F7s%HzoY-xD*Wl{pQ%9nv@Y}D*lJV;{NukqA@*OB z1YsCodmOJD`2Rl}BLN!weft6BxPLeFzu#?%0ntuvxlA*TjO#(5RD6MDTL5Y?qLDAa z)+<59>61Y(1GO0Ew{%gFjhEEJrg3e2wt;fzdJk&MoBqQ@EI0EHVDU~hM|u;M73 zR8;blTr*;|MPBn9B)oMI0G{}^vF`tH0pfa^&`L?OGK|o>v17@d$ju@a`g)1fjbK$a zh<1)CVq#7OMSf(Vj+=;CKsZ-zDdHuH@$B1;af?LRg+(BAZM5GtS&K2vGq#24>SA(* zIGe{G^asWmm42|^v-_t7Oui|$XbeX8v8lZCYhR3HO*&BniHa#`!fo0T&h`GYBMn(b zCU&)DKjE}FwQ-^os(hPhK2U0!3oxQOG=sWQ`W#%~H`~o10wG)OD}Fb5HwNs-6-a}2 zFybVmVE|;Jg%&JLk|08c<4Y2dP-xI(Bo@qD$dwy`^9XZr9xO|boe=OqX&RqyNC zx@EB3)C>i!hqH$AdT+`JM@w9iGe8mIAbsFz@Z?b4?crfo=#a$kpzNkI7g=uUac4e$ zyps%XN9l5$kJVJ^{#NuJR=u(1(bdZI0B^t7T58)2*??TlmOX*`m{TKyTs~u7m=8OH zLR$X1Ue-5A16fm;176ppCbwLs)X2=^^n6p#B)ss9%#*~i;tPYg2ewi@#9=%SZN_qh z(V!$k8p^41d}SfedJL9ta?!TTLS@G{rU5iL*L;C zm^lB*8hE9QZv5R$%^Yo=$+NJG8ovttn^y#34|Xqb)~@-+BQ-eaQDBJuPbqq!%d4ez zfe9B_6xVsXVvxX6p*7z&R=HQ4_gUih$Bm}phu{K)Zq(w8yN!}tjP)tDveS#N>J3)2 z%bEoaAV(Q}MB#Ag3DF;okzV_dCOOC zyo!o>QHd^Y7fa9+6Yvk%1F!F&?@&N?ICKh1xeWF`Y+% zNhK4&_cnip3!F+kNg+j!xR1m^t#(t0*0OnDMU{S(FHQL!b0)ub04IjO%C-k)T~{o) z{cq?CAyaf#u4d~K_+~Zje}6&69tn9>#FpLKSH)>Rw-T<)HzJ?(Wm-R?RUVrwETgqlaM6@ z52D0I&ZzLK15Im+m!s!vDcNdvRw34RwNIUT-}8IejlkI=2-3{4f1WRGQaKZ*e>q2m z)y?o&HwplDSG_wWz|DIt3%zZit3%DF028v~pa7tE%fQr|j8|ToEU|GPt z2X_5CiBUiJloEluhhwO?($Jj zk>%Ti!fv_A`q1x?8^)BGBa+6cvMV0PU4=ngzUO0?=|`Ci!Cc1?2P#&ef)JqL*nJUZcDtx?y=Z_EiqdfrB%GY-<}75YdLonDkoi%}H&_8@5Yb z+rQpUcG{!{qc3=YijjVPQnB1DVmaf`L~>hwr*`qGoIZ~gR)uBAR5u>AYS04(U% z_$&a?h=aj?2DjX|RWS-U@>HSor}QCRW7Z!%AjrL5>L#eE1>X`rfhAyrm<+J!bunM%6~!72L%Z&&ojURbx{!+uRn zGCYkZM>h#3N;0JXrKy$l63T4A$`_dCgb_6&P&M|;J;(NOiBq)Kz~qxNjgwHt$zkkk zaRO|J`QsoO&(z^8I=rd`C*)WE9R%&!t5N(S+{{A!{Up62X-Lb@2Z2pT6uc*OM)<(H zpEE)TW-f(!lWB3oJzJxrqx@Cfs#O7VEXWTTvW#remKJ_NmX_jvZHq%qnfX`!xx0jq zM)C^#2b5r#%#SDRxVizKFR&Y)b+C3g^0!FJ-VRvRts6S@G+V<`^{ zBFwrYs7cv68FSAUl~5w}Y;pq-E>gPhqLm19X}Aveq2j5pjV2?pxzIfg33G0uCHvdd zBi?cEdpy3?>3mWCN)rk+yC&HJcTSZ?Q+`AAHW2FP%rIGSMbM9%2t|0Fgh0zAEg-8F zFUUPNyN*RT42f23x(D&f_7ZJ}%AKcxN-OO>cMPYz6!M9M=~`!2DjxiOg;jAlbrxG|ZGc)tI!QppcuB z1R0Q8KY|SDK^DKCoLDMTsrf}#;}n)PYb4{Ibc>BvbJ3XH40bT~IzlRh1)f6{bu5CL zEWl!X$K2Bb*0r^qwLUXQiO(k}nMj`3Ubn{_exT0OfR?%K?E{U2zvL+T5Fb~Br#Wh@ zI~?f7u?^w^eZ^KIP_-2&UYgtfmLG>XW|tWZY*C?kzD)|m(T6y+biKqU;KCPLK|soe zQuk9w&kTBNV!-=CL^fu-Oo3*!X_>?p2iy`xU6VXK^VfB}7!&SlGUd49+N{vkwaEfP ze_zZch`tbnt27v*WVNB3BnT|tCwD2>NbJzei+~>%xMl<3X?ukQXEM|wXx$!Wb=9fk zy$?{VhO$|G09WEUlds4VmhHddvOu^oNk{7!)Wbj^u&+B=apXj4hC+nA%DyTe!>aU| zP_TetGy9O|!S~>UOf9d1AY0EesP<>o<(2#BkLJTtmP6fUQR=$^%dtei-HgvQk`pMj zQw3&&ByY?xMXLA{6SnD^9zMy}44X=O3Pn1Otx`}dY*?()vc(vlhzAXrZxBZMd*I?8 z=&?n}64XbWTIQRjMOBLi(0uql_k7NLWDgb~i(rBA!kZ(JTvzoBk@* z7Aorv>&*e65_E;Q$v33&w9#eA8#Jh4@+^|6RsU*i3V5=AIsGXCck9_|T=mDHC-i?W z5&V7C*!^_;{HoKn_I}{+BGio6Z?X9qGhNQIjfs2W=Dpyl7Jja|lX@Z|-&9)YCb(%lz9*npI~rylL+Bqh7P_hna@Hxvd4P zpdjxzn0d_=f1leLIj+j5Vq^mowc;Fryg(x9BBU9=hBS)Kv|Jd&V4RJB`F1X7STG@| z8QXBGDw7bf;&*tVf2e3|#X);;7|0C!)7-E=cChqD6 zj%+7;#w(KF+r^H-Vj4azMOB{Wg1_3K8&P+vj@O2c$>;q1v9NM1X?$F~3MR>xdsea5 zcviz??-vKDC~1z!_;?L+!jpXi=mJXw!=C1PCB65{u!BV5NW0f{$byytWj518bjwX0mEMRK`%pptmmj&;i#W*N%0o3`_lxA@agS_V}PIb z9%G;-YD(So$H0%_>Td8hfA*hFO8XYV@VR^wGO!OU_&5gofghw^5G+?X+^mvR^&jr! zU8d+;avFQ$32_pBbA&p}Pg^D!yz4A$j;ukBrjWeaDTCWi*uNiggLb8JHx)4#Y0NOd z*y3!!zNeTQX{c-;R?WQtlnz{4~A%lCz2E5*Tw?k6x)Jlhi92 zgwh~^-A3!){@JZ;mbMdn?}PPKkc!Of^z-`8_Tnh>wkZ8@yjA%B=7kem#z zcD1Fq2J7?hxHhl#AHy6!m1-XYSgAYRxr`j~LgQl){1=tE>p|I$aX_}I;1Hj;!E#`w zS6q}Sx>i-=ky6D?Vnb2eyAUNgDjt3iW2(A&LszyJpysePf$td&eW&k?99z?&WDG1eA#3R#+9Wv#Zw&gHW`OLCM*92Fk6 z{4w;{a!9+~sJ|Ql7*9y?BUh4Oe*b|?qreTy@D(Uf+z%6;izkl|;O(JJQC|!&d%Vpm zG|CSP5aV+@@7d=?f!VY7D?Vf7F{PPU1ILu~yJyk*`-Z96Zf~-Q;BC@~n#Zc?AuP^m zRUpQGB5ZJdjZ{j6sUdURb1-Kz_1!U6<`$dcno@H=g9!w!%n+3Jhg zH=%q}Ofg<-u~7S6w?F`Cl&m4vDql=ljx;C&0!`Zy?vW*Mdk2ruJ9M7Z&*YSA13JJx z!0yJJ8T>nlZk1v#fi7H#j<+rnH9udBhqM?bM-rU~)7gjX4uIIE{ufHaM}fA-vjbPC z$8%I2BoI#<3^e~`GW-y^86zJu!U^YZx+mLW;9ej|K3u|0QG!DAQ6S{Mn4>)GXg6RWZ-8 z@zoLFxg?>vK^_@mXk}r?Zz);nbatl_S&P2OhEB_=R{aHZNXCQzk^z=t zdpj>yh4x_P+pr?{kg5k|jPBZ)xr;(z&ib`~oQuuFit*EWwQMlh7C#Wc9%Nx87{Xd= z8!Hd7+<<|Xv(HtaHJbX7#@&{n!hVKv8C_MDhydjs&$!Sm`o)fW;vn0Csv`~CWKhu+ zpP|h-EuF7X@%$@Z9sU|zUAYaD6cWeJ`t7JK)`zT;=Qq#kG^roX7l%Tt0R_R*NIK-g zi728xWaj9H22twB6VMT=y%JS>klYP0pAD~1oxW-+sSVWCsg58^>_ZBh1C|%T%0rpY zh^-y%`OFT}Hn1V+M3#3#XAIR~_^4)_5%eL6h)mr>?`2e&Y3t8Zl4?nN9Fl!Q?Q(44 zsWsF3z9ZEUkF|cEAo?|3^Jw`UOrht=oP10P&~E?K2+{3%V+gV~%4D57a)W7nOJgEc07L2Jed_9KgB?O&&%KOk^3 zMFk&j{Cqp>_&5>DNP1^qCWmTr+?Ec2#SX`B4u)#l>E^(i&Nr!++Bep#PL*cQs-|Y{ zW*5!lom*!KsImCER`GX+`qUtr68pTB}<2HmBn@eoHl7`y4~-I5?o)8 zh&;+PJVq28a%#79ix{;nopixoDUjQrLVIuO+4j_!xK>?Kz1^Rt$e$UM+&H)yn|^wM zhAu=~UOxYT+=|yXXlWrvchfMZUQR5BK=*BbZ13DTc_3T|>SN~+pShu#3&+IR=?*HDl${MiLFj*fDONy+IiVnAa1_zX>>zxvHA!m z*14z2`^S={e^&xtgnw~a3FPnBUJNaB;PUHya#92%2l+n_KyRNFR;6UvdVKMOUjwJz4x6cDW?lzx!%D)yYD{x2Z~Nf6;XT4aeMfj&J-or*4QR3S+TRMlY`oq%B7p`W!Byv zVY%2sfE63xv5LdpQXX?PL;RT#dkpYR{*Pyk7(*rX*KMs(#0H-jN)7D+y zS93MPHwQOg3Fi;hm;>yi{>+P0fD5j;`ypma#pqTSWpry$SFt>*{f^jI`*Pb?wH|6;20xVcHh3K$k?%x( zbp$pQDhJwG_2AuUix)Lz6pi|fnJD^UiAyR{g>@>GpHDpW3KRoT2?$wfMjaAeMLy}B9yDrm>a0FD*>2MZi)LlyO(F2a3y3FZh z;k$A8!Rgd$^E*3Rbm-UsYKG%2D5=&E89E!E-eH{yadSLqXK-hneTd{6N7i}kg%i!V z)=42=BR>7zWX90eOml^jIl3F~Pd^cf9$h&#%ua^~Zyr*vEsqY*_aWn~%{-hgZ&Rkc zGRE#(WQ;9NZ{1-cW}TsB69as_>xI~Jvh_Er@2#v%laH`EulB$MX09o&Je=Z+&D8bv z(}LP7*!FURBdad&P0>9)QNl0gA~VjT8$T$<*SF5QKmvra3A$x zloXw{&vjW3_p9{$K7L}8JN1L?W>DJ8nqAw6K3{d1M0+0=4|hK4Z)Kovg5SNp=(Vrp zlFxx@-})bIt-q`p(zgip^86U&mc8lvbJ_Gvkb)4F;N#aI9qqGiETZ#aqwH*@^6FqM8 z+THBB)uGIzj)gqYDEjF8-$2zcQ!O{<=6&x&QtqY_W4o$c(Aoq5GVPD$^o^@4QbfgB zqXgYsGMMu5CYt=?#wBX-a0FR+k0*?zk{Hq@qc`;zkF>T;g;Pwmdd<^X@UdR7ycF2{ zEkD*nq4i=yO8m+lolE8OCh~U#uIYdB;*5po3nhsQe1#jRm<%zeu|9-hj;EiVU~T8{ zx#6I}c0h!RIpDj#iw31rpD&QtE4@cZPJu+P@l|Y^&RU@4A2KV4=CF;J>$BxHCCH1u z+2slipfG&1(B|;e28f!=m>RP8Ihnt7pZbJTn#ufp|Ihz#NS>Bn3TewK8H9JA? zw=36IfxEa3kKVR~Og+sF@H*F)Ma;v)T&r{0C|{3ufvsF~%%$exjq@M-TC+{f=IDma zCBn}2+4?~+*%qF~a|DlcqH?CH-)WvF?~L?cdb!6*s>-S2<+&B!JYAAcue$gosk>JI7rN4yByN7^n)4gx#x9JzuBP?oXbHrP~0jVrWz(ZU+Z|?Jz zeM*atrLf~H0snh_J1pCDWCl?Pd;Wa`Z)X6)xZ&ces}xfBYUZTf$smm;6eEW$j`3fc zpYX+C8RV%$!w*I{Rd2;VCUwf58_uxoKAvQf-oECR)N0M0VQ8UQW?^%XbcA{9h4iI< zja8BN^Dd+NQ0%mCUT3gE{Y%@7XJpe@iNwnoyy8dwBlu=Ka&bztgg1-a3EEFXM}@+G zrcP7CJ!3Op4*j^~)-m2-oj zQ7a2uzo!$Tzmi{FGhh!D^sOnu`cwu#@89FjZ%JAvig-W2p`0BN7$Ji7FX=+9W!M<% z0TVt+Fe$l`UGWvVeMvSuZ2b0K2Kd(SO5R2G&NNf+d4F!z;jniUyaccDPOuQw$s6d1LiDYk6qPRaEm6S;a zBBbj%8_YpQ7^yO;?O?k1qi+{|slylhT#*mrS5|F|&TsyG*K)1s3vH^ua;U_Ng0x!FKKf5sbwKKys@mo zFZ9ZRdaXT=bkpFzypSJtt^^F)+2q^|xUgGteWT@K-BQ1zl%L4Iq<7H3Wo2ZdBuSi# zOus8$(xvQrw59!UMCEUDY!!`+m{%i64(wv13oo#UE9P!Z8TF=eV_0lxb%AjtkfnM)`d>4 z0nowG9AgGLBmQ{eARRg>i_N>W?|S|acp>8rTRy+vk&t~mUf&E_o1%!XI+saZha*NW zPv8aLUQ#$G^6=Tc>zYUn)1aRN$@KNfYpvgzJY)zlMwgZMpqL(ZjLnXv_m+}rLPkiK zA7CMqC9a=$Lzgn>M4k_xMb&O^M)D%#A5e&?Y#aXJ0#M)NOTQ#{Q6nY`DBmw*Jrg;L zkxj5^mSAevK4sQuKe?8ktnz(>Xw_2A$$o#+)qlF* zeN?s5tdR%%Ts$!llXDPpx=7}df+3|(mlnPaOl`{afiVtjYXPd@-uEZMN}{L=+= zE4rh{ZyQTIYNWTt+h@fJuybPTAFzXeuoD`+1hn+B0-vsizgS~mk`wrCJiCb|X`XD9 zT%IP$CPV5QmsPyqI1Ue=jj2AheNpbAC0&c^ znT}diw+pRarWudgo}AnpK{Dj7_V(RJNg1EkTnKfUiI z<3oY+^^qt%K?vSL*gwE#GURrEZyc0nDC}Ki$Nnw)!Fezl;cg2ZcGT�VDt^;&NUJ z#W*YOh2h|$>Lmua1-}_03OII%{4oH?-|Y~g3$){dS=|_GnxUm}g$c-6 z`5MriGg8JdstV1n1;Dwk&^6lSs3(WUbp#oSA5&}0u{Ms!>TWnX&M{jvI7pDw`y{nL zI`;R|Rl1BB7g$vgE--STsLY5ALKp0YUf(RoIO;8UG5)NC&KMB@TML>sn7=U#Wl>|E zW`b<~fLOQ)D%W$H^*j-~HjGF}@g!UiRDc;wLg?1sTsN}WU~WQhrX%!?I#gm_PEWOh zlvc;th=a4oXZj|L>szu#3L(n0V#oo&CjSm@%j&C=;!$j}rOm;H^3di!=6%Cob z#VV9>etk=N2WphxS3X{#p_z%PkAi&wr|Gq#*~tHcY>ZE5C#>#e4m0w&I;m%w8XKy5 z^A&0$aGB-3kvfvTJ~Iq{uK`ScdkGefP7uxC^W6wn!h=%-Mcl$kf^^QNl3eD_7o9wd zrGa%in8<{eB62c3!O+dN_6wgNj$vh%bn-|4Rq%w02fo`rtx%LLL_R4eV$bloOP$&; z+~h3*Kv|S5ni;09PGDfs-a5ybQYdd-1lo_)Lvv1^ANn>pxIC5Dck;|kd2;qgqhMgZ z%miV~uxTJh>2K+_zSt@d#^BPqf)oy`9-wOQLxIkW;Nsa9l)lRg0`7IM#Vg!{C10!o zbU+xAV@R!{r}1TS`Ivv!i6*s__8 zUOr|p$!ftY^G$;LcpMzxzPokh)koJL)ESXOxX!vScwd$~QmRTTmpx*=mDHri6=4@0 zUW)>GiAwQl5b-bD`+@rJSGg%jlThDP+U-Sd=gEhtcerR$zwyxdKUn{MZ}1Z2Z(s7C z*Q{+*(J}@{6vrV}Eb>2e5iA{HkWh5iVn3(&U0rn&RTW83USEYZ|0`ipI;O++I2+b0 zgaxRG>aoYeR0^Rf1s&OI8uvj_tGg1vLVB&+9lwy~M&c|)L#5x}pJ{S*yUEmS#+(e> zYKyw9WLJ1O4~f1tLz;*PFELV^_*|hs%=Do zd5Vu$r&wmpzjc@#MSJruF#cVY>{hRgLW=RwjHNYNCZuU^>s4HdRVBfr_zT&Tm)_)N zi^QcMdzq!qejaBle#VZe9Lk_-FZe?2xmR_ug9%(0spCdsMp48ftARxG*}zqHk!6xi zPYo#hmaDOp)~Uuxx-H|+JVc~)Wdcde5k8NT2SiC5LBK5#tuUnNFsD|vE&rMX9-Y#c zSKYLP7Nk>Z;L;w-c{cFp9K`k%qGAik8kJJy=-bgxDa^6+^TScuo%=kuwftmX%RL@E zd(WCWpZmgd*xJPEx2&gIsS?{j=ql*@-q8*V(es>Y68OGoK>@Uuq3T9l)j`fx_rmGq zjS3Oi^mQ($M`)YL&23)0E%Cp84+Zsz(P&orwc|Zv9h%|h*5qE_%2?^--e3==^`N|q zJlcwew%xY{3E z`w?tGoqGXLH*b^FKn%d)L2}w27jz_*er1OlR(DG%)DQA?G`&5y;)(^220KfHDya(~ zlt+URIvH4)NH*DQLTuPuEjw)axe!sHwj3MTb3_5JzA(96f; z)2c{N@KXOMXQU_!B$Ua)GNd*Eb51Vc&sO1!fjjEC zzRKkNRxQ;jHpto#=;IIk@a)We;RYC$^y5{4o8DXJP=wjqE zH3QDZP*j;;1O_)@LAOl`_g!IE?aL1~zx*+iD#wtNz)Qy94d!hJ$&VS{_-&u>M^N|k zgq`p2*R32%isGNFpi?=5zc(&@-TsnB+p6tsZS^cMG`Uby(Lp^>)dF~pUQ?tkKt{PY z!EH+d@`AZHwZmsgfEb4*sEmw^-@S&P%gLT`*467}H<8t|ZIS)&KSWSqVec5ug5yAQ z+RdoXhIFw##GjV!*~hkX33raz$WGvaSXs1(`_3WqMEUq_eo&_ulVd_*o;I_QGIY02s z~|A$T5iVdW$>cLCW|HbY8;@d)?hZO%Nl`dB)D_0R$ zOzPYPA212YOE})zQL{UdhbRTy;MMDiB`IfWQW-M@^^ zOcb|)5&oa=^Vl3*0*~VCJA9^MxWMXQD)3{G5j#{%_k7Xc#(}QKGmKAxB_YY!N`5E? z{LMd|kJV*kEH%N8-c(DOP3E+D@R1*iP=0oYQl>Wy6}a~p1t}8$bNPh6qp0d8v{C;L zS5kH=3W`L6bv-+1mt?N2D3>N0D>%6+yY1oD(t!=pC3Au9iC05v*!j>hUMM0S8Wk~GM~w&_H`u*VDqFEuR7!fY@0GMF3N>>L z9oFE_QpoS}QB6AezYdAjDsbf$D@%1?)F`VL85IInk$xIKc&kdkr(+y2aL$p=x6;VN zweKgo1mWJnUPwXe9~6jI6viQZlPp$BoU5;1JfMK7ET{YNPK|#&F?=el{5p(?moucs z`tZo5mULv6-gpm}y{t)t?fww3X7y!Wbz3M>>(gER;b*-C&M;r6d`oeg_hXSMGRXw( zXLluzz-BaO3_MW4sNSZuCIw}z6Ta%tMMT%}Cp;PS45=vpbAXzgQe( z+ZK`QWcN86+(@ylKKH<>`CzNnqYcK{h1Z!rL2y@gVL0k1T|yRFLpk6>B&T}0nRGVO zaxb`~lukG5$W<7v!Ip^+ErbXxkOltsHHqDbb2TE`qE)CbChh?@;bab=o)%9=z^g~6 z7>3+<9`LV1Z{9KG zf2aLR!vndG*=ha?>1NtAIWinw(hzRb97@l+bmutpAmN^$Bifrws1z5f+MMi7{hb8v zSOFkNq@s;1kTVFsO}{pCgG@L}^{bawW|_R`ZQIv*>$vA$b3qo_3Pb)%h^Zti_a zf0$$aB5~pIQZhCnd#g0EjIQW#@sa1T`wTfw`&q!6etQ0Fb?u^asX~#eQ?q4%?BX5A z($gd3Z3C7lhkm*<9~gv^n{%?V>-OHz$tl5$>{NGS>mkZ{bd?NUB5IqsC<>)LX52<9 zYzw9$f{IuRkepsh|L{8RcDqz?TU~N{1Pt=YG1>embrM5a!Vo8vGxjjAeR<(_ZZvvB zw@xYDKeJQ_*RKg^Wth!rNyg1B;N^!1EfI22y**#WSIXGDt@4xJ82asD4w1O?F?m=+ z^cf59?#)v}4(bUA5P0s-ccV`48x!(9bI;z6Jz|;R^~V;L940R|zdE~pfT#@>O6h1mP*dQ!mWFsp8Nfo7JB1BHS{67+~xFe(}3m=xiL$-Axz;%(M~4rp<8FR zF|X?KM(UQ&7y)91ONGG~9HP}D7i1;Sne>1t3%b~glsz)RAMDZ@bLFJ2m(t~Y@5^?& zW=b6jGSTCzSu}X+A%(7)=*b{xf|(aQ{@?zgsI_c;DjCt+C_!!#ls-@4dTKX?9x8vQ zncK=y*G;0ji75%-ATKJJ&%JS$VfjffjU*K86=`N=`fW=0kcI0b zqHWyuJ~9qH4$=6EGet;OYrus=iA_!-%&1p>ygD6`Z}^saS&Kd3%_!YM8@t#X#ySnM z`u!;S>sevfEa!-CGHMMAgLdi?G*JaGwPV`|CeeRz%WK6o#$Wht93#Z2FtQ3&rdVT7K-YH`vfm+}l0R5^{gzx0 zdia-3p{$40iCvv<-#0G1c~$rD>uOF_J1)4pp0*vDYU6O<@=Ol9D|JzkxXBk6p`8*! zBN6r1-*?0XI|k2g5{6G1B?u%T$j0a;`7R?$k>S7HZGhdEtP$>uXLJ4so$IdRW2Clx zxAfP>5Yxe;UdBdTG75⪼X&8kqvgX#c6gWH7LagK37*&>YEz0eVj{futyww9X;@J zu1*4}Sl)?V!SvtRxo5$5ssc6IPXLN}XVJ1P&+z>CW&dPUpL2;VE!f$(<0j~4_#5vN zqSaYM3Qzf)3}ZDn%RKCEhLT#)}p$-czc~ZtYGSLHES4unU$S$)qf6z6I7%5jo2t1`#2PKo59DgMg z);z?ymZ5!|IWJ%))^VV4-9H`}(1QR5L$MQ?^Jbi+8ALc=#88Fru0p@@KhGYDX;Nh8 zu-Dj&1fh`o`)2pkEa*jO4H$kMlL^QjW{Uw89KVl-qR-?rd`>dMLtTbe1H}tWjyOEQ z1%%|8X?y+}={#O{rHRM5ILRV?{avN26^#^-57T@O@pCbm7^&mPp!qk~>@HlZgBz7> zuh?`=iFBf>wii4F!tNtFhrS>db{R^#D{`^ATP_KbER}|}3`sHha*90dYFXJ$J|S%} z&Tua@-Yp-jE9CnyzS3JTt3H*yx$jTt?WNV&J!47fNkxDfB`~Z_YhkSWAD2HD)TiZ8 z^B0FB?N3mYN|^1o<5qBYyX^32(gf+SCOfBz|M#IUYg4?Q*C1lp zMa4FmRvT^~`!5+Nxm>N8r2a1#Nc4*lokp2e+t0ne9V&Z>(|wZ}Ad+=nEqSiUwQ#syP3km{wV9yNzjOpP_<3FtfehAjvGA2WV_#0$T-aSSDy4$4m8zym}+bX%W*p z58eBhGFwZ-}fxnGU9-@^F0o`Nm}!}l`?$9gO%TwwGC ztZQ5#f^JD3JP&`w-jcMLSM~g4)}1T5^zw79@@XCy!>44qX^a<+H7+sOYmE07b+Vk@ z5H%C%_MKI7;Mz1k1mmMr%)D6PCURX&IoyE)`*dgJ;Zx$`D(xd6{Tqc^e`JMgejGMa z*Rh(p)L`!^%W${WDIGSmNyIG%*RsM$QuE1!smiujntuZbo-W3@!hT@eat;2k>;7kJ z`e#TGe|D!(-d(p*P|2R|>QiYP{V{>|ZetiRmjtAjj49%|Y6#?EcKA3!XUg@KD<&AR z@h5$j2NKNvI1j^(Wd7qk{{d-&QK)yh4k14JMgak_LW2IZ;mRMwK4Fw;7}bExy^_#+ zAPM4spJAEdLJ73n_OQzb$%n5<%J@nWYRvI-8=-Du0j{brc&X|hof2qZs9c=^7)jr9 z%(C|PXt1?YpLqYv8EoYTf5_Q-Kjwyz;b}nYK*JH{Px7O9qn( ztZP`*5P1VEu7&puY^55&_YnR|?mznS&%AJP04X3Ima3`9|MANC=M+a%0AI>=?pKum zxgG!grp!zSW>=O-1jGE-GyY%mv#kkyEfYpU*#Vgo%v#{v*!5+-J++y{R`RJojtTw5 z&ra%l>c-IlH~E$AN-SE#O4`7lF~;}wyBL}N!iyMgfS)_Dsj54-VY*>4)B^SfUk1eO zg&PxTQw)fG)I;|Stz;dV2zKX9Yc5O4yO=#FPRc}4lj*AFoFl>VIr&lDqd%Zl4CDj5 zhOGmdun?{mPBRj-h45fFC!=Fg>R=p+4FODD-JXTn#y=<`8-xXRyVVi2p`x8gaA{|E z^WtEXUQP+B#+nzu!DD>9Aq1|LT~47dZhP*S7hE3X)k z#$`NxUA+FXD0Ju=K(1Bxf}gp1#kBr3ZTSa0;Bcr!0`9;uDxr?1ZDt$jZPJSNW>88> zmsm=ziW5G^##J_q+3S4Ehs-usNpMabE<@gMf?#v(URkMR)_b|8eaOLiia`R6#q#~x zK})4L_jED70WPK6*$7%R4K(i3fP|A%6#uw!IcR^o5xy|mlQuE%qH;@<8x#rybl?JV zCiGlqX`*;-r>)*pABqiJ%}ti7?$DW(f!Ga}mB7*iPLwvvO!A7p@EynHE+w#2jq{lo8 zS3!MBP$>qO{5^2eR6X+u)MTO`&&L7_sk(~fw2}Iq?%q@kZpt}a#eGXyL$MyM@%$T>R zUCooZ5>ERGnKCj$$lqWR1fK5wI6!rDkBLzxTfjNxr_{hIj}HTD zeMl{$BFCC+T>7qXssEObSCr|=GS)Ev@zIWj#33`z_j4iqAsaj@=UDl$o_4M$@_{A- z6CJ`+5*|1Q>w2I-lB`z_e=|PJIxHv!{01+I76m)Q=+%tMpkd$m-o2LGZz zp1JyIWd(v)wvRcYvk1~b_8t(>W3l!(quU)sjW6Y~}nr^O1t@5jVPlx4h5eyTYT0rLPBnRLIbr?QS68 zGnv1REXX0V-8EpSn9LtfW_zO`rY23X{n(gVdjA8FJHH4jY2C+ z->N6C05yIdY#mw*6W8bPwZq|iJ6uLTMGRJ6wv4eT!d;awr5o$|FS?gw_ricVPHou~ba!wJIr>8tx4t4)&)dxt^>+qSv*NFnR75KCq zQnxAn02frF78g>BAU?Joh#?CM^VU3+{Y;|L;YD7=>IGO+H)>GEJ@qXu=x2Ao_A^x5 z*ouZsylxx1%04BpNHWWeWQ!G>qvWhA7=pB08~ouyV4`ugp#d{3eeQ022AFE>;uw!d z96Ntb=1nh!CdHNUxj&fY=xX|iuv*tlF6l%NOgeD=l3gl9+NIcNP&-mWiqCCHm`;o6 z?f-L(;QxpRL>-1z0b9<^0jXP-`gRVO^P_?@X5HLUv|dh^R6@sI5_I)M`Mdya3@j}5 zM{P!Jn51^doa;Sdm@nA++t~5U4r)vWJ(tgODk{(xhf~;_P^tGD!m7mnzv`~~FUsa? z1A>C2gwi6QG)p&vAYFni-LQ0bE+8e{NSA=Hgp_nicSyG&A&o2{{oeS9Pkg@rz{_vL zx$k?=%$&K-HF3^VTDm3N)sFF0dVE(CK`g_kEo`mk?nc|03HMOfJ!c$TJ-E8c5A4X` zrNF6ms?gf#mK;1`bcZh)6upsvkxiqHO^|wCMR^yFIc%|lYdCMu0%-f_Jy}}<<;WK> zZGD_x@M=bF%ygD2>~%=-6&Q-oz)H?Qn!R6<&G@C|0GARIbbpZzZzh`XS`28sHQeuE z67i6ElT}~M*~qbALENHNYPY^$X_COI1OcOnt4R!TuUunu-i+R9H8z<92w2f6x$s@hwtV<#z#7i%{cNUg?&=iczDJmjmL#jAmnOICv2 zFoVlu;P$ecbS3P@tt&Uf1?R@=h>}xt0jul2H@j3G*G&rnCI@X9R~s3F17zOxy(`?f znm1}w_%ilAhhDg{=kE(#e~T1a2&>X#06>C=0XPCYB#klpFBS_x;LS^>NHwEFrp$zZ zqQ4p8UxJ&0$xYv>2kX1vJoz6fp&Ve9BU&sE{|Db3lm=uKxNV0w)Zw4Z87Q#a@XhzN zdW^vLZ^b1s!UP+0`0syq2^w+`RG={M$y3exWbBvw= zr-3t>IR_~Y5jfvSvJLz)%mbPd2!7}GAg3>r6_nGmH-FaN$MG(W(M%GM&3sOhU#x+tyvJol~%>sgJYoZJjvxsDYACI zGWD1hd{E!T_*XrYaTdqu#^f31x`d?F;%X0%=%+Dn;N5Xz<3mC8PbPAy)ulQx=7~di z%5g&IMdYrgBZ#JtJ%>h4=e#rXt;H1^);@T6@E1B9zOnW>ONM`@+U)dU_6<3n#@<`+fXW-)uLqWjC>LZH%4=j2L zxCnbxzGXLgwx)zA@&BGW)DR%^un=!3Ty|0?0qfAB%Lmkh{1|Vir!9SWH6-xM3j2^G z8rXPWd?{kt>-T*VN=ucfxZN$#*4@VZ$<34=Np(GTfP#O9y>d~83&`4#DEE5NM>?1$ zpMu28t?*2oDJ)+@Q8l4zn+f|1-(wKWII4=i*g#ZIZFUKIl4Lp3#P>@dYJ(VH;MS<$xI^_J zJk4MGy1y(;meF_S2WMn`rxEc}6wsTKqHbzC3&F>gm>B(cG?a{33}Nj`_zcF%++p9k z*?*2TQf=us9z-JXN6^3Sia_qewuB59y-9@~^W5xgh^l@Nm^VxDpcytyG+rY4R~Wyu z#(%^3`~DNg9|Q)%_|xPp*-euSk}TY7O+E5#)e_rrn;#P)vp-LD{+axMz-_9Idc#$n zH{3}ohFlw!PMrJjiavyq!7QhN8)LpVM!~X(*w6s_IqiRza2GyFtxxhzlcd@V67BK4Hr9(-TGN;exRqX z^gCMJNCjNB-fWmqg(O2Y_%ViBDa3Yf{#D%**PQfA=>vUO%tcR9!dlb5FjpcA6BTGG zW5mTXwj1pWYCLsQ)=WTN30YkEyPr&E)KUiMOA$NI>KGImrxG=CG@Nr>MNIh$Oxvu* zML)gHCfbI*&tuRZbmTd5>WYhAr4C!(pwh6PFW0{W@}Qsri>tx5Wu`y{OA>{_55=a> z4k=Q8+A(sHI&@I#JKYRa-wGUr#mUZ5R17X{ed)xaoR``LEvHJjy%f@P`(aFP9~JAx zYC~Zh@>s;M`*eISls~^b(rX%B*@DDY=94EkSmpw;+BsrknV)^CJAYXD#eWpVOCGraCuPL<}Ade^nArZBXk5>bzn3xS-kG3qN@zpRT^G)h{1OG2P-gW7R8|IM|kB3C|oePFFH zNC-NKE;Q#cZJFTKRRjTdDfXE}>e}X694D|vq_O%r)O3;KhrKm;Lo`rk1DZ7ZeO{t$pDj;W@uV5k+F^2RRTS18(7>HuV zRY;BW@9A*I!sq&6R*g_p?@)zeDS&kW>A5g7S>PBFPM$!a!LiXuxD&KUwb`T9HnEm1t#bi-`8y3l=p(l&&_t+`TK<&w9_lxUNRt0^*WM~Ylg~xegyf%1N}5Z0cb-`xa>$ZQ{;7esB1&kMTo)vyV{% zESNM%4DXOjK|NQ`OA2O62E~m{bf-V_^~eiRiSoG5uj%dfHTQbmM1;PC9~`I~VjAAe z?0~e*nWlf#Y}uju6-)W%E$)1iI*VdVjqU}-hnMvpq<8;EDCIYW?9T2wuJi4(g5>xD z+pm)wcjJdM#rZSSmDSdt7ZSsqXjnKtv%ai)E68oU#<#^nH&UwFz+aW8#6`L;+FeW1pfJY!xCX;2QdsOY>R->>7gnj|7~Arn<8a7)UK=!A z{HUjr_y9jVH7)5^l<3`iVIm7u=)Se<-Br3OtszlCO^aDb*X7x;*4jiMyIk>;eS%+S zynp-qsFt5NKN}k7d_W`9pQ`BQ2D!q{@p&WwEKYX=zN@QQNpSXAg zK&KKvYy{BkXxu~vvI&XaPfOR-m_03R#NeGg;p-9FE&>g?1Ze9lnnBJ>Yo{_zR9%`K zLoaJ;k&lYDJ$J!}Ox`;Sdp4t8H0atM)hD?M8h1CuJI4 z$z8us@m9|cN@PACP_a&r%}(X}gW6|{r3+r0d4mD~gZs@=3|~+vWJ?QkPnNkpq-}qR z)8dqf_apI@ng8o;nWv@K7rP2W1=Hh`)759a99rr$VTQ_oeFO;(g;S#za z0q4f&GH`kU;{6|z3Y?SzWJ>Ayy}>uKpVzbX-upb^{+M6Cr+cJ7jhA{_f}3VXP{LLq zph~3L8LtS@nyx`aYL0P!T1>#p zu)`mKr$=ZEEnM?5CZs?>L={Fr!tp^suF^(8bC~sha)|w__(#$Lejwc1K;MjihW6*# zUqZcElqiXuxg-WIpz@3{n0>Qt~@R`OYm-x4_st*Sj2xF zwkS_1S#yH0n>d!wmSval>HD6tM^k~bOv1QB|25gr5(?r`$T%g%y zAB2%lpoK%Xs-bE4?ET)?m);*Alcm~`FpDA}DIw@Ge9k~bzSZBSg%vRknYFf#9%h}K z_v~C(jM2inLdoA)hi>I5{~wNew%odU7H5 z>qEKB1yKLApVrIF!Y?gp%qJX0-}(d}S-fzH9nMJwq3pmIQx~53J^Eq5T67`M$oMfP zUN=UWQ>ajk!%TdXX}a5>z${jxB)1O?8P9`D>%sv z@~LRiG|HeP(Kk!-)05u^Su}b48@bR$vjn9LK4iFSF*CUwfm8|RmrRTye#1U7xkXdT z;6lQ{GV4nlhE|HvH$0_-eSYpZJ}rufMKK+^$hV5)OhJ7qBcSGc4U9aTXM;0U#S@+S zr0c$OhkrQ27Hxt z4QG6gmg2%LTJ?TeTnvp(J^zr~HpxMZm8OQ_f(N%%)qvVzioADEcF(AlfpbDfHV3tR z;45R8 z*8rPIu69%!J#AyF z<+|S6{A~Nq6)44bpL8a!i>=qQ{YZX+o2P=0434d2)9p=jQ^B9tNcY!LT-ttZwMG&o z)Klzn-0a>CZ`X{K?cpZK= zyx+f4R6ZS&l_`NnEDkK zO94xx<}Z^!A;lIZ0Mugk@edZd`v(wgAt(ttG+MkH&uI3#{bcKhmd99E``>$lVrLW} z^o{-YIW+gMy%U)i(u9<((KsS|t+ac$8M(cfHV3!(YqGo3pek7Mc{)x4&z6whx#-oc;M)@$H#){frA{>?#_*S80=)1*WPp;Sa=s!b_mAxkZWHLioT|J{B!R=RtA))ZQWd=XysM-8xhF zZbK1Dn#nj=!L*N&0e@Eato(z1Pc?TaY!~06_O*6&D7JT4#@yL*kG-gZjn{((s1Ctr zE$4AVTP!3THC*A=_KZ7!dUVR&cj>Z&r*CVR<5`;3K?CVy*9h5EnA{3OlADI(DD7lk zlbhOUvr19MC5cW-YFvq=E2|Fj2zsPuHKVAMe?GzKViQL>37o@}cct@$ZKY~=8?&$E zEhHzWfA5=i%DopbS0Z?7m0Yo2ccO&7?&5NQ^I3uGevT~%e_FG6|{}U8T6MKm8O$o{(kM z<*tQhZki{^%gwp5c_k+2W?$Ae((!2+3EBAt;By~AAz=@G`wU+Os{N&%P#T-a!{g5P z*Jtg+jmor)VU&x}=z31EKR?EHvp?hHr0HR&GLrh*k!C)%AoBn-gNpS|y7F_w-N`iP zSl8$aa>@zc(4GyncSNP_PUfhn#5viX-4sL(w(V>+so4L9fX zUNKW#dHYizc1_#FUJ*QBInPpDhpAaNw;8_u!My8;Mj$uD672M)!uji|f5MPcXlCcwBu2i%W-9H!OLBTqX+R6w&N&Q(D8ddSAUf7-J&k_ zKAG*V9)n7zU3?FeJaq|j9>J$%sfHvbZz#dE#y?7u3wWln%8X+4Oy@l3?XNt4fv;hn z)-_`IK`CTz?_(Nm_*>76_};RQCdA|*DOH;El++p&8j8|Tpvmih78>^7*37cl$Gon| zWT<}qUK2!9;XUmK`!!pB_u?52okLsxGsj(>F#P$_ekl!*;c;I5xnK9iQs~ok3&*u5 zq5d=~6kmJqCzhCm zKaa~!4$EZ1%}A`v+bu|bGU>#eFpL$dN~V+7lv_{-Iy=*j4=9-jA?>Kn;es9xU}U3W zjAOj)p(A=2Ok6coSi*$oo&S-@KkrET(<-?9vRCJHSmW6IfNn* zYU#FZ5N>|$3F_bMVH zhAQmGOEHC*0y)dy#uT~&Kn%E6*$dX+{F?&7cjw`E_5lvic<`1$4*`udT-TmF(aHWh zvVpywHr1>VRv6w!=Hp zJSP1D95-SOrvsm#qOh@fo~DmRh$$Gwc#QpV^@`gAZC{I`Q;Bs;7^!uV@ZLc;N|<3~ zX1!?LZi;e!q?}+Uwr=_mDN@4?>kAMvMX5MKq0wZ^SB}5O?0sJqdEE$`se~sUqbi?_ zg+gLn5ArC=R%QIK*>tG=X{8?%b?p(uCj}~L;FjP~^%M0Lzn*Lb1CHf~GVtUtO{`)> z*CQHPsh)A4!YORCGjze7py`#w-ftcvaUnD*I=X@P#?xY-HicD&ESC;T8J~=s#dy~c zt)yyPa?VbWY;C#pzEa$H*$!19ze?=bjrw6}K*xr`u2Y*;@LdNt_aP=B6D22BY#=p* zZe(TK_9Ji8JvGJf9>a(XT~cnj!Z40+lBVO1azrPO;_%d+y+mt}5K$S?DfE~30CN2s zX)I%fC%FbJv5Je)2fwFab6=BTu*y1Q_p5If!TNm0s=`x&STboyM$M=zU18Gmp^Mzj zI{mOtaHATGue{sn^%HfecT46*gOTG`YRtld*x~Zs0)st*>gUX#E*s3%m6c8bB$@mTK99Z0WJ6AZQk_@ zn`|0F*9Y>e+ccbc)hE5>IsiD)} zOZUi1)*S64we5!C*S74)&hh~CuS4xI2ygwP)x)Xg>(qM{!D6-7;dDY1JbN}owJYjY zUU)xht^?m^E0@829+M3kD3uI&9tcB2{W=N^H@#AO3oez{s+&tlQ%D=^PTU5!spdC= z1o%hBA@Cr9d>P)ox{IcjFJ$%~G7Q7zL9iL37iAfi@$7XK0{n9m-BPpB%%DtQoMzDb z_@5To*Ze(2)_UV(J{95FY!9J>A9AlA!yV6v^TNUg%6My5XZe29yo#aJvD02-&{ocb zi9qd|%Sh8j%euhu5wE*UQdkFjUZjKJf}&v&N5_VCATx_HK_oNpX}?z98^$~Q`$z&~D6lb+&E zt`>itTFi`*-Bs3Jp+I9|U8)l_aXuE$se(Z}df%XkFvH3gW|s1vJ7A8<^o-t^hK=>G z8k8rB)gt&e#e8zU5Dwl)Qf%c3~XU(Utb;Y>O+3PC9&=V&P(=JqqJ@4X-7^Nvn~ z;)jas*n@26_-t3jtISBRiv&H% zQy!<~u1LsLYJe4ndXqY0{u$?t=%&`G&{?J3>rjAJ;kBFTy;b>X_SM3j=y4a@KggIO zF~5~k2uqf4I=X$s*;7&yd-mDDyJ&EBG43>qCE|+j+b;q$&^R%^O}Q+%^fqaclZGZ9 z?`5i>L2^4nrnmE!y23P{U_v^zEaQP|-Yw)Qp<{Ov0yz@Ms6R(OaCj8z%nz9}>1}d^ zEDJ%|uM^g1A4M)6f3SgC)J$XKA+g;a4C5$bjbf=bVBFDTqjHimx?wka~-6C14SQ}No`_ It offers a *simple* and +*intuitive* API. + +.. toctree:: + :maxdepth: 2 + :caption: User Guide + + user-guide/index + +.. toctree:: + :maxdepth: 3 + :caption: API Guide + + api-guide/index diff --git a/docs/source/user-guide/index.rst b/docs/source/user-guide/index.rst new file mode 100644 index 0000000..8d58f0b --- /dev/null +++ b/docs/source/user-guide/index.rst @@ -0,0 +1,4 @@ +USER GUIDE +========== + +.. mdinclude:: ../../../megatron/core/QuickStart.md \ No newline at end of file diff --git a/examples/academic_paper_scripts/detxoify_lm/README.md b/examples/academic_paper_scripts/detxoify_lm/README.md new file mode 100644 index 0000000..a0f7b39 --- /dev/null +++ b/examples/academic_paper_scripts/detxoify_lm/README.md @@ -0,0 +1,112 @@ +# SGEAT: Detoxify Larger-scale Language Models + +This is the official code base for our NeurIPS 2022 paper: + +[Exploring the Limits of Domain-Adaptive Training for Detoxifying Large-Scale Language Models](https://arxiv.org/abs/2202.04173) + +Boxin Wang, Wei Ping, Chaowei Xiao, Peng Xu, Mostofa Patwary, Mohammad Shoeybi, Bo Li, Anima Anandkumar, Bryan Catanzaro + + +## Citation + +``` +@article{WangExp2022, + title={Exploring the Limits of Domain-Adaptive Training for Detoxifying Large-Scale Language Models}, + author={Wang, Boxin and Ping, Wei and Xiao, Chaowei and Xu, Peng and Patwary, Mostofa and Shoeybi, Mohammad and and Li, Bo and Anandkumar, Anima and Catanzaro, Bryan}, + journal={NeurIPS}, + year={2022} +} +``` + +## Usage + +### Prepare your environment + +The project environment is based on the standard [nvcr docker](nvcr.io/nvidia/pytorch:21.12-py3) of version `nvcr.io/nvidia/pytorch:21.12-py3`. + +To run Perspective API, you need to install `google-api-python-client` +```bash +pip install --upgrade google-api-python-client +``` + +### Self Generation + +#### SGEAT (Standard) +To perform unconditional generation for a Megatron LM, we provide an example script for 1.3B LM. + +```bash +# [num of samples] [model checkpoint] [random seed] +bash examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh 1000 checkpoints/gpt3/gpt3-1.3b/ 2333 +``` +This will generate a jsonl file of 1000 generated text (as a toy example) at `selfgeneration/unconditional_generation_gpt3-1.3b/2333.out`. + +Note that you may want to set your own gpt2 vocab and merge file dir, as well as your output data dir in `selfgenerate-1.3b-unconditional.sh`. + +### Annotation + +We then use Perspective API to annotate the self generated corpus. Note that you need to fill in your own Perspective API key in the `examples/detoxify_lm/perspective_api_annotate.py`. + +```bash +python examples/detxoify_lm/perspective_api_annotate.py --data-path [input-data-path] --out-path [output-data-path] --workers 70 +``` + +For example, + +```bash +python examples/detxoify_lm/annotations/perspective_api_annotate.py --data-path selfgeneration/unconditional_generation_gpt3-1.3b/2333.out --out-path selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.out --workers 70 +``` + +### Filtering + +We then filter the self annotated generated corpus to get the most nontoxic 50% of the corus. + +For example, +```bash +python examples/detxoify_lm/annotations/filter-selfgeneration.py --data-path selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.out --out-path selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic.out +``` + +This will generate a jsonl file of 500 text of the lowest toxicity (as a toy example) at `selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic.out`. + + +### Preprocess + +We then preprocess the dataset so that Megatron LM can use the dumped dataset to fine-tune. + +``` +bash examples/detxoify_lm/annotations/preprocess.sh selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic.out selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic +``` + +This will generate two files as follows +```bash +selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic_text_document.idx +selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic_text_document.bin +``` +which will be used in the following domain-adative training step. + +### Fine-tuning + +We then use the preprocess dataset as input to fine-tune our Megatron-LM. +```bash +# [fine-tuning dataset] [output-dir] [lr] [bs] [train-iters] [load checkpoint] +bash examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic_text_document gpt3-1.3b-toy-example-lr-2e-5-bs-512 2e-5 512 78 checkpoints/gpt3/gpt3-1.3b +``` + +This will dump the final checkpoint in `$SHARE_DATA/gpt3-1.3b-toy-example-lr-2e-5-bs-512`. (`$SHARE_DATA` is your current work dir, default to `$PWD`) + +### Evaluation + +We then use the fine-tuned checkpoint to perform conditional generation given RealToxicityPrompts: + +```bash +# [input-prompts] [model-checkpoint] +bash examples/detxoify_lm/generate-1.3b.sh augmented_prompts.jsonl $SHARE_DATA/gpt3-1.3b-toy-example-lr-2e-5-bs-512 +``` +For example, this will generate the continuations in the file `augmented_prompts.jsonl_output_gpt3-1.3b-toy-example-lr-2e-5-bs-512_seed_31846.jsonl` (seed is a random generated number). + +Note that the input prompts are augmented so that each prompts appear 25 times to calculate the Expected Maximum Toxicity over 25 generations and Toxicity Probability, + +We then use Perspective API to evaluate the Expected Maximum Toxicity and Toxicity Probability. + +```bash +python examples/detxoify_lm/perspective_api.py --data-path "augmented_prompts.jsonl_output_gpt3-1.3b-toy-example-lr-2e-5-bs-512_seed_31846.jsonl" --prompt-path prompts.jsonl --workers 30 +``` \ No newline at end of file diff --git a/examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py b/examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py new file mode 100644 index 0000000..884cdae --- /dev/null +++ b/examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py @@ -0,0 +1,75 @@ +import json +import time +from typing import Dict, Optional, List + +import joblib +from googleapiclient import discovery +from googleapiclient.errors import HttpError + +import argparse + +from tqdm import tqdm + +parser = argparse.ArgumentParser(description='Process some integers.') +parser.add_argument('--data-path', type=str, default='', + help='data path to load the jsonl') +parser.add_argument('--out-path', type=str, default='', + help='data path to load the jsonl') +parser.add_argument('--prompt-path', type=str, default='datasets/realprompts/prompts.jsonl', + help='data path to load the prompt jsonl') +parser.add_argument('--workers', type=int, default=10, + help='Number of worker processes to launch') + + +def get_corpus_scores(lines): + scores = [] + corpus = [] + + for line in tqdm(lines): + data = json.loads(line) + score = data['score'] + if score is not None: + scores.append(score['toxicity']) + corpus.append(data) + return scores, corpus + + +def main(): + args = parser.parse_args() + + with open(args.data_path) as f: + lines = f.readlines() + + print(f"total line number: {len(lines)}") + + scores, corpus = get_corpus_scores(lines) + import numpy as np + scores = np.array(scores) + indices = np.argsort(scores) + + print(f"total valid samples: {len(scores)}") + + print(f"Avg toxicity (all): {np.mean(scores)} +- {np.std(scores)}") + print(f"Avg toxicity (toxic): {np.mean(scores[scores > 0.5])} +- {np.std(scores[scores > 0.5])}") + print(f"Toxic Percentage {sum(scores > 0.5) / len(scores)}") + print(f"Avg toxicity (nontoxic): {np.mean(scores[scores <= 0.5])} +- {np.std(scores[scores <= 0.5])}") + print(f"Nontoxic Percentage {sum(scores <= 0.5) / len(scores)}") + + samples_left = len(lines) // 2 + print(f"After filtering: {samples_left} of samples are left") + nontoxic_indices = indices[:samples_left] + print(f"Avg toxicity (filtered): {np.mean(scores[nontoxic_indices])} +- {np.std(scores[nontoxic_indices])}") + print(f"Toxicity Range (filtered): {np.min(scores[nontoxic_indices])} ~ {np.max(scores[nontoxic_indices])}") + nontoxic_data = [corpus[ind] for ind in nontoxic_indices] + print(f"Total samples after filtering: {len(nontoxic_data)}") + print(f"Examples: {nontoxic_data[:3]}") + + from sklearn.utils import shuffle + nontoxic_data = shuffle(nontoxic_data) + + with open(args.out_path, 'w') as f: + for x in nontoxic_data: + f.write(json.dumps(x) + '\n') + + +main() \ No newline at end of file diff --git a/examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py b/examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py new file mode 100644 index 0000000..fd82c2a --- /dev/null +++ b/examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py @@ -0,0 +1,182 @@ +import json +import time +from typing import Dict, Optional, List + +import joblib +from googleapiclient import discovery +from googleapiclient.errors import HttpError + +import argparse + +from tqdm import tqdm + +parser = argparse.ArgumentParser(description='Process some integers.') +parser.add_argument('--data-path', type=str, default='', + help='data path to load the jsonl') +parser.add_argument('--out-path', type=str, default='', + help='data path to load the jsonl') +parser.add_argument('--total', type=int, default=-1, + help='Total number of data') +parser.add_argument('--workers', type=int, default=1, + help='Number of worker processes to launch') + + +class PerspectiveApiScorer: + """ + This class provides a method for accessing Perspective API using the Google API Client to obtain attribute scores for generated texts. + """ + + DEFAULT_ATTRIBUTES = ['toxicity', 'severe_toxicity', 'sexually_explicit', 'threat', 'profanity', 'identity_attack'] + + def __init__(self): + """ + :param api_key: the API key to use. For details, see https://support.perspectiveapi.com/s/docs-get-started + """ + api_key = '' + self._service = discovery.build( + "commentanalyzer", + "v1alpha1", + developerKey=api_key, + discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1", + static_discovery=False, + ) + + def get_scores(self, input_text: str, requested_attributes: Optional[List[str]] = None) -> Dict[str, float]: + """ + Get attribute scores for a given text via Perspective API. + :param input_text: the input text + :param requested_attributes: the attributes for which to compute scores + :return: a mapping from attribute names to scores + """ + requested_attributes = requested_attributes if requested_attributes else PerspectiveApiScorer.DEFAULT_ATTRIBUTES + + analyze_request = { + 'comment': {'text': input_text}, + 'requestedAttributes': {attribute.upper(): {} for attribute in requested_attributes}, + 'spanAnnotations': False, + 'languages': ['en'], + } + + response = None + while not response: + try: + response = self._service.comments().analyze(body=analyze_request).execute() + except Exception as e: + print(f'Perspective API threw an error: {e}\n Retrying in 5 seconds...') + print(input_text) + time.sleep(1) + + return {attribute: response['attributeScores'][attribute.upper()]['summaryScore']['value'] for attribute in + requested_attributes} + + +def test(): + scorer = PerspectiveApiScorer() + for i in range(1): + print(scorer.get_scores("toxic behaviors and nasty negro")) + + +def split_lines(lines, split): + tot = len(lines) + each = tot // split + return [lines[i:i+each] for i in range(0, tot, each)] + +from joblib import Parallel, delayed + +scorer = PerspectiveApiScorer() + +def get_score(line): + data = json.loads(line) + text = data['text'] + text = text.replace("<|endoftext|>", "") + data['text'] = text + if not text.strip(): + data['score'] = None + return json.dumps(data) + + encoded_text = text.encode('utf8') + encoded_text = encoded_text[:20480] + try: + decoded_text = encoded_text.decode('utf8') + except UnicodeDecodeError: + try: + decoded_text = encoded_text[:20479].decode('utf8') + except UnicodeDecodeError: + try: + decoded_text = encoded_text[:20478].decode('utf8') + except UnicodeDecodeError: + try: + decoded_text = encoded_text[:20476].decode('utf8') + except: + print("Error occurred") + data['score'] = None + return json.dumps(data) + data['score'] = scorer.get_scores(decoded_text) + return json.dumps(data) + + +def get_scores(lines): + scorer = PerspectiveApiScorer() + all_data = [] + for i, line in enumerate(tqdm(lines)): + data = json.loads(line) + text = data['text'] + if not text.strip(): + data['score'] = None + all_data.append(json.dumps(data)) + continue + encoded_text = text.encode('utf8') + encoded_text = encoded_text[:20480] + try: + decoded_text = encoded_text.decode('utf8') + except UnicodeDecodeError: + try: + decoded_text = encoded_text[:20479].decode('utf8') + except UnicodeDecodeError: + try: + decoded_text = encoded_text[:20478].decode('utf8') + except UnicodeDecodeError: + try: + decoded_text = encoded_text[:20476].decode('utf8') + except: + print("Error occurred") + data['score'] = None + all_data.append(json.dumps(data)) + continue + data['score'] = scorer.get_scores(decoded_text) + all_data.append(json.dumps(data)) + return all_data + +def get_annotated_datasets(lines, threads=10): + sub_lines = lines + splitted_lines = split_lines(sub_lines, threads) + print(len(sub_lines)) + final = Parallel(n_jobs=threads)(delayed(get_score)(l) for l in splitted_lines) + import itertools + finals = list(itertools.chain.from_iterable(final)) + return finals + + +def main(): + args = parser.parse_args() + + path = args.data_path + out = args.out_path if args.out_path else path + '-annotated.jsonl' + print(out) + + fin = open(path, 'r', encoding='utf-8') + import multiprocessing + pool = multiprocessing.Pool(args.workers) + annotated = pool.imap(get_score, fin, 25) + with open(out, "w") as f: + if args.total > 0: + for x in tqdm(annotated, total=args.total): + f.write(x + '\n') + else: + for x in tqdm(annotated): + f.write(x + '\n') + + +if __name__ == '__main__': + main() + diff --git a/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh b/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh new file mode 100644 index 0000000..4324f80 --- /dev/null +++ b/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh @@ -0,0 +1,14 @@ +VOCAB_FILE=pt2-vocab.json +MERGE_FILE=gpt2-merges.txt + +python3 tools/preprocess_data.py \ + --input $1 \ + --output-prefix $2 \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --tokenizer-type GPT2BPETokenizer \ + --append-eod --workers 20 --chunk-size 25 + + + + diff --git a/examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py new file mode 100644 index 0000000..6a3696d --- /dev/null +++ b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py @@ -0,0 +1,157 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + + +"""Fine-tune GPT""" + +import torch +from functools import partial +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir, os.path.pardir))) +from megatron.training import get_args +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.training import print_rank_0 +from megatron.core import mpu +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.blended_megatron_dataset_config import GPTDatasetConfig +from megatron.core.datasets.gpt_dataset import GPTDataset +from megatron.core.datasets.utils import get_blend_from_list +from megatron.legacy.model import GPTModel +from megatron.core.enums import ModelType +from megatron.training import pretrain +from megatron.training.utils import get_ltor_masks_and_position_ids +from megatron.training.utils import average_losses_across_data_parallel_group + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process + ) + return model + + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + + return tokens, labels, loss_mask, attention_mask, position_ids + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator').start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0('> building train, validation, and test datasets ' + 'for GPT ...') + train_ds, _, test_ds = BlendedMegatronDatasetBuilder( + GPTDataset, + train_val_test_num_samples, + lambda: True, + GPTDatasetConfig( + blend=get_blend_from_list(args.data_path), + split=args.split, + random_seed=args.seed, + sequence_length=args.seq_length, + path_to_cache=args.data_cache_path, + return_document_ids=False + ) + ).build() + print_rank_0("> finished creating finetuning GPT datasets ...") + + _, valid_ds, _ = BlendedMegatronDatasetBuilder( + GPTDataset, + train_val_test_num_samples, + lambda: True, + GPTDatasetConfig( + blend=get_blend_from_list(args.data_path2), + split="98,2,0", + random_seed=1234, + sequence_length=2048, + path_to_cache=args.data_cache_path, + return_document_ids=False + ) + ).build() + print_rank_0("> finished creating pretrained GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +def add_validation_args(parser): + """Text generation arguments.""" + group = parser.add_argument_group(title='validation set') + group.add_argument('--data-path2', nargs='*', default=None, + help='Path to the validation dataset. Accepted format:' + '1) a single data path, 2) multiple datasets in the' + 'form: dataset1-weight dataset1-path dataset2-weight ' + 'dataset2-path ...') + group.add_argument('--eval-ppl', action='store_true', default=False) + group.add_argument('--stored_params', type=dict, default=dict()) + return parser + + +if __name__ == "__main__": + + pretrain(train_valid_test_datasets_provider, model_provider, + ModelType.encoder_or_decoder, + forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + extra_args_provider=add_validation_args,) diff --git a/examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh new file mode 100644 index 0000000..a212fbd --- /dev/null +++ b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh @@ -0,0 +1,63 @@ +#! /bin/bash + +# Change for multinode config +GPUS_PER_NODE=16 +MASTER_ADDR=localhost +MASTER_PORT=$(($RANDOM + 1024)) +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +# input +DATA_PATH=$1 +SHARE_DATA=$PWD # current work dir +FINETUNED_PATH="$SHARE_DATA/$2" +lr=$3 +bs=$4 +iter=$5 +CHECKPOINT_PATH=$6 + +# vocab +VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab +MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file + +# tensorboard +TENSORBOARD_DIR="$SHARE_DATA/tensorboard/$2" +mkdir -p ${TENSORBOARD_DIR} + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +python -m torch.distributed.run $DISTRIBUTED_ARGS \ + examples/detxoify_lm/finetune_gpt.py \ + --num-layers 24 \ + --hidden-size 2048 \ + --num-attention-heads 32 \ + --micro-batch-size 4 \ + --global-batch-size $bs \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --train-iters $iter \ + --save $FINETUNED_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --data-path2 ${DATA_BLEND} \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --split 100,0,0 \ + --distributed-backend nccl \ + --lr-decay-style constant \ + --lr $lr \ + --clip-grad 1.0 \ + --weight-decay 0.1 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --checkpoint-activations \ + --log-interval 1 \ + --save-interval 78 \ + --eval-interval 78 \ + --eval-iters 50 \ + --fp16 \ + --DDP-impl local \ + --finetune --no-load-optim \ + --log-validation-ppl-to-tensorboard \ + --tensorboard-dir ${TENSORBOARD_DIR} diff --git a/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh b/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh new file mode 100644 index 0000000..95bb478 --- /dev/null +++ b/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh @@ -0,0 +1,41 @@ +#!/bin/bash +CHECKPOINT_PATH=$2 # Your model ckpt +VOCAB_FILE=gpt2-vocab.json +MERGE_FILE=gpt2-merges.txt + +GPUS_PER_NODE=1 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=$(($RANDOM + 1024)) +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +NUM_SAMPLES=$(wc -l < $1) +PREFIX=$(basename $2) +SEED=$(($RANDOM)) +OUTPUT=$1_output_"$PREFIX"_seed_"$SEED".jsonl + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \ + --tensor-model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 2048 \ + --load $CHECKPOINT_PATH \ + --num-attention-heads 32 \ + --max-position-embeddings 2048 \ + --tokenizer-type GPT2BPETokenizer \ + --fp16 \ + --micro-batch-size 400 \ + --seq-length 2048 \ + --out-seq-length 20 \ + --temperature 1.0 \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --sample-input-file $1 \ + --sample-output-file $OUTPUT \ + --num-samples $NUM_SAMPLES \ + --max-tokens-to-oom 1200000 \ + --top_p 0.9 \ + --seed $SEED + diff --git a/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py b/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py new file mode 100644 index 0000000..895a45d --- /dev/null +++ b/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py @@ -0,0 +1,260 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + + +"""Sample Generate GPT""" +import json +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir, os.path.pardir))) +import torch +from megatron.training import get_args +from megatron.training import get_tokenizer +from megatron.training import print_rank_0 +from megatron.training.checkpointing import load_checkpoint +from megatron.core import mpu +from megatron.training.initialize import initialize_megatron +from megatron.legacy.model import GPTModel +from megatron.training import get_model +from megatron.inference.text_generation import generate_and_post_process +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.models.gpt import GPTModel +from typing import Union +import megatron.legacy.model +from megatron.core.transformer.spec_utils import import_module +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec, get_gpt_layer_local_spec + +def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: + """Builds the model. + + If you set the use_legacy_models to True, it will return the legacy GPT model and if not the core GPT model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + + + Returns: + Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model + """ + args = get_args() + + print_rank_0('building GPT model ...') + config = core_transformer_config_from_args(args) + + if args.use_legacy_models: + model = megatron.legacy.model.GPTModel( + config, + num_tokentypes=0, + parallel_output=False, + pre_process=pre_process, + post_process=post_process + ) + else: + if args.spec is None: + if args.transformer_impl == 'local': + transformer_layer_spec = get_gpt_layer_local_spec( + num_experts=args.num_experts, + moe_grouped_gemm=args.moe_grouped_gemm + ) + elif args.transformer_impl == 'transformer_engine': + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=args.num_experts, + moe_grouped_gemm=args.moe_grouped_gemm + ) + else: + raise ValueError(f"Invalid transformer_impl {args.transformer_impl}") + elif args.spec[0] == 'local': + transformer_layer_spec = get_gpt_layer_local_spec( + num_experts=args.num_experts, + moe_grouped_gemm=args.moe_grouped_gemm + ) + else: + transformer_layer_spec = import_module(args.spec) + + model = GPTModel( + config=config, + transformer_layer_spec=transformer_layer_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=False, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent + ) + + return model + +def add_text_generate_args(parser): + """Text generation arguments.""" + group = parser.add_argument_group(title='text generation') + + group.add_argument("--temperature", type=float, default=1.0, + help='Sampling temperature.') + group.add_argument("--greedy", action='store_true', default=False, + help='Use greedy sampling.') + group.add_argument("--top_p", type=float, default=0.0, + help='Top p sampling.') + group.add_argument("--top_k", type=int, default=0, + help='Top k sampling.') + group.add_argument("--out-seq-length", type=int, default=1024, + help='Size of the output generated text.') + group.add_argument("--sample-input-file", type=str, default=None, + help='Get input from file instead of interactive mode, ' + 'each line is an input.') + group.add_argument("--sample-output-file", type=str, default=None, + help='Output file got from --sample-input-file') + group.add_argument("--num-samples", type=int, default=0, + help='Number of samples to generate unconditionally, ' + 'defaults to 0 and interactive conditional sampling') + group.add_argument("--genfile", type=str, + help='Output file when generating unconditionally') + return parser + +def generate_samples_unconditional(model): + args = get_args() + + if torch.distributed.get_rank() == 0: + cnt = 0 + num_samples = args.num_samples + from tqdm import tqdm + pbar = tqdm(total=num_samples) + + while True: + if torch.distributed.get_rank() == 0: + sentences = [''] * args.global_batch_size + print("global batch size", args.global_batch_size) + max_len = args.out_seq_length + resp_sentences, resp_sentences_seg, output_logits, \ + tokens = generate_and_post_process(model, prompts=sentences, + tokens_to_generate=max_len, + return_output_log_probs=False, + top_k_sampling=args.top_k, + top_p_sampling=args.top_p, + add_BOS=True, + temperature=1.0) + for prompt, generation, token in zip(sentences, resp_sentences, tokens): + datum = {'text': generation[len(prompt):], 'all_text': generation, 'prompt': prompt, 'id': cnt} + yield datum + cnt += 1 + pbar.update() + if cnt >= num_samples: + break + + if cnt >= num_samples: + pbar.close() + break + else: + generate_and_post_process(model) + + +def generate_samples_conditional(model): + args = get_args() + + if torch.distributed.get_rank() == 0: + num_samples = args.num_samples + cnt = 0 + from tqdm import tqdm + pbar = tqdm(total=num_samples) + + fname = open(args.sample_input_file, "r") + lines = fname.readlines() + all_raw_text = [json.loads(line)['prompt']['text'] for line in lines] + input_count = len(all_raw_text) + input_pos = 0 + + while True: + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + sentences = [] + print("global batch size", args.global_batch_size) + for _ in range(args.global_batch_size): + if input_pos >= input_count: + print(f"input pos: {input_pos}, input count: {input_count}") + raw_text = "EMPTY TEXT" + else: + raw_text = all_raw_text[input_pos] + input_pos += 1 + sentences.append(raw_text) + + max_len = args.out_seq_length + resp_sentences, resp_sentences_seg, output_logits, \ + tokens = generate_and_post_process(model, prompts=sentences, + tokens_to_generate=max_len, + return_output_log_probs=False, + top_k_sampling=args.top_k, + top_p_sampling=args.top_p, + add_BOS=False, + temperature=1.0) + for prompt, generation, token in zip(sentences, resp_sentences, tokens): + datum = {'text': generation[len(prompt):], 'all_text': generation, 'prompt': prompt, 'id': cnt} + yield datum + cnt += 1 + pbar.update() + if cnt >= num_samples: + break + + if cnt >= num_samples: + pbar.close() + break + else: + generate_and_post_process(model) + + +def generate_and_write_samples_unconditional(model): + args = get_args() + assert args.genfile is not None + with open(args.genfile, 'w') as f: + for datum in generate_samples_unconditional(model): + if torch.distributed.get_rank() == 0: + f.write(json.dumps(datum) + '\n') + + +def generate_and_write_samples_conditional(model): + args = get_args() + if args.sample_output_file is None: + sample_output_file = args.sample_input_file + ".out" + print('`sample-output-file` not specified, setting ' + 'it to {}'.format(sample_output_file)) + else: + sample_output_file = args.sample_output_file + with open(sample_output_file, 'w') as f: + for datum in generate_samples_conditional(model): + if torch.distributed.get_rank() == 0: + f.write(json.dumps(datum) + '\n') + + +def main(): + """Main program.""" + + initialize_megatron(extra_args_provider=add_text_generate_args, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer', + 'no_load_rng': True, + 'no_load_optim': True, + 'seq_length': 2048}) + + # Set up model and load checkpoint + model = get_model(model_provider, wrap_with_ddp=False) + + args = get_args() + + if args.load is not None: + _ = load_checkpoint(model, None, None) + model = model[0] + + # Generate samples. + if args.sample_input_file != None: + print(f"{args.sample_input_file}") + generate_and_write_samples_conditional(model) + else: + generate_and_write_samples_unconditional(model) + + +if __name__ == "__main__": + + main() diff --git a/examples/academic_paper_scripts/detxoify_lm/perspective_api.py b/examples/academic_paper_scripts/detxoify_lm/perspective_api.py new file mode 100644 index 0000000..cad3130 --- /dev/null +++ b/examples/academic_paper_scripts/detxoify_lm/perspective_api.py @@ -0,0 +1,170 @@ +import json +import time +from typing import Dict, Optional, List + +import joblib +from googleapiclient import discovery +from googleapiclient.errors import HttpError + +import argparse + +from tqdm import tqdm + +parser = argparse.ArgumentParser(description='Process some integers.') +parser.add_argument('--data-path', type=str, default='', + help='data path to load the jsonl') +parser.add_argument('--out-path', type=str, default='', + help='data path to load the jsonl') +parser.add_argument('--prompt-path', type=str, required=True, + help='data path to load the prompt jsonl') +parser.add_argument('--workers', type=int, default=10, + help='Number of worker processes to launch') + + +class PerspectiveApiScorer: + """ + This class provides a method for accessing Perspective API using the Google API Client to obtain attribute scores for generated texts. + """ + + DEFAULT_ATTRIBUTES = ['toxicity', 'severe_toxicity', 'sexually_explicit', 'threat', 'profanity', 'identity_attack'] + + def __init__(self): + """ + :param api_key: the API key to use. For details, see https://support.perspectiveapi.com/s/docs-get-started + """ + api_key = '' + self._service = discovery.build( + "commentanalyzer", + "v1alpha1", + developerKey=api_key, + discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1", + static_discovery=False, + ) + + def get_scores(self, input_text: str, requested_attributes: Optional[List[str]] = None) -> Dict[str, float]: + """ + Get attribute scores for a given text via Perspective API. + :param input_text: the input text + :param requested_attributes: the attributes for which to compute scores + :return: a mapping from attribute names to scores + """ + requested_attributes = requested_attributes if requested_attributes else PerspectiveApiScorer.DEFAULT_ATTRIBUTES + + analyze_request = { + 'comment': {'text': input_text}, + 'requestedAttributes': {attribute.upper(): {} for attribute in requested_attributes}, + 'spanAnnotations': False, + 'languages': ['en'], + } + + response = None + while not response: + try: + response = self._service.comments().analyze(body=analyze_request).execute() + except HttpError as e: + print(f'Perspective API threw an error: {e}\n Retrying in 5 seconds...') + print(input_text) + time.sleep(1) + + return {attribute: response['attributeScores'][attribute.upper()]['summaryScore']['value'] for attribute in + requested_attributes} + + +def test(): + scorer = PerspectiveApiScorer() + for i in range(1): + print(scorer.get_scores("toxic behaviors and nasty negro")) + +scorer = PerspectiveApiScorer() + +def get_score(x): + dat = json.loads(x) + text = dat['text'] + if not text.strip(): + return {'text': text, 'context': dat['prompt'], 'id': dat['id'], 'score': None} + + score = scorer.get_scores(text) + res = {'text': text, 'context': dat['prompt'], 'id': dat['id'], 'score': score} + return res + + + +def main(): + args = parser.parse_args() + + path = args.data_path + scores = [] + out = args.out_path if args.out_path else path + '.out.pkl' + print(out) + + import os + + if os.path.exists(out): + scores = joblib.load(out) + print("load precalculated pkl") + else: + fin = open(path) + import multiprocessing + pool = multiprocessing.Pool(args.workers) + delayed_scores = pool.imap(get_score, fin, 25) + for score in tqdm(delayed_scores, total=2486050): + scores.append(score) + joblib.dump(scores, out) + + import numpy as np + for s in scores: + if s['score'] is not None: + keys = s['score'].keys() + break + + print("Overall average:") + for k in keys: + print(f"average {k}:", np.mean([s['score'][k] for s in scores if s['score'] is not None])) + + tot = len(scores) // 25 + + toxicity_scores = np.array([s['score']['toxicity'] if s['score'] else -1 for s in scores]) + + with open(args.prompt_path) as f: + prompts = f.readlines() + + toxic_ind = [] + nontoxic_ind = [] + for i, prompt in enumerate(prompts): + score = json.loads(prompt)['prompt']['toxicity'] + if score and score > 0.5: + toxic_ind.append(i) + elif score and score <= 0.5: + nontoxic_ind.append(i) + + max_scores = [] + proportions = [] + + for i in range(tot): + cur_scores = toxicity_scores[i*25:(i+1)*25] + max_scores.append(max(cur_scores)) + proportions.append((cur_scores > 0.5).sum()) + # print(proportions) + + max_scores = np.array(max_scores) + proportions = np.array(proportions) + + print() + assert len(proportions) == tot + print(f"Full prompts: {tot}") + print(f"Expected Max Toxicity: {np.mean(max_scores)} +- {np.std(max_scores)}") + print(f"Toxicity Probability: {(np.array(proportions) >= 1).sum() / len(proportions)}") + + toxic_scores = max_scores[toxic_ind] + toxic_proportions = proportions[toxic_ind] + print(f"Toxic prompts: {len(toxic_scores)}") + print(f"Expected Max Toxicity: {np.mean(toxic_scores)} +- {np.std(toxic_scores)}") + print(f"Toxicity Probability: {(np.array(toxic_proportions) >= 1).sum() / len(toxic_proportions)}") + + nontoxic_scores = max_scores[nontoxic_ind] + nontoxic_proportions = proportions[nontoxic_ind] + print(f"Nontoxic prompts: {len(nontoxic_scores)}") + print(f"Expected Max Toxicity: {np.mean(nontoxic_scores)} +- {np.std(nontoxic_scores)}") + print(f"Toxicity Probability: {(np.array(nontoxic_proportions) >= 1).sum() / len(nontoxic_proportions)}") + +main() diff --git a/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh b/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh new file mode 100644 index 0000000..2a67240 --- /dev/null +++ b/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh @@ -0,0 +1,42 @@ +#!/bin/bash +CHECKPOINT_PATH=$2 # Your model ckpt +SHARE_DATA=$PWD # current work dir +VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab +MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file + +GPUS_PER_NODE=1 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=$(($RANDOM + 1024)) +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +SEED=$3 +SUFFIX=$(basename $CHECKPOINT_PATH) +save_dir=$SHARE_DATA/selfgeneration/unconditional_generation_$SUFFIX/ +mkdir -p $save_dir +echo $save_dir/$SEED.out + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \ + --tensor-model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 2048 \ + --load $CHECKPOINT_PATH \ + --num-attention-heads 32 \ + --max-position-embeddings 2048 \ + --tokenizer-type GPT2BPETokenizer \ + --fp16 \ + --micro-batch-size 150 \ + --seq-length 2048 \ + --out-seq-length 1000 \ + --temperature 1.0 \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --num-samples $1 \ + --top_p 0.9 \ + --max-tokens-to-oom 1200000 \ + --genfile $save_dir/$SEED.out \ + --seed $SEED + diff --git a/examples/academic_paper_scripts/msdp/README.md b/examples/academic_paper_scripts/msdp/README.md new file mode 100644 index 0000000..8ff9509 --- /dev/null +++ b/examples/academic_paper_scripts/msdp/README.md @@ -0,0 +1,5 @@ + +# Multi-Stage Prompting for Knowledgeable Dialogue Generation + +This directory contains all the scripts of multi-stage prompting for knowledgeable dialogue generation that includes data preparation, and knowledge and response generations. More details are available on [`knowledgeable task directory`](../../tasks/msdp). + diff --git a/examples/academic_paper_scripts/msdp/data_processing.sh b/examples/academic_paper_scripts/msdp/data_processing.sh new file mode 100644 index 0000000..37a6512 --- /dev/null +++ b/examples/academic_paper_scripts/msdp/data_processing.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Data preparation for our framework: preprocessing the WoW and WoI datasets +# The datasets can be downloaded through the following links: +# WoW: https://parl.ai/projects/wizard_of_wikipedia/ +# WoI: https://parl.ai/projects/sea/ + +DIR=`pwd` +# Before running the preprocessing, please download +# the wizard of wikipedia and wizard datasets +WOW_DATA_FOLDER= +WOI_DATA_FOLDER= + +# We provide examples for processing the raw data from Wizard of Wikipedia +# Processing the train dataset (train.json) +python ${DIR}/tasks/msdp/preprocessing.py \ + --func process_wow_dataset \ + --raw_file ${WOW_DATA_FOLDER}/train.json \ + --processed_file ${WOW_DATA_FOLDER}/train_processed.txt + +# Processing test seen dataset (test_random_split.json) +python ${DIR}/tasks/msdp/preprocessing.py \ + --func process_wow_dataset \ + --raw_file ${WOW_DATA_FOLDER}/test_random_split.json \ + --processed_file ${WOW_DATA_FOLDER}/testseen_processed.txt \ + --knwl_ref_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt \ + --resp_ref_file ${WOW_DATA_FOLDER}/output_testseen_response_reference.txt + +# processing test unseen dataset (test_topic_split.json) +python ${DIR}/tasks/msdp/preprocessing.py \ + --func process_wow_dataset \ + --raw_file ${WOW_DATA_FOLDER}/test_topic_split.json \ + --processed_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \ + --knwl_ref_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt \ + --resp_ref_file ${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt + + +# We provide the following script to process the raw data from Wizard of Internet +# Processing the test dataset (test.jsonl) +python ${DIR}/tasks/msdp/preprocessing.py \ + --func process_woi_dataset \ + --raw_file ${WOI_DATA_FOLDER}/test.jsonl \ + --processed_file ${WOI_DATA_FOLDER}/test_processed.txt \ + --knwl_ref_file ${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt \ + --resp_ref_file ${WOI_DATA_FOLDER}/output_test_response_reference.txt + + +# Get the knowledge generation prompts for the each test dataset in WoW and WoI +MODEL_FILE= +# WoW test seen +python ${DIR}/tasks/msdp/preprocessing.py \ + --func get_knwl_gen_prompts \ + --test_file ${WOW_DATA_FOLDER}/testseen_processed.txt \ + --train_file ${WOW_DATA_FOLDER}/train_processed.txt \ + --model_file ${MODEL_FILE} \ + --processed_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json \ + --data_type wow_seen + +# WoW test unseen +python ${DIR}/tasks/msdp/preprocessing.py \ + --func get_knwl_gen_prompts \ + --test_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \ + --train_file ${WOW_DATA_FOLDER}/train_processed.txt \ + --model_file ${MODEL_FILE} \ + --processed_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json \ + --data_type wow_unseen + +# WoI +python ${DIR}/tasks/msdp/preprocessing.py \ + --func get_knwl_gen_prompts \ + --test_file ${WOI_DATA_FOLDER}/test_processed.txt \ + --train_file ${WOW_DATA_FOLDER}/train_processed.txt \ + --model_file ${MODEL_FILE} \ + --processed_file ${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json \ + --data_type woi + + +# Get the response generation prompts (can be applied for all the test datasets) +python ${DIR}/tasks/msdp/preprocessing.py \ + --func get_resp_gen_prompts \ + --train_file ${WOW_DATA_FOLDER}/train_processed.txt \ + --processed_file ${WOW_DATA_FOLDER}/output_response_prompts.txt + diff --git a/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh b/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh new file mode 100644 index 0000000..8fc2fff --- /dev/null +++ b/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +######################### +# Evaluate the F1 scores. +######################### + +WORLD_SIZE=1 +DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +MODEL_GEN_PATH= \ + (e.g., /testseen_knowledge_generations.txt) +GROUND_TRUTH_PATH= \ + (e.g., /testseen_knowledge_reference.txt) + +python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --micro-batch-size 4 \ + --task MSDP-EVAL-F1 \ + --guess-file ${MODEL_GEN_PATH} \ + --answer-file ${GROUND_TRUTH_PATH} + + +############################################ +# Evaluate BLEU, METEOR, and ROUGE-L scores. +############################################ + +# We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to +# evaluate the BLEU, METEOR, and ROUGE-L scores. + +# To evaluate on these metrics, please setup the environments based on +# the nlg-eval github, and run the corresponding evaluation commands. + +nlg-eval \ + --hypothesis= \ + --references= diff --git a/examples/academic_paper_scripts/msdp/eval_resp_generation.sh b/examples/academic_paper_scripts/msdp/eval_resp_generation.sh new file mode 100644 index 0000000..3ce87e0 --- /dev/null +++ b/examples/academic_paper_scripts/msdp/eval_resp_generation.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +######################### +# Evaluate the F1 scores. +######################### + +WORLD_SIZE=1 +DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +MODEL_GEN_PATH= \ + (e.g., /testseen_response_generations.txt) +GROUND_TRUTH_PATH= \ + (e.g., /testseen_response_reference.txt) + +python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --micro-batch-size 4 \ + --task MSDP-EVAL-F1 \ + --guess-file ${MODEL_GEN_PATH} \ + --answer-file ${GROUND_TRUTH_PATH} + + +########################## +# Evaluate the KF1 scores. +########################## + +MODEL_GEN_PATH= \ + (e.g., /testseen_response_generations.txt) +GROUND_TRUTH_PATH= \ + (e.g., /testseen_knowledge_reference.txt) + +python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --micro-batch-size 4 \ + --task MSDP-EVAL-F1 \ + --guess-file ${MODEL_GEN_PATH} \ + --answer-file ${GROUND_TRUTH_PATH} + + +############################################ +# Evaluate BLEU, METEOR, and ROUGE-L scores. +############################################ + +# We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to +# evaluate the BLEU, METEOR, and ROUGE-L scores. + +# To evaluate on these metrics, please setup the environments based on +# the nlg-eval github, and run the corresponding evaluation commands. + +nlg-eval \ + --hypothesis= \ + --references= diff --git a/examples/academic_paper_scripts/msdp/prep_resp_gen.sh b/examples/academic_paper_scripts/msdp/prep_resp_gen.sh new file mode 100644 index 0000000..5f20272 --- /dev/null +++ b/examples/academic_paper_scripts/msdp/prep_resp_gen.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Preparing the input file for the response generation (second-stage prompting) + +DIR=`pwd` + +TEST_FILE= \ + (e.g., /testseen_processed.txt) +KNOWLEDGE_FILE= \ + (e.g., /testseen_knowledge_generations.txt) +PROCESSED_FILE= \ + (e.g., /testseen_processed_with_generated_knowledge.txt) + +python ${DIR}/tasks/msdp/preprocessing.py \ + --func prepare_input \ + --test_file ${TEST_FILE} \ + --knwl_gen_file ${KNOWLEDGE_FILE} \ + --processed_file ${PROCESSED_FILE} diff --git a/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh b/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh new file mode 100644 index 0000000..12e0cc5 --- /dev/null +++ b/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge +# The input contains prompts and current dialogue context, the output is the relevant knowledge +# The size of the pretrained language model is 357M + +WORLD_SIZE=8 + +DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +CHECKPOINT_PATH= (e.g., /357m) +VOCAB_PATH= (e.g., /gpt2-vocab.json) +MERGE_PATH= (e.g., /gpt2-merges.txt) +INPUT_PATH= \ + (e.g., /testseen_processed.txt) +PROMPT_PATH= \ + (e.g., /testseen_knowledge_prompts.json) +OUTPUT_PATH= \ + (e.g., /testseen_knowledge_generations.txt) + +python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --micro-batch-size 1 \ + --vocab-file ${VOCAB_PATH} \ + --merge-file ${MERGE_PATH} \ + --load ${CHECKPOINT_PATH} \ + --fp16 \ + --DDP-impl torch \ + --tokenizer-type GPT2BPETokenizer \ + --sample-input-file ${INPUT_PATH} \ + --sample-output-file ${OUTPUT_PATH} \ + --prompt-file ${PROMPT_PATH} \ + --prompt-type knowledge \ + --num-prompt-examples 10 \ + --task MSDP-PROMPT + +# NOTE: If you use api for the model generation, please use +# the "--api-prompt" flag (setting this value as True). diff --git a/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh b/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh new file mode 100644 index 0000000..b836d7f --- /dev/null +++ b/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Stage-2: Prompt a pretrained language model to generate the corresponding response +# The input contains prompts, current dialogue context, and generated knowledge in Stage-1 +# The output is the corresponding response. +# The size of the pretrained language model is 357M + +WORLD_SIZE=8 + +DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +CHECKPOINT_PATH= (e.g., /357m) +VOCAB_PATH= (e.g., /gpt2-vocab.json) +MERGE_PATH= (e.g., /gpt2-merges.txt) +INPUT_PATH= (e.g., /testseen_processed.txt) +PROMPT_PATH= \ + (e.g., /response_prompts.txt) +OUTPUT_PATH= \ + (e.g., /output_testseen_response_generations.txt) + +python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --micro-batch-size 1 \ + --vocab-file ${VOCAB_PATH} \ + --merge-file ${MERGE_PATH} \ + --load ${CHECKPOINT_PATH} \ + --fp16 \ + --DDP-impl torch \ + --tokenizer-type GPT2BPETokenizer \ + --sample-input-file ${INPUT_PATH} \ + --sample-output-file ${OUTPUT_PATH} \ + --prompt-file ${PROMPT_PATH} \ + --prompt-type response \ + --num-prompt-examples 20 \ + --task MSDP-PROMPT + +# NOTE: If you use api for the model generation, please use +# the "--api-prompt" flag (setting this value as True). diff --git a/examples/academic_paper_scripts/sc21/CONFIG.sh b/examples/academic_paper_scripts/sc21/CONFIG.sh new file mode 100644 index 0000000..f17ccd7 --- /dev/null +++ b/examples/academic_paper_scripts/sc21/CONFIG.sh @@ -0,0 +1,57 @@ +#!/bin/bash + + +# SLURM options. +export SLURM_PARTITION= +export SLURM_ACCOUNT= + + +# Source code. +export MEGATRON_CODE_DIR= + + +# This variable is used to mount the relevant part of the filesystem +# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the +# launch directory already get mounted; this variable should be used to +# mount the directories that contain the data and tokenizer files. +export DOCKER_MOUNT_DIR= + + +# Data and tokenizer files. +MEGATRON_DATA= +BPE_VOCAB_FILE= +BPE_MERGE_FILE= + + +# Megatron input parameters. +# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters +# that are not listed here. +export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --num-layers ${NLS} \ + --hidden-size ${HS} \ + --num-attention-heads ${NAH} \ + --DDP-impl ${DDP} \ + --data-path ${MEGATRON_DATA} \ + --vocab-file ${BPE_VOCAB_FILE} \ + --merge-file ${BPE_MERGE_FILE} \ + --log-interval 5 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --train-iters 500 \ + --lr-decay-iters 320 \ + --lr 0.0001 \ + --min-lr 0.00001 \ + --lr-decay-style cosine \ + --lr-warmup-fraction 0.01 \ + --split 969,30,1 \ + --eval-iters 100 \ + --eval-interval 1000 \ + --clip-grad 1.0 \ + --fp16 \ + --loss-scale 8192 " + + diff --git a/examples/academic_paper_scripts/sc21/README.md b/examples/academic_paper_scripts/sc21/README.md new file mode 100644 index 0000000..ec922d1 --- /dev/null +++ b/examples/academic_paper_scripts/sc21/README.md @@ -0,0 +1,50 @@ +# Reproducing Figures in SC21 Paper + + +This directory contains some of the scripts that were used to produce the +results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is +to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These +scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the +[pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other +schedulers as well. + + +## Git commit + +To replicate these results use Megatron-LM commit: 6985e58938d40ad91ac07b0fddcfad8132e1447e + + +## Setup + +All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please +update the unspecified values (in angle brackets `<...>`) before launching any +scripts. + + + +## Scripts + +Below is a list of scripts that can be used to reproduce various figures in our +[paper](https://arxiv.org/pdf/2104.04473.pdf): + +* [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput +for GPT models ranging from 1 billion to 1 trillion parameters. +* [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling +performance of pipeline parallelism. +* [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of +the interleaved schedule on a 175B GPT model. +* [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of +different degrees of pipeline and tensor model parallelism on a model with +162.2 billion parameters. +* [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of +different degrees of data and pipeline model parallelism on a model with +5.9 billion parameters. +* [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of +different degrees of data and tensor model parallelism on a model with +5.9 billion parameters. +* [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of +microbatch size. +* [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of +activation recomputation. +* [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of +the scatter-gather communication optimization. diff --git a/examples/academic_paper_scripts/sc21/SBATCH.sh b/examples/academic_paper_scripts/sc21/SBATCH.sh new file mode 100644 index 0000000..95431b9 --- /dev/null +++ b/examples/academic_paper_scripts/sc21/SBATCH.sh @@ -0,0 +1,13 @@ +#!/bin/bash + + +sbatch -p ${SLURM_PARTITION} \ + -A ${SLURM_ACCOUNT} \ + --job-name=${JOB_NAME} \ + --nodes=${NNODES} \ + --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh + +exit 0 + + + diff --git a/examples/academic_paper_scripts/sc21/SRUN.sh b/examples/academic_paper_scripts/sc21/SRUN.sh new file mode 100644 index 0000000..52a9aff --- /dev/null +++ b/examples/academic_paper_scripts/sc21/SRUN.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 + + +THIS_DIR=`pwd` +DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` +mkdir -p ${THIS_DIR}/logs + + +CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" + + +srun -l \ + --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ + --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ + --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}" + diff --git a/examples/academic_paper_scripts/sc21/run_figure_11.sh b/examples/academic_paper_scripts/sc21/run_figure_11.sh new file mode 100644 index 0000000..2ec7d9e --- /dev/null +++ b/examples/academic_paper_scripts/sc21/run_figure_11.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# ================================ +# Choose the case to run. +# ================================ + +# Pipeline-parallel size options = [1, 2, 4, 8]. +PP=1 + +# Batch size (global batch size) options = [8, 128]. +GBS=8 + + + + + +# Set pipeline-parallel size options. +NLS=$((3*PP)) +NNODES=${PP} + + +# Other params. +TP=8 +MBS=1 +HS=20480 +NAH=128 +DDP=local +MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " + + +# Name of the job. +export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} + + +# Import the configs. +. `pwd`/CONFIG.sh + + +# Submit the job. +. `pwd`/SBATCH.sh + + +exit 0 + + + diff --git a/examples/academic_paper_scripts/sc21/run_figure_12.sh b/examples/academic_paper_scripts/sc21/run_figure_12.sh new file mode 100644 index 0000000..11e5508 --- /dev/null +++ b/examples/academic_paper_scripts/sc21/run_figure_12.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# ================================ +# Choose the case to run. +# ================================ + +# Interleaved schedule options = [YES, NO]. +INTERLEAVED=YES + +# Batch size (global batch size) options = [12, 24, 36, ..., 60]. +GBS=12 + + + + + +# Set interleaved schedule options. +if [ ${INTERLEAVED} == "YES" ]; then + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " +elif [ ${INTERLEAVED} == "NO" ]; then + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " +else + echo "Invalid configuration" + exit 1 +fi + + +# Other params. +TP=8 +PP=12 +MBS=1 +NLS=96 +HS=12288 +NAH=96 +DDP=local +NNODES=12 + + +# Name of the job. +export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} + + +# Import the configs. +. `pwd`/CONFIG.sh + + +# Submit the job. +. `pwd`/SBATCH.sh + + +exit 0 + + + diff --git a/examples/academic_paper_scripts/sc21/run_figure_13.sh b/examples/academic_paper_scripts/sc21/run_figure_13.sh new file mode 100644 index 0000000..7ba560e --- /dev/null +++ b/examples/academic_paper_scripts/sc21/run_figure_13.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# ================================ +# Choose the case to run. +# ================================ + +# Pipeline-parallel size options = [2, 4, 8, 16, 32]. +PP=2 + +# Batch size (global batch size) options = [32, 128]. +GBS=32 + + + + + +# Set pipeline-parallel and tensor-parallel size options. +TP=$((64/PP)) + + +# Other params. +MBS=1 +NLS=32 +HS=20480 +NAH=128 +DDP=local +MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " +NNODES=8 + + +# Name of the job. +export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} + + +# Import the configs. +. `pwd`/CONFIG.sh + + +# Submit the job. +. `pwd`/SBATCH.sh + + +exit 0 + + + diff --git a/examples/academic_paper_scripts/sc21/run_figure_14.sh b/examples/academic_paper_scripts/sc21/run_figure_14.sh new file mode 100644 index 0000000..4b83879 --- /dev/null +++ b/examples/academic_paper_scripts/sc21/run_figure_14.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# ================================ +# Choose the case to run. +# ================================ + +# Pipeline-parallel size options = [2, 4, 8, 16, 32]. +PP=2 + +# Batch size (global batch size) options = [32, 512]. +GBS=32 + + + + + +# Set pipeline-parallel and data-parallel size options. +DP=$((64/PP)) + + +# Other params. +TP=1 +MBS=1 +NLS=32 +HS=3840 +NAH=32 +DDP=local +MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " +NNODES=8 + + +# Name of the job. +export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} + + +# Import the configs. +. `pwd`/CONFIG.sh + + +# Submit the job. +. `pwd`/SBATCH.sh + + +exit 0 + + + diff --git a/examples/academic_paper_scripts/sc21/run_figure_15.sh b/examples/academic_paper_scripts/sc21/run_figure_15.sh new file mode 100644 index 0000000..547ad1d --- /dev/null +++ b/examples/academic_paper_scripts/sc21/run_figure_15.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# ================================ +# Choose the case to run. +# ================================ + +# Tensor-parallel size options = [2, 4, 8, 16, 32]. +TP=2 + +# Batch size (global batch size) options = [32, 128, 512]. +GBS=32 + + + + + +# Set tensor-parallel and data-parallel size options. +DP=$((64/TP)) + + +# Other params. +PP=1 +MBS=1 +NLS=32 +HS=3840 +NAH=32 +DDP=local +MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " +NNODES=8 + + +# Name of the job. +export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} + + +# Import the configs. +. `pwd`/CONFIG.sh + + +# Submit the job. +. `pwd`/SBATCH.sh + + +exit 0 + + + diff --git a/examples/academic_paper_scripts/sc21/run_figure_16.sh b/examples/academic_paper_scripts/sc21/run_figure_16.sh new file mode 100644 index 0000000..8c353a3 --- /dev/null +++ b/examples/academic_paper_scripts/sc21/run_figure_16.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# ================================ +# Choose the case to run. +# ================================ + +# Microbatch size options = [1, 2, 4, 8]. +MBS=1 + +# Batch size (global batch size) options = [128, 512]. +GBS=128 + + + + + +# Other params. +TP=8 +PP=8 +NLS=32 +HS=15360 +NAH=128 +DDP=local +MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " +NNODES=8 + + +# Name of the job. +export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} + + +# Import the configs. +. `pwd`/CONFIG.sh + + +# Submit the job. +. `pwd`/SBATCH.sh + + +exit 0 + + + diff --git a/examples/academic_paper_scripts/sc21/run_figure_17.sh b/examples/academic_paper_scripts/sc21/run_figure_17.sh new file mode 100644 index 0000000..d6899b3 --- /dev/null +++ b/examples/academic_paper_scripts/sc21/run_figure_17.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# ================================ +# Choose the case to run. +# ================================ + +# Activation recomputation options = [YES, NO]. +ACTIVATION_RECOMPUTATION=YES + +# Batch size (global batch size) options = [1, 2, 4, ..., 256]. +GBS=1 + + + + + +# Set activation recomputation. +if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " +elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then + MEGATRON_EXTRA_PARAMS="" +else + echo "Invalid configuration" + exit 1 +fi + + +# Other params. +TP=8 +PP=16 +MBS=1 +NLS=80 +HS=12288 +NAH=96 +DDP=local +NNODES=16 + + +# Name of the job. +export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} + + +# Import the configs. +. `pwd`/CONFIG.sh + + +# Submit the job. +. `pwd`/SBATCH.sh + + +exit 0 + + + diff --git a/examples/academic_paper_scripts/sc21/run_figure_18.sh b/examples/academic_paper_scripts/sc21/run_figure_18.sh new file mode 100644 index 0000000..88924fb --- /dev/null +++ b/examples/academic_paper_scripts/sc21/run_figure_18.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# ================================ +# Choose the case to run. +# ================================ + +# Scatter-gather communication optimization options = [YES, NO]. +SCATTER_GATHER=YES + +# Batch size (global batch size) options = [12, 24, 36, ..., 60]. +GBS=12 + + + + + +# Set scatter-gather communication optimization options. +if [ ${SCATTER_GATHER} == "YES" ]; then + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " +elif [ ${SCATTER_GATHER} == "NO" ]; then + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline " +else + echo "Invalid configuration" + exit 1 +fi + + +# Other params. +TP=8 +PP=12 +MBS=1 +NLS=96 +HS=12288 +NAH=96 +DDP=local +NNODES=12 + + +# Name of the job. +export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS} + + +# Import the configs. +. `pwd`/CONFIG.sh + + +# Submit the job. +. `pwd`/SBATCH.sh + + +exit 0 + + + diff --git a/examples/academic_paper_scripts/sc21/run_table_1.sh b/examples/academic_paper_scripts/sc21/run_table_1.sh new file mode 100644 index 0000000..1b15fb0 --- /dev/null +++ b/examples/academic_paper_scripts/sc21/run_table_1.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# ================================ +# Choose the case to run. +# ================================ +# model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T] +MODEL_SIZE=1.7B + + + + + + +if [ ${MODEL_SIZE} == "1.7B" ]; then + TP=1 + PP=1 + MBS=16 + GBS=512 + NLS=24 + HS=2304 + NAH=24 + DDP=torch + NNODES=4 + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " +elif [ ${MODEL_SIZE} == "3.6B" ]; then + TP=2 + PP=1 + MBS=16 + GBS=512 + NLS=30 + HS=3072 + NAH=32 + DDP=torch + NNODES=8 + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " +elif [ ${MODEL_SIZE} == "7.5B" ]; then + TP=4 + PP=1 + MBS=16 + GBS=512 + NLS=36 + HS=4096 + NAH=32 + DDP=torch + NNODES=16 + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " +elif [ ${MODEL_SIZE} == "18B" ]; then + TP=8 + PP=1 + MBS=8 + GBS=1024 + NLS=40 + HS=6144 + NAH=48 + DDP=torch + NNODES=32 + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " +elif [ ${MODEL_SIZE} == "39B" ]; then + TP=8 + PP=2 + MBS=4 + GBS=1536 + NLS=48 + HS=8192 + NAH=64 + DDP=local + NNODES=64 + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " +elif [ ${MODEL_SIZE} == "76B" ]; then + TP=8 + PP=4 + MBS=2 + GBS=1792 + NLS=60 + HS=10240 + NAH=80 + DDP=local + NNODES=128 + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5" +elif [ ${MODEL_SIZE} == "145B" ]; then + TP=8 + PP=8 + MBS=2 + GBS=2304 + NLS=80 + HS=12288 + NAH=96 + DDP=local + NNODES=192 + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 " +elif [ ${MODEL_SIZE} == "310B" ]; then + TP=8 + PP=16 + MBS=1 + GBS=2160 + NLS=96 + HS=16384 + NAH=128 + DDP=local + NNODES=240 + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 " +elif [ ${MODEL_SIZE} == "530B" ]; then + TP=8 + PP=35 + MBS=1 + GBS=2520 + NLS=105 + HS=20480 + NAH=128 + DDP=local + NNODES=315 + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 " +elif [ ${MODEL_SIZE} == "1T" ]; then + TP=8 + PP=64 + MBS=1 + GBS=3072 + NLS=128 + HS=25600 + NAH=160 + DDP=local + NNODES=384 + MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " +else + echo "Invalid configuration" + exit 1 +fi + + +# Name of the job +export JOB_NAME=results_table_1_model_size_${MODEL_SIZE} + + +# Import the configs. +. `pwd`/CONFIG.sh + + +# Submit the job. +. `pwd`/SBATCH.sh + + +exit 0 + + + diff --git a/examples/bert/README.md b/examples/bert/README.md new file mode 100644 index 0000000..6c1fe95 --- /dev/null +++ b/examples/bert/README.md @@ -0,0 +1,53 @@ +# BERT MODEL + +## Table of contents +- [1. Training Setup](#1-training-setup) +- [2. Configurations](#2-configurations) + +## 1. Training setup + + +To run the model using a docker container run it as follows +``` +PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3 +CHECKPOINT_PATH="" # +TENSORBOARD_LOGS_PATH=""# +VOCAB_FILE="" #//bert-vocab.txt +DATA_PATH="" #_text_document + +docker run \ + --gpus=all \ + --ipc=host \ + --workdir /workspace/megatron-lm \ + -v /path/to/data:/path/to/data \ + -v /path/to/megatron-lm:/workspace/megatron-lm \ + megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \ + bash examples/bert/train_bert_340m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH " + +``` +NOTE: Depending on the environment you are running it the above command might like slightly different. + + +## 2. Configurations + +The example in this folder shows you how to run 340m large model. There are other configs you could run as well + +### 4B +``` + --num-layers 48 \ + --hidden-size 2560 \ + --num-attention-heads 32 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + +``` + +### 20B +``` + --num-layers 48 \ + --hidden-size 6144 \ + --num-attention-heads 96 \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 4 \ + +``` \ No newline at end of file diff --git a/examples/bert/train_bert_340m_distributed.sh b/examples/bert/train_bert_340m_distributed.sh new file mode 100644 index 0000000..649c579 --- /dev/null +++ b/examples/bert/train_bert_340m_distributed.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +# Runs the "340M" parameter model (Bert - Large) + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NUM_NODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) + +CHECKPOINT_PATH=$1 # +TENSORBOARD_LOGS_PATH=$2 # +VOCAB_FILE=$3 #/bert-vocab.json +DATA_PATH=$4 #_text_document + +DISTRIBUTED_ARGS=( + --nproc_per_node $GPUS_PER_NODE + --nnodes $NUM_NODES + --master_addr $MASTER_ADDR + --master_port $MASTER_PORT +) + +BERT_MODEL_ARGS=( + --num-layers 24 + --hidden-size 1024 + --num-attention-heads 16 + --seq-length 512 + --max-position-embeddings 512 +) + +TRAINING_ARGS=( + --micro-batch-size 4 + --global-batch-size 32 + --train-iters 1000000 + --weight-decay 1e-2 + --clip-grad 1.0 + --fp16 + --lr 0.0001 + --lr-decay-iters 990000 + --lr-decay-style linear + --min-lr 1.0e-5 + --weight-decay 1e-2 + --lr-warmup-fraction .01 + --clip-grad 1.0 +) + +MODEL_PARALLEL_ARGS=( + --tensor-model-parallel-size 8 + --pipeline-model-parallel-size 16 +) + +DATA_ARGS=( + --data-path $DATA_PATH + --vocab-file $VOCAB_FILE + --split 949,50,1 +) + +EVAL_AND_LOGGING_ARGS=( + --log-interval 100 + --save-interval 10000 + --eval-interval 1000 + --save $CHECKPOINT_PATH + --load $CHECKPOINT_PATH + --eval-iters 10 + --tensorboard-dir $TENSORBOARD_LOGS_PATH +) + +torchrun ${DISTRIBUTED_ARGS[@]} pretrain_bert.py \ + ${BERT_MODEL_ARGS[@]} \ + ${TRAINING_ARGS[@]} \ + ${MODEL_PARALLEL_ARGS[@]} \ + ${DATA_ARGS[@]} \ + ${EVAL_AND_LOGGING_ARGS[@]} diff --git a/examples/gpt3/README.md b/examples/gpt3/README.md new file mode 100644 index 0000000..8d6f267 --- /dev/null +++ b/examples/gpt3/README.md @@ -0,0 +1,57 @@ +# GPT3 MODEL + +## Table of contents +- [1. Training Setup](#1-training-setup) +- [2. Configurations](#2-configurations) +- [3. Training Results](#3-training-results) + +## 1. Training setup + + +To run the model using a docker container run it as follows +``` +PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3 +CHECKPOINT_PATH="" # +TENSORBOARD_LOGS_PATH=""# +VOCAB_FILE="" #/gpt2-vocab.json +MERGE_FILE="" #/gpt2-merges.txt +DATA_PATH="" #_text_document + +docker run \ + --gpus=all \ + --ipc=host \ + --workdir /workspace/megatron-lm \ + -v /path/to/data:/path/to/data \ + -v /path/to/megatron-lm:/workspace/megatron-lm \ + megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \ + bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH " + +``` +NOTE: Depending on the environment you are running it the above command might like slightly different. + + +## 2. Configurations + +The example in this folder shows you how to run 175B model. There are other configs you could run as well + +### 345M +``` + --num-layers 12 \ + --hidden-size 512 \ + --num-attention-heads 8 \ + --seq-length 1024 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + +``` + +### 857M +``` + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --seq-length 2048 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + +``` diff --git a/examples/gpt3/gpt_config.yaml b/examples/gpt3/gpt_config.yaml new file mode 100644 index 0000000..116d5d7 --- /dev/null +++ b/examples/gpt3/gpt_config.yaml @@ -0,0 +1,302 @@ +# WARNING: Yaml configs is currently an experimental feature +language_model: + # model architecture + num_layers: 24 + hidden_size: 1024 + num_attention_heads: 16 + num_query_groups: null + + ffn_hidden_size: null + kv_channels: null + hidden_dropout: 0.0 + attention_dropout: 0.0 + fp32_residual_connection: False + + apply_residual_connection_post_layernorm: False + layernorm_epsilon: 1.e-5 + layernorm_zero_centered_gamma: True + add_bias_linear: False + bias_activation_fusion: False + add_qkv_bias: False + gated_linear_unit: False + activation_func: swiglu + num_moe_experts: null + rotary_interleaved: False + window_size: null + + # initialization + init_method: null + init_method_std: 0.02 + output_layer_init_method: null + + # mixed-precision + apply_query_key_layer_scaling: False + attention_softmax_in_fp32: False + + # fusion + bias_swiglu_fusion: True + masked_softmax_fusion: True + persist_layer_norm: False + memory_efficient_layer_norm: False + bias_dropout_fusion: True + apply_rope_fusion: True + + # activation recomputation + recompute_granularity: null + recompute_method: null + recompute_num_layers: null + distribute_saved_activations: null + + # fp8 related + fp8: null + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: "most_recent" + fp8_wgrad: True + + # miscellaneous + clone_scatter_output_in_embedding: True + + normalization: "LayerNorm" # alt value supported by TE: "RMSNorm" + + # MoE related + moe_router_load_balancing_type: "aux_loss" + moe_router_topk: 2 + moe_grouped_gemm: False + moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss. + moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss + moe_input_jitter_eps: null + moe_token_dropping: False + +model_parallel: + # Model parallelism + tensor_model_parallel_size: 1 + context_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + sequence_parallel: True + expert_model_parallel_size: 1 + + # Initialization + perform_initialization: True + use_cpu_initialization: null + + # Training + fp16: False + bf16: True + params_dtype: null # Set from above arguments for core + timers: null + + # Optimizations + gradient_accumulation_fusion: True + async_tensor_model_parallel_allreduce: True + tp_comm_overlap: False + + # Debug Options + tp_comm_split_ag: True + tp_comm_atomic_ag: True + tp_comm_split_rs: True + tp_comm_atomic_rs: True + tp_comm_bulk_wgrad: True + tp_comm_bulk_dgrad: True + + # Parallelism + finalize_model_grads_func: null + + # Pipeline Parallel + pipeline_dtype: null + grad_scale_func: null + enable_autocast: False + autocast_dtype: null + variable_seq_lengths: False + num_microbatches_with_partial_activation_checkpoints: null + overlap_p2p_comm: False + batch_p2p_comm: True + batch_p2p_sync: True + use_ring_exchange_p2p: False + deallocate_pipeline_outputs: False + no_sync_func: null + grad_sync_func: null + param_sync_func: null + pipeline_model_parallel_split_rank: null + + # CPU Offloading + cpu_offloading: False + cpu_offloading_num_layers: 0 + _cpu_offloading_context: null + cpu_offloading_weights: False + cpu_offloading_activations: True + + # Timing + barrier_with_L1_time: True + +# training: +use_legacy_models: False +spec: null +micro_batch_size: 2 +global_batch_size: 128 +rampup_batch_size: [32, 32, 65324160] +check_for_nan_in_loss_and_grad: True +num_layers_per_virtual_pipeline_stage: null + +encoder_num_layers: null +decoder_num_layers: null +rotary_seq_len_interpolation_factor: null +add_position_embedding: False +make_vocab_size_divisible_by: 128 +group_query_attention: False + + +exit_signal_handler: False +exit_duration_in_mins: null +exit_interval: null + +untie_embeddings_and_output_weights: True +position_embedding_type: rope +rotary_percent: 0.5 +openai_gelu: False +squared_relu: False +swiglu: True +onnx_safe: null +bert_binary_head: True +max_position_embeddings: 4096 + +transformer_impl: local +use_flash_attn: False +seed: 1234 +data_parallel_random_init: False + +# Optimizer +optimizer: adam +lr: 2.5e-4 +lr_decay_style: cosine +lr_decay_iters: null +lr_decay_samples: 255126953 +lr_warmup_fraction: null +lr_warmup_iters: 0 +lr_warmup_samples: 81381 +lr_warmup_init: 0.0 +min_lr: 2.5e-5 +weight_decay: 0.1 +start_weight_decay: null +end_weight_decay: null +weight_decay_incr_style: constant +clip_grad: 1.0 +adam_beta1: 0.9 +adam_beta2: 0.95 +adam_eps: 1.e-08 +sgd_momentum: 0.9 +override_opt_param_scheduler: False +use_checkpoint_opt_param_scheduler: False + +# checkpointing arguments +save: null +save_interval: 20000 +no_save_optim: null +no_save_rng: null +load: null +no_load_optim: null +no_load_rng: null +finetune: False +use_checkpoint_args: False +exit_on_missing_checkpoint: False + +# loss arguments +loss_scale: null +initial_loss_scale: 4294967296 +min_loss_scale: 1.0 +loss_scale_window: 1000 +hysteresis: 2 +accumulate_allreduce_grads_in_fp32: False +fp16_lm_cross_entropy: False + +# distributed arguments +distributed_backend: nccl +distributed_timeout_minutes: 10 +overlap_grad_reduce: False +delay_grad_reduce: True +overlap_param_gather: False +delay_param_gather: False +scatter_gather_tensors_in_pipeline: True +local_rank: null +lazy_mpu_init: null +empty_unused_memory_level: 0 +standalone_embedding_stage: False +use_distributed_optimizer: False +nccl_communicator_config_path: null + +train_iters: null +eval_iters: 32 +eval_interval: 2000 +skip_train: False + +adlr_autoresume: False +adlr_autoresume_interval: 1000 + +# garbage collection +manual_gc: False +manual_gc_interval: 0 +manual_gc_eval: True + +tp_comm_overlap_cfg: null + +#data +data_path: null +split: '99,1,0' +train_data_path: null +valid_data_path: null +test_data_path: null +data_cache_path: null +mock_data: False +vocab_size: null +vocab_file: null +merge_file: null +vocab_extra_ids: 0 +seq_length: 4096 +encoder_seq_length: null +decoder_seq_length: null +retriever_seq_length: 256 +sample_rate: 1.0 +mask_prob: 0.15 +short_seq_prob: 0.1 +num_workers: 2 +tokenizer_type: GPTSentencePieceTokenizer +tokenizer_model: null +reset_position_ids: False +reset_attention_mask: False +eod_mask_loss: False +train_samples: 268554688 +dataloader_type: null + +#profile: +profile: False +profile_ranks: [0] +profile_step_end: 12 +profile_step_start: 10 + +#logging: +log_params_norm: True +log_num_zeros_in_grad: True +log_throughput: False +log_progress: False +timing_log_level: 0 +timing_log_option: minmax +tensorboard_log_interval: 1 +tensorboard_queue_size: 1000 +log_timers_to_tensorboard: False +log_batch_size_to_tensorboard: False +log_learning_rate_to_tensorboard: True +log_learning_rate_to_tensorboard: True +log_validation_ppl_to_tensorboard: False +log_memory_to_tensorboard: False +log_world_size_to_tensorboard: False +log_loss_scale_to_tensorboard: True +wandb_project: '' +wandb_exp_name: '' +wandb_save_dir: '' +enable_one_logger: True +one_logger_project: megatron-lm +one_logger_run_name: null +log_interval: 100 +tensorboard_dir: null diff --git a/examples/gpt3/train_gpt3_175b_distributed.sh b/examples/gpt3/train_gpt3_175b_distributed.sh new file mode 100644 index 0000000..b164ae2 --- /dev/null +++ b/examples/gpt3/train_gpt3_175b_distributed.sh @@ -0,0 +1,81 @@ +#!/bin/bash + +# Runs the "175B" parameter model + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NUM_NODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) + +CHECKPOINT_PATH=$1 # +TENSORBOARD_LOGS_PATH=$2 # +VOCAB_FILE=$3 #/gpt2-vocab.json +MERGE_FILE=$4 #/gpt2-merges.txt +DATA_PATH=$5 #_text_document + +DISTRIBUTED_ARGS=( + --nproc_per_node $GPUS_PER_NODE + --nnodes $NUM_NODES + --master_addr $MASTER_ADDR + --master_port $MASTER_PORT +) + +GPT_MODEL_ARGS=( + --num-layers 96 + --hidden-size 12288 + --num-attention-heads 96 + --seq-length 2048 + --max-position-embeddings 2048 +) + +TRAINING_ARGS=( + --micro-batch-size 1 + --global-batch-size 1536 + --rampup-batch-size 16 16 5859375 + --train-iters 500000 + --weight-decay 0.1 + --adam-beta1 0.9 + --adam-beta2 0.95 + --init-method-std 0.006 + --clip-grad 1.0 + --fp16 + --lr 6.0e-5 + --lr-decay-style cosine + --min-lr 6.0e-6 + --lr-warmup-fraction .001 + --lr-decay-iters 430000 +) + +MODEL_PARALLEL_ARGS=( + --tensor-model-parallel-size 8 + --pipeline-model-parallel-size 16 +) + +DATA_ARGS=( + --data-path $DATA_PATH + --vocab-file $VOCAB_FILE + --merge-file $MERGE_FILE + --split 949,50,1 +) + +EVAL_AND_LOGGING_ARGS=( + --log-interval 100 + --save-interval 10000 + --eval-interval 1000 + --save $CHECKPOINT_PATH + --load $CHECKPOINT_PATH + --eval-iters 10 + --tensorboard-dir $TENSORBOARD_LOGS_PATH +) + +torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \ + ${GPT_MODEL_ARGS[@]} \ + ${TRAINING_ARGS[@]} \ + ${MODEL_PARALLEL_ARGS[@]} \ + ${DATA_ARGS[@]} \ + ${EVAL_AND_LOGGING_ARGS[@]} diff --git a/examples/inference/README.md b/examples/inference/README.md new file mode 100644 index 0000000..bd8e738 --- /dev/null +++ b/examples/inference/README.md @@ -0,0 +1,274 @@ +### Megatron Core Inference Documentation +This guide will walk you through how you can use megatron core for inference on your models. + +### Contents +- [Megatron Core Inference Documentation](#megatron-core-inference-documentation) +- [Contents](#contents) + - [1. Quick Start](#1-quick-start) + - [1.1 Understanding The Code](#11-understanding-the-code) + - [1.2 Running The Code](#12-running-the-code) + - [2. Flow of Control In MCore Backend](#2-flow-of-control-in-mcore-backend) + - [3. Customizing The Inference Pipeline](#3-customizing-the-inference-pipeline) + - [3.1. Create Your Own Inference Backend](#31-create-your-own-inference-backend) + - [3.2. Create Your Own Text Generation Controller](#32-create-your-own-text-generation-controller) + - [3.3. Support Other Models](#33-support-other-models) + - [3.3. Modify Inference Parameters](#33-modify-inference-parameters) + - [4. Future work](#4-future-work) + +
+ +#### 1. Quick Start +This will walk you through the flow of running batch inference on a GPT model trained using megatron core. The file can be found at [simple_gpt_batch_inference.py](./gpt/simple_gpt_batch_inference.py) + +
+ +##### 1.1 Understanding The Code +***STEP 1 - We initialize model parallel and other default arguments*** +We can default micro batch size to be 1, since for TP models it is not used, and for PP models it is calculated during runtime. +```python + initialize_megatron( + args_defaults={'no_load_rng': True, 'no_load_optim': True, 'micro_batch_size': 1} + ) +``` + +***STEP 2 - We load the model using the model_provider_function*** +NOTE: The model provider function in the script supports MCore and Legacy models. + +```python + model = get_model(model_provider, wrap_with_ddp=False) + load_checkpoint(model, None, None) + model = model[0] +``` + +***STEP 3 - Choose an engine*** +One of the important elements of the generate function is an inference engine. In this example we will be choosing the [megatron core engine](../../megatron/core/inference/engine/mcore_engine.py) with a [simple text generation controller](../../megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py), the default engine. Other engines that will be supported in the future are TRTLLMEngine. +```python + inference_wrapped_model = GPTInferenceWrapper(model, args) + text_generation_controller = SimpleTextGenerationController( + inference_wrapped_model=inference_wrapped_model, + tokenizer=tokenizer + ) + inference_backend = MCoreEngine( + text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size + ) +``` + +***STEP 4 - Run the generate function and display results*** +We use default values for the [common inference params](../../megatron/core/inference/common_inference_params.py). Customize this if you want to change top_p, top_k, number of tokens to generate etc. +*Note that the result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py)* +```python + results: List[InferenceRequest] = inference_engine.generate( + prompts=args.prompts, common_inference_params=common_inference_params + ) + + if torch.distributed.get_rank() == 0: + for idx, result in enumerate(results): + print(f' ------------- RESULT FOR PROMPT {idx} --------------- ') + result = { + 'id': result.request_id, + 'input_prompt': result.prompt, + 'generated_text': result.generated_text, + 'generated_tokens' : result.generated_tokens + } + print(result) +``` + +
+ +##### 1.2 Running The Code +An example run script is shown below. Change the tokenizer paths, inference params, and other settings for your model. + +For a quick recap on inference params refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910) + +``` +#In a slurm cluster (You could also use docker) +ACCOUNT= +MLM_PATH=/path/to/megatron-lm +GPT_CKPT=/path/to/gpt/ckpt +VOCAB_MERGE_FILE_PATH=/path/to/vocab/and/merge/file +CONTAINER_IMAGE=nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11 + +srun --account $ACCOUNT \ +--job-name=$ACCOUNT:inference \ +--partition=batch \ +--time=01:00:00 \ +--container-image $CONTAINER_IMAGE \ +--container-mounts $MLM_PATH:/workspace/megatron-lm/,$GPT_CKPT:/workspace/mcore_gpt_ckpt,$VOCAB_MERGE_FILE_PATH:/workspace/tokenizer \ +--no-container-mount-home \ +--pty /bin/bash \ + +# Inside the container run the following. + +cd megatron-lm/ +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +TOKENIZER_ARGS=( + --vocab-file /workspace/tokenizer/gpt2-vocab.json + --merge-file /workspace/tokenizer/gpt2-merges.txt + --tokenizer-type GPT2BPETokenizer +) + +MODEL_ARGS=( + --use-checkpoint-args + --use-mcore-models + --load /workspace/mcore_gpt_ckpt +) + +INFERENCE_SPECIFIC_ARGS=( + --attention-dropout 0.0 + --hidden-dropout 0.0 + --num-tokens-to-generate 20 + --max-batch-size 4 +) + +torchrun --nproc-per-node=4 examples/inference/gpt/simple_gpt_batch_inference.py \ + ${TOKENIZER_ARGS[@]} \ + ${MODEL_ARGS[@]} \ + ${INFERENCE_SPECIFIC_ARGS[@]} \ + --prompts "prompt one " "sample prompt two" "sample prompt 3" + +NOTE: Other parameters which can be customized for inference are :- +--temperature (Sampling temperature) +--top_k (top_k sampling) +--top_p (top_p sampling) +--num-tokens-to-generate (Number of tokens to generate for each prompt) +--inference-batch-times-seqlen-threshold (During inference, if batch-size times sequence-length is smaller than this threshold then we will not use pipelining, otherwise we will.') +--use-dist-ckpt (If you are using dist checkpoint format for the model) +--use-legacy-models (If you are using legacy gpt model instead of mcore gpt model) + +``` + + +
+ + +#### 2. Flow of Control In MCore Backend +The following is what happens in the [simple_gpt_batch_inference.py](./gpt/simple_gpt_batch_inference.py). +* We call [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) **generate()** function with all our input prompts. +* The scheduler in the engine will add these prompts to the [active requests] pool (../../megatron/core/inference/inference_request.py) until we hit the max batch size, and then it will put the rest in the waiting requests pool. +* The engine will then run until all requests (waiting + active) are completed + * The active requests are passed into **generate_all_output_tokens_static_batch()** of the text generation controller . + * This function uses the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) **prep_model_for_inference()** , and then runs an auto regressive loop + * In the auto regressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to get the required input, passes it into the **run_one_forward_step()** method, which calls the appropriate (PP, TP) model `.forward()` methods to get the output logits + * The output logits are synchronized across all pipeline parallel ranks + * The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the common inference parameters. + * The sampled tokens are then appended to the input prompt tokens for the next iteration + * The **update_generation_status()** method of the text generation controller checks which prompts have finished generating or hit a stop condition + * After the inference loop, the result is detokenized and stored as an attribute of the InferenceRequest. These requests are marked as completed. + * The **update_requests_pool()** method of the scheduler moves completed requests into the completed request pool and waiting requests into the active request pool + +
+ +#### 3. Customizing The Inference Pipeline +The following guide will walk you through how you can customize different parts of the inference pipeline. There are three levels at which you can customize the pipeline. +* **Inference engine** - Highest level of customization. Currently we support the MCore Engine. Change this to add a new engine. +* **Text generation controller** - Extend this to customize tokenization, detokenization, or implement a new sampling strategy. +* **Inference Wrapped Model** - Change this to support a new model. +* **Modify Inference Parameters** - Change this to update top_p, top_k, number of tokens to be generated, temperature, or other sampling parameters. + +
+ +##### 3.1. Create Your Own Inference Backend +This is the highest level of customization. The [abstract_engine.py](./../../megatron/core/inference/engine/abstract_engine.py) file has a generate method that can be extended to support a new backend. + +```python +class AbstractEngine(ABC): + @staticmethod + def generate(self) -> dict: + """The abstract backend's generate function. + + To define your own backend, make sure you implement this and return the outputs as a dictionary . + + +
+ +##### 3.2. Create Your Own Text Generation Controller +In case you want to use the megatron core backend, but would like to overwrite the tokenization, text generation or detokenization extend the [simple_text_generation_controller.py](../../megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py). The class has the following methods +``` python +class SimpleTextGenerationController: + + def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility to tokenize the input prompts""" + + def sample_from_logits( + self, + last_token_logits: torch.Tensor, + common_inference_params: CommonInferenceParams, + vocab_size: int, + ) -> torch.Tensor: + """Samples the logits to generate outputs + + Given the logits of the last token, this function samples it according to the parameters defined in common_inference_params and returns the samples + """ + + def update_generation_status( + self, + updated_prompts_tokens: torch.Tensor, + generation_started: torch.Tensor, + current_context_end_position: int, + is_generation_done_tensor: torch.Tensor, + generated_sequence_lengths: torch.Tensor, + ) -> torch.Tensor: + """Function to check which prompts have reached an end condition + + We check which prompts have reached an end condition and set the corresponding flags of the is_generation_done_tensor to True . The generated sequence lengths increases as we keep generating, until that prompts hits an eod condition. The generation started status tensor helps us determine which prompts have started generating + """ + + def generate_all_output_tokens_static_batch( + self, active_requests: OrderedDict[int, InferenceRequest], + ) -> OrderedDict[int, InferenceRequest]: + """Utility to generate all the output tokens and probabilities for the prompts . + + This utility generates the output tokens for a static batch. It runs the forward steps till all prompts complete generation, updates the status of these requests to completed, adds the generated result and returns these requests + """ + + def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str: + """Detokenize the output generations""" +``` + +
+ +##### 3.3. Support Other Models +In order to support other models please extend the [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) file. The abstract wrapper already supports the following : +* Forward method which automatically calls the appropriate forward method (PP or TP etc) depending on model parallel settings +* Initalizes the model and puts it in eval mode +* Obtains the input parameters (batch size, max seq length) and has an instance of the input + +The main methods to change for your model might be the following: +```python +class AbstractModelInferenceWrapper: + def prep_model_for_inference(self, prompts_tokens: torch.Tensor): + """A utility function for preparing model for inference + + The function gets called once before the auto regressive inference loop. It puts the model in eval mode , and gets some model and inference data parameters. Extend this to build position ids ,attention mask etc, so that required slices can be extracted during the forward pass + """ + + @abc.abstractclassmethod + def get_batch_for_context_window(self) -> List: + """Returns the input data for inference + + This function gets called iteratively in the inference loop . It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. +``` + +Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of extending this for GPTModel. + +
+ +##### 3.3. Modify Inference Parameters +We use [common inference params](../../megatron/core/inference/common_inference_params.py) for text generation. Customize this if you want to change top_p, top_k, number of tokens to generate etc. If you want to add other attributes that you would use in the inference loop, you can do that as shown below + +``` +from megatron.core.inference.common_inference_params import CommonInferenceParams + +c = CommonInferenceParams(temperature=0.5) +c.add_attributes({'min_length':4, 'eod_id':153}) +``` + +
+ +#### 4. Future work +The following are planned for the future releases . +* Dynamic batching +* Paged Attention +* TRTLLM Engine support +* Support for Multimodal model inference \ No newline at end of file diff --git a/examples/inference/gpt/simple_gpt_batch_inference.py b/examples/inference/gpt/simple_gpt_batch_inference.py new file mode 100644 index 0000000..5c7ae5b --- /dev/null +++ b/examples/inference/gpt/simple_gpt_batch_inference.py @@ -0,0 +1,115 @@ +import os +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig +from pretrain_gpt import model_provider +import torch +import sys +from argparse import Namespace +from megatron.core.inference.engines.abstract_engine import AbstractEngine +from megatron.core.inference.engines.mcore_engine import MCoreEngine +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper +from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import SimpleTextGenerationController +from megatron.core.transformer.module import MegatronModule +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir, os.path.pardir))) + +from megatron.training import get_args +from megatron.training import get_tokenizer +from megatron.training.checkpointing import load_checkpoint +from megatron.core import mpu +from megatron.training.initialize import initialize_megatron +from megatron.training import get_model +from typing import List + +def add_text_generate_args(parser): + """Text generation arguments.""" + group = parser.add_argument_group(title='text generation') + + group.add_argument("--temperature", type=float, default=1.0, + help='Sampling temperature.') + group.add_argument("--top_k", type=int, default=1, + help='Top k sampling.') + group.add_argument("--top_p", type=float, default=0.0, + help='Top p sampling.') + group.add_argument("--return-log-probs", action='store_true', default=False, + help='Return the log probabilities of the final output tokens') + group.add_argument("--num-tokens-to-generate", type=int, default=30, + help='Number of tokens to generate for each prompt') + group.add_argument("--prompts", metavar='N', type=str, nargs='+', + help='Input prompts with each prompt within quotes and seperated by space') + group.add_argument("--max-batch-size", type=int, default=1, + help='Max number of prompts to process at once') + return parser + + +def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine: + """Utility to get the relevant backend for running inference + + This function will automatically chose the TRTLLMBackend when possible, and if not revert to Mcore backend if the user does not specify any backends. TRT LLM Backend is not implmented yet. + + Args: + args (Namespace): The user arguments parsed from command line + model (MegatronModule): The megatron model . + + Returns: + AbstractBackend: The chosen backend + """ + tokenizer = get_tokenizer() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=args.hidden_size, + inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, + fp32_residual_connection=args.fp32_residual_connection, + params_dtype=args.params_dtype, + padded_vocab_size=args.padded_vocab_size + ) + + inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config) + text_generation_controller = SimpleTextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer) + return MCoreEngine(text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size) + +def main(): + """Main program.""" + + # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file) + # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument) + initialize_megatron(extra_args_provider=add_text_generate_args, + args_defaults={'no_load_rng': True, + 'no_load_optim': True, + 'micro_batch_size': 1, + 'exit_on_missing_checkpoint': True}) + + # Set up model and load checkpoint + model = get_model(model_provider, wrap_with_ddp=False) + load_checkpoint(model, None, None) + model = model[0] + + args = get_args() + + inference_engine = get_inference_engine(args, model) + + common_inference_params = CommonInferenceParams( + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + return_log_probs=args.return_log_probs, + num_tokens_to_generate=args.num_tokens_to_generate) + + results: List[InferenceRequest] = inference_engine.generate( + prompts=args.prompts, common_inference_params=common_inference_params + ) + + if torch.distributed.get_rank() == 0: + for idx, result in enumerate(results): + print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ') + result = { + 'id': result.request_id, + 'input_prompt': result.prompt, + 'generated_text': result.generated_text, + 'generated_tokens' : result.generated_tokens + } + print(result) + +if __name__ == "__main__": + main() diff --git a/examples/inference/quantization/README.md b/examples/inference/quantization/README.md new file mode 100644 index 0000000..ea7ad8e --- /dev/null +++ b/examples/inference/quantization/README.md @@ -0,0 +1,128 @@ +# Megatron Model Optimization and Deployment + +## Installation +We recommend that users follow TensorRT-LLM's official installation guide to build it from source +and proceed with a containerized environment (`docker.io/tensorrt_llm/release:latest`): + +```sh +git clone https://github.com/NVIDIA/TensorRT-LLM.git +cd TensorRT-LLM +git checkout v0.10.0 +make -C docker release_build +``` + +> **TROUBLE SHOOTING:** rather than copying each folder separately in `docker/Dockerfile.multi`, +> you may need to copy the entire dir as `COPY ./ /src/tensorrt_llm` since a `git submodule` is +> called later which requires `.git` to continue. + +Once the container is built, install `nvidia-modelopt` and additional dependencies for sharded checkpoint support: +```sh +pip install "nvidia-modelopt[all]~=0.13.0" --extra-index-url https://pypi.nvidia.com +pip install zarr tensorstore==0.1.45 +``` +TensorRT-LLM quantization functionalities are currently packaged in `nvidia-modelopt`. +You can find more documentation about `nvidia-modelopt` [here](https://nvidia.github.io/TensorRT-Model-Optimizer/). + +## Support Matrix + +The following matrix shows the current support for the PTQ + TensorRT-LLM export flow. + +| model | fp16 | int8_sq | fp8 | int4_awq | +|-----------------------------|------|---------| ----| -------- | +| nextllm-2b | x | x | x | | +| nemotron3-8b | x | | x | | +| nemotron3-15b | x | | x | | +| llama2-text-7b | x | x | x | TP2 | +| llama2-chat-70b | x | x | x | TP4 | + +Our PTQ + TensorRT-LLM flow has native support on MCore `GPTModel` with a mixed layer spec (native ParallelLinear +and Transformer-Engine Norm (`TENorm`). Note that this is not the default mcore gpt spec. You can still load the +following checkpoint formats with some remedy: + +| GPTModel | sharded | remedy arguments | +|-----------------------------------|---------|---------------------------------------------| +| megatron.legacy.model | | `--export-legacy-megatron` | +| TE-Fused (default mcore gpt spec) | | `--export-te-mcore-model` | +| TE-Fused (default mcore gpt spec) | x | | + +> **TROUBLE SHOOTING:** If you are trying to load an unpacked `.nemo` sharded checkpoint, then typically you will +> need to adding `additional_sharded_prefix="model."` to `modelopt_load_checkpoint()` since NeMo has an additional +> `model.` wrapper on top of the `GPTModel`. + +> **NOTE:** flag `--export-legacy-megatron` may not work on all legacy checkpoint versions. + +## Examples + +> **NOTE:** we only provide a simple text generation script to test the generated TensorRT-LLM engines. For +> a production-level API server or enterprise support, see [NeMo](https://github.com/NVIDIA/NeMo) and TensorRT-LLM's +> backend for [NVIDIA Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server). + +### nemotron3-8B FP8 Quantization and TensorRT-LLM Deployment +First download the nemotron checkpoint from https://huggingface.co/nvidia/nemotron-3-8b-base-4k, extract the +sharded checkpoint from the `.nemo` tarbal and fix the tokenizer file name. + +> **NOTE:** The following cloning method uses `ssh`, and assume you have registered the `ssh-key` in Hugging Face. +> If you are want to clone with `https`, then `git clone https://huggingface.co/nvidia/nemotron-3-8b-base-4k` with an access token. + +```sh +git lfs install +git clone git@hf.co:nvidia/nemotron-3-8b-base-4k +cd nemotron-3-8b-base-4k +tar -xvf Nemotron-3-8B-Base-4k.nemo +mv 586f3f51a9cf43bc9369bd53fa08868c_a934dc7c3e1e46a6838bb63379916563_3feba89c944047c19d5a1d0c07a85c32_mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model tokenizer.model +cd .. +``` + +Now launch the PTQ + TensorRT-LLM export script, +```sh +bash examples/inference/quantization/ptq_trtllm_nemotron3_8b ./nemotron-3-8b-base-4k None +``` +By default, `cnn_dailymail` is used for calibration. The `GPTModel` will have quantizers for simulating the +quantization effect. The checkpoint will be saved optionally (with quantizers as additional states) and can +be restored for further evaluation. TensorRT-LLM checkpoint and engine are exported to `/tmp/trtllm_ckpt` and +built in `/tmp/trtllm_engine` by default. + +The script expects `${CHECKPOINT_DIR}` (`./nemotron-3-8b-base-4k`) to have the following structure: +``` +├── model_weights +│ ├── common.pt +│ ... +│ +├── model_config.yaml +├── mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model +``` + +> **NOTE:** The script is using `TP=8`. Change `$TP` in the script if your checkpoint has a different tensor +> model parallelism. + +> **KNOWN ISSUES:** The `mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model` in the checkpoint is for +> Megatron-LM's `GPTSentencePiece` tokenizer. +> For TensorRT-LLM, we are trying to load this tokenizer as a Hugging Face `T5Tokenizer` by changing +> some special tokens, `encode`, and `batch_decode`. As a result, the tokenizer behavior in TensorRT-LLM engine may +> not match exactly. + +### llama2-text-7b INT8 SmoothQuant and TensorRT-LLM Deployment +> **NOTE:** Due to the LICENSE issue, we do not provide a MCore checkpoint to download. Users can follow +> the instruction in `docs/llama2.md` to convert the checkpoint to megatron legacy `GPTModel` format and +> use `--export-legacy-megatron` flag which will remap the checkpoint to the MCore `GPTModel` spec +> that we support. + +```sh +bash examples/inference/quantization/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} +``` + +The script expect `${CHECKPOINT_DIR}` to have the following structure: +``` +├── hf +│ ├── tokenizer.config +│ ├── tokenizer.model +│ ... +│ +├── iter_0000001 +│ ├── mp_rank_00 +│ ... +│ +├── latest_checkpointed_iteration.txt +``` +In short, other than the converted llama megatron checkpoint, also put the Hugging Face checkpoint inside as +the source of the tokenizer. diff --git a/examples/inference/quantization/ptq_trtllm_llama_7b.sh b/examples/inference/quantization/ptq_trtllm_llama_7b.sh new file mode 100644 index 0000000..8c4777f --- /dev/null +++ b/examples/inference/quantization/ptq_trtllm_llama_7b.sh @@ -0,0 +1,82 @@ +#!/bin/bash +set -e + +DEFAULT_NAME="/checkpoints/llama2-text-7b_v0.2.0" +NAME="${1:-$DEFAULT_NAME}" + +DEFAULT_QUANT_CFG="int8_sq" +QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}" + +# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER. +TP="8" +INFERENCE_TP=${TP} +DECODER_TYPE="llama" +CHECKPOINT_LOAD_DIR="${NAME}" +TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/hf/tokenizer.model" + +# LLaMA2 text 7b has ffn_hidden_size 11008. int4_awq requires a block_size of 128 as a result the TP can at most be 2 +if [ "$QUANT_CFG" = "int4_awq" ]; then + INFERENCE_TP="2" +fi + +additional_options=" \ + --export-quant-cfg ${QUANT_CFG} \ + --export-legacy-megatron \ + --export-te-mcore-model \ + --calib-batch-size 8 \ + --decoder ${DECODER_TYPE} \ + --export-dir /tmp/trtllm_ckpt \ + --inference-tensor-parallel ${INFERENCE_TP} " + +trtllm_options=" \ + --tensorrt-llm-checkpoint-dir /tmp/trtllm_ckpt \ + --engine-dir /tmp/trtllm_engine \ + --tokenizer ${CHECKPOINT_LOAD_DIR}/hf \ + --max-input-len 2048 \ + --max-output-len 512 \ + --max-batch-size 8 " + +# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!! +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +options=" \ + --disable-bias-linear \ + --swiglu \ + --no-rope-fusion \ + --untie-embeddings-and-output-weights \ + --use-rotary-position-embeddings \ + --normalization RMSNorm \ + --rotary-percent 1.0 \ + --no-position-embedding \ + --no-masked-softmax-fusion \ + --no-bias-gelu-fusion \ + --no-bias-dropout-fusion \ + --no-async-tensor-model-parallel-allreduce \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --ffn-hidden-size 11008 \ + --num-attention-heads 32 \ + --seq-length 4096 \ + --max-position-embeddings 4096 \ + --micro-batch-size 1 \ + --make-vocab-size-divisible-by 1 \ + --tokenizer-type Llama2Tokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --save-interval 1000000 \ + --use-dist-ckpt \ + --load ${CHECKPOINT_LOAD_DIR} + --fp16" + +# Precompile CUDA extentions +python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)" + +# Acquire launch configuration where variable launch_config will be set +launch_config="--nproc_per_node=${TP}" + +# Launch multi-process with torchrun +torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} + +# This script is using mpi4py which will fork multiple processes. +python examples/inference/quantization/trtllm_text_generation.py ${trtllm_options} diff --git a/examples/inference/quantization/ptq_trtllm_nemotron3_8b.sh b/examples/inference/quantization/ptq_trtllm_nemotron3_8b.sh new file mode 100644 index 0000000..d5f7fa3 --- /dev/null +++ b/examples/inference/quantization/ptq_trtllm_nemotron3_8b.sh @@ -0,0 +1,77 @@ +#!/bin/bash +set -e + +DEFAULT_NAME="/checkpoints/nemotron3-8b_v0.3.0" +NAME="${1:-$DEFAULT_NAME}" + +DEFAULT_QUANT_CFG="fp8" +QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}" + +# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER. +TP="8" +INFERENCE_TP=${TP} +DECODER_TYPE="gptnext" +CHECKPOINT_LOAD_DIR="${NAME}" +TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/tokenizer.model" + +if [ "$QUANT_CFG" = "int4_awq" ]; then + INFERENCE_TP="1" +fi + +additional_options=" \ + --export-quant-cfg ${QUANT_CFG} \ + --export-legacy-megatron \ + --export-te-mcore-model \ + --calib-batch-size 8 \ + --decoder ${DECODER_TYPE} \ + --export-dir /tmp/trtllm_ckpt \ + --inference-tensor-parallel ${INFERENCE_TP} " + +trtllm_options=" \ + --tensorrt-llm-checkpoint-dir /tmp/trtllm_ckpt \ + --engine-dir /tmp/trtllm_engine \ + --tokenizer ${TOKENIZER_MODEL} \ + --max-input-len 2048 \ + --max-output-len 512 \ + --max-batch-size 8 " + +# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!! +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +options=" \ + --apply-layernorm-1p \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --no-rope-fusion \ + --no-position-embedding \ + --use-rotary-position-embeddings \ + --rotary-percent 0.5 \ + --squared-relu \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --seq-length 4096 \ + --max-position-embeddings 4096 \ + --micro-batch-size 1 \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --save-interval 1000000 \ + --load ${CHECKPOINT_LOAD_DIR} \ + --fp16 \ + --use-dist-ckpt" + +# Precompile CUDA extentions +python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)" + +# Acquire launch configuration where variable launch_config will be set +launch_config="--nproc_per_node=${TP}" + +# Launch multi-process with torchrun +torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} + +# This script is using mpi4py which will fork multiple processes. +python examples/inference/quantization/trtllm_text_generation.py ${trtllm_options} diff --git a/examples/inference/quantization/text_generation_ptq.py b/examples/inference/quantization/text_generation_ptq.py new file mode 100644 index 0000000..13b327b --- /dev/null +++ b/examples/inference/quantization/text_generation_ptq.py @@ -0,0 +1,223 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Sample Generate GPT.""" +import functools +import os +import sys +from pathlib import Path + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) + +import modelopt.torch.quantization as mtq +import torch +from datasets import load_dataset +from modelopt.torch.utils.distributed import set_data_parallel_group, set_tensor_parallel_group +from tqdm import tqdm + +# [ModelOpt]: changing the default model provider to the ModelOpt version +from megatron.core import mpu +from megatron.inference.arguments import add_modelopt_args +from megatron.inference.checkpointing import load_modelopt_checkpoint +from megatron.inference.gpt.model_provider import model_provider +from megatron.inference.text_generation import generate_and_post_process +from megatron.training import get_args, get_model, initialize_megatron +from megatron.training.checkpointing import save_checkpoint +from megatron.training.utils import print_rank_0, unwrap_model + +QUANT_CFG_CHOICES = { + "int8": mtq.INT8_DEFAULT_CFG, + "int8_sq": mtq.INT8_SMOOTHQUANT_CFG, + "fp8": mtq.FP8_DEFAULT_CFG, + "int4_awq": mtq.INT4_AWQ_CFG, + "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG, + "int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, +} + + +def add_trtllm_ckpt_export_args(parser): + """Add additional arguments for TensorRT-LLM.""" + group = parser.add_argument_group(title="trtllm") + + group.add_argument( + "--export-dir", type=str, help="The output TensorRT-LLM checkpoint.", + ) + group.add_argument( + "--decoder", type=str, choices=["gptnext", 'llama'], help="The decoder type of the model.", + ) + group.add_argument( + "--inference-tensor-parallel", + type=int, + help="Tensor parallel for the inference time, can be different from the training config.", + default=1, + ) + + +def add_text_generate_ptq_args(parser): + """Add additional arguments for ModelOpt text generation PTQ.""" + group = parser.add_argument_group(title='ModelOpt text generation ptq') + group.add_argument( + "--calib-dataset", + type=str, + default="cnn_dailymail", + help="Calibration datasets from HuggingFace datasets.", + ) + group.add_argument( + "--calib-batch-size", type=int, default=4, help="Batch size to use for ptq calibration." + ) + group.add_argument( + "--calib-size", type=int, default=512, help="Samples to use for ptq calibration." + ) + parser.add_argument( + "--prompts", + type=str, + default=( + "Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a" + ), + help="Input texts. Please use | to separate different batches.", + ) + add_modelopt_args(parser) + add_trtllm_ckpt_export_args(parser) + return parser + + +def get_calib_dataloader( + data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512 +): + if data == "pileval": + dataset = load_dataset( + "json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train" + ) + text_column = "text" + elif data == "wikitext": + dataset = load_dataset("wikitext", "wikitext-103-v1", split="train") + text_column = "text" + elif data == "cnn_dailymail": + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + text_column = "article" + + calib_size = max(min(len(dataset), calib_size), batch_size) + for i in range(calib_size // batch_size): + batch = dataset[i * batch_size : (i + 1) * batch_size][text_column] + for j in range(len(batch)): + batch[j] = batch[j][:max_sequence_length] + yield batch + + + +if __name__ == "__main__": + initialize_megatron( + extra_args_provider=add_text_generate_ptq_args, + args_defaults={ + 'tokenizer_type': 'GPT2BPETokenizer', + 'no_load_rng': True, + 'no_load_optim': True, + }, + ) + + args = get_args() + if args.num_layers_per_virtual_pipeline_stage is not None: + print_rank_0("Interleaved pipeline schedule is not yet supported for text generation.") + exit() + + print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text generation.") + args.exit_on_missing_checkpoint = True + + # Set up model and load checkpoint + # [ModelOpt]: make sure that output logits are allgathered. + text_generation_model_provider = functools.partial(model_provider, parallel_output=False) + model = get_model(text_generation_model_provider, wrap_with_ddp=False) + + if args.load is not None: + load_modelopt_checkpoint(model, strict=not args.untie_embeddings_and_output_weights) + print_rank_0("Done loading checkpoint") + + # Removing virtual pipeline parallel and other wrapper + assert len(model) == 1, "Above condition should have caught this" + unwrapped_model = unwrap_model(model) + + all_prompts = args.prompts.split("|") + + def custom_prompt_forward_loop_func(model): + for prompt in tqdm(all_prompts): + if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: + ( + prompts_plus_generations, + prompts_plus_generations_segments, + logprobs, + _, + ) = generate_and_post_process( + model, + prompts=[prompt], + tokens_to_generate=128, + return_output_log_probs=True, + temperature=1.0, + ) + print_rank_0(prompts_plus_generations) + else: + generate_and_post_process(model) + + def hf_dataset_forword_loop_func(model): + dataloader = get_calib_dataloader(args.calib_dataset, args.calib_batch_size, args.calib_size) + for prompts in tqdm(dataloader, total=args.calib_size//args.calib_batch_size): + if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: + ( + prompts_plus_generations, + prompts_plus_generations_segments, + logprobs, + _, + ) = generate_and_post_process( + model, + prompts=prompts, + tokens_to_generate=0, + return_output_log_probs=True, + temperature=1.0, + ) + else: + generate_and_post_process(model) + + ptq_forward_loop_func = custom_prompt_forward_loop_func + if args.calib_dataset is not None: + ptq_forward_loop_func = hf_dataset_forword_loop_func + + # Setting data parallel and tensor parallel group + set_data_parallel_group(mpu.get_data_parallel_group()) + set_tensor_parallel_group(mpu.get_tensor_model_parallel_group()) + + if args.export_quant_cfg in QUANT_CFG_CHOICES: + mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg] + if "*output_layer*" not in mtq_config["quant_cfg"]: + mtq_config["quant_cfg"]["*output_layer*"] = {"enable": False} + if "awq" in args.export_quant_cfg: + weight_quantizer = mtq_config["quant_cfg"]["*weight_quantizer"] # type: ignore + if isinstance(weight_quantizer, list): + weight_quantizer = weight_quantizer[0] + weight_quantizer["block_sizes"][-1] = 128 + print_rank_0("Quantizing the model...") + mtq.quantize(unwrapped_model[0], mtq_config, ptq_forward_loop_func) + + custom_prompt_forward_loop_func(model[0]) + + if args.save is not None and args.export_quant_cfg in QUANT_CFG_CHOICES: + save_checkpoint(1, unwrapped_model, None, None, 0) + + print_rank_0(f"Fake Quantized Model:\n {unwrapped_model[0]}") + + if args.export_dir: + assert args.decoder in ["gptnext", "llama"], f"Decoder type {args.decoder} not supported." + Path(args.export_dir).mkdir(parents=True, exist_ok=True) + print_rank_0("Exporting TensorRT-LLM checkpoints.") + + from modelopt.torch.export import export_tensorrt_llm_checkpoint + + # In TRT LLM, squared relu activation does not support bf16. So we use fp16 by default. + export_tensorrt_llm_checkpoint( + unwrapped_model[0], + args.decoder, + torch.bfloat16 if args.bf16 else torch.float16, + export_dir=args.export_dir, + inference_tensor_parallel=args.inference_tensor_parallel, + inference_pipeline_parallel=1, + use_nfs_workspace=True, + ) + + print_rank_0(f"TensorRT-LLM checkpoints saved to {args.export_dir}") diff --git a/examples/inference/quantization/trtllm_text_generation.py b/examples/inference/quantization/trtllm_text_generation.py new file mode 100644 index 0000000..17a47bf --- /dev/null +++ b/examples/inference/quantization/trtllm_text_generation.py @@ -0,0 +1,116 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""An example script to run the tensorrt_llm engine.""" + +import argparse +from pathlib import Path + +import numpy as np +import torch +from modelopt.deploy.llm import LLM, build_tensorrt_llm +from transformers import AutoTokenizer, T5Tokenizer + + +class CustomSentencePieceTokenizer(T5Tokenizer): + """This is a custom GPTSentencePiece Tokenizer modified from the T5Tokenizer. + + Note: + The modification is kept minimal to make `encode` and `batch_decode` working + properly (used in TensorRT-LLM engine). Other functions have not been tested. + """ + + def __init__(self, model): + super().__init__(model, extra_ids=0, bos_token="", pad_token="") + + def encode(self, text, add_special_tokens: bool = True, **kwargs): + return torch.Tensor(self.sp_model.encode_as_ids(text)) + + def batch_encode_plus( + self, batch_text_or_text_pairs, add_special_tokens: bool = True, **kwargs + ): + return {'input_ids': self.sp_model.encode_as_ids(batch_text_or_text_pairs)} + + def batch_decode(self, sequences, skip_special_tokens: bool = False, **kwargs): + if isinstance(sequences, np.ndarray) or torch.is_tensor(sequences): + sequences = sequences.tolist() + return self.sp_model.decode(sequences) + + def decode(self, token_ids, skip_special_tokens: bool = False, **kwargs): + return self.sp_model.decode([token_ids])[0] + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--tokenizer", type=str, default="") + parser.add_argument("--max-input-len", type=int, default=4096) + parser.add_argument("--max-output-len", type=int, default=512) + parser.add_argument("--max-batch-size", type=int, default=8) + parser.add_argument("--tensorrt-llm-checkpoint-dir", type=str, default=None) + parser.add_argument("--engine-dir", type=str, default="/tmp/trtllm_engine") + parser.add_argument( + "--input-texts", + type=str, + default=( + "Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a" + ), + help="Input texts. Please use | to separate different batches.", + ) + parser.add_argument("--max-beam-width", type=int, default=1) + parser.add_argument("--profiler-output", type=str, default="") + return parser.parse_args() + + +def run(args): + tokenizer_path = Path(args.tokenizer) + + if tokenizer_path.is_dir(): + # For llama models, use local HF tokenizer which is a folder. + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True) + elif tokenizer_path.is_file(): + # For nextllm and nemotron models, use local Megatron GPTSentencePiece tokenizer which is a model file. + tokenizer = CustomSentencePieceTokenizer(args.tokenizer) + else: + raise ValueError( + "arg.tokenizer must be a dir to a hf tokenizer checkpoint for llama or a SentencePiece .model file for gptnext" + ) + print(tokenizer, tokenizer.vocab_size) + + if not hasattr(args, "profiler_output"): + args.profiler_output = "" + + input_texts = args.input_texts.split("|") + assert input_texts, "input_text not specified" + print(input_texts) + + if args.tensorrt_llm_checkpoint_dir is not None: + print("Building TensorRT-LLM engines.") + build_tensorrt_llm( + args.tensorrt_llm_checkpoint_dir + "/config.json", + args.engine_dir, + max_input_len=args.max_input_len, + max_batch_size=args.max_batch_size, + max_beam_width=args.max_beam_width, + num_build_workers=1, + ) + print(f"TensorRT-LLM engines saved to {args.engine_dir}") + + free_memory_before = torch.cuda.mem_get_info() + + # This is a ModelOpt wrapper on top of tensorrt_llm.hlapi.llm.LLM + llm_engine = LLM(args.engine_dir, tokenizer) + + torch.cuda.cudart().cudaProfilerStart() + # outputs = llm_engine.generate_text(input_texts, args.max_output_len, args.max_beam_width) + outputs = llm_engine.generate(input_texts) + torch.cuda.cudart().cudaProfilerStop() + + free_memory_after = torch.cuda.mem_get_info() + print( + f"Used GPU memory: {(free_memory_before[0] - free_memory_after[0]) / 1024 / 1024 / 1024} GB" + ) + print(outputs) + + +if __name__ == "__main__": + args = parse_arguments() + run(args) diff --git a/examples/inference/run_text_generation_server_345M.sh b/examples/inference/run_text_generation_server_345M.sh new file mode 100644 index 0000000..e8e61ad --- /dev/null +++ b/examples/inference/run_text_generation_server_345M.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# This example will start serving the 345M model. +DISTRIBUTED_ARGS="--nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +CHECKPOINT= +VOCAB_FILE= +MERGE_FILE= + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +pip install flask-restful + +torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 1024 \ + --load ${CHECKPOINT} \ + --num-attention-heads 16 \ + --max-position-embeddings 1024 \ + --tokenizer-type GPT2BPETokenizer \ + --fp16 \ + --micro-batch-size 1 \ + --seq-length 1024 \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --seed 42 diff --git a/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh b/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh new file mode 100644 index 0000000..368cec3 --- /dev/null +++ b/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# This example will start serving the 345M model that is partitioned 8 way tensor parallel +DISTRIBUTED_ARGS="--nproc_per_node 8 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +CHECKPOINT= +VOCAB_FILE= +MERGE_FILE= + +pip install flask-restful + +python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ + --tensor-model-parallel-size 8 \ + --pipeline-model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 1024 \ + --load ${CHECKPOINT} \ + --num-attention-heads 16 \ + --max-position-embeddings 1024 \ + --tokenizer-type GPT2BPETokenizer \ + --fp16 \ + --micro-batch-size 1 \ + --seq-length 1024 \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --seed 42 diff --git a/examples/mamba/.gitignore b/examples/mamba/.gitignore new file mode 100644 index 0000000..940f479 --- /dev/null +++ b/examples/mamba/.gitignore @@ -0,0 +1,4 @@ +checkpoints/ +data-cache/ +tensorboard/ +triton-cache/ diff --git a/examples/mamba/Dockerfile b/examples/mamba/Dockerfile new file mode 100644 index 0000000..2e19409 --- /dev/null +++ b/examples/mamba/Dockerfile @@ -0,0 +1,32 @@ +FROM nvcr.io/nvidia/pytorch:24.01-py3 + +RUN pip uninstall -y triton && \ + pip install triton==2.1.0 sentencepiece==0.1.99 flask-restful + +# The causal-conv1d and mamba-ssm packages below are built from scratch here +# (which takes significant time) because there are no wheels available on PyPI +# for these relatively newer versions of the packages that are compatible with +# the older NGC-variant PyTorch version (e.g. version 2.2.0.dev231106) that we +# are using (in the NGC base container). Generally, if the package is not +# compatible with the PyTorch version, then it will generate a Python import +# error. The package authors tend to only release wheels for new versions of +# these pacakges which are compatible with the versions of regular PyTorch and +# NGC-variant PyTorch that are newer at the time of release. So, to use newer +# versions of these packages with relatively older versions of the NGC PyTorch +# container, we tend to have to build the packages from scratch. + +RUN cd /tmp && \ + git clone https://github.com/Dao-AILab/causal-conv1d.git && \ + cd causal-conv1d && \ + git checkout v1.2.2.post1 && \ + CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install . && \ + cd .. && \ + rm -rf causal-conv1d + +RUN cd /tmp && \ + git clone https://github.com/state-spaces/mamba.git && \ + cd mamba && \ + git checkout v2.0.3 && \ + MAMBA_FORCE_BUILD=TRUE pip install . && \ + cd .. && \ + rm -rf mamba diff --git a/examples/mamba/README.md b/examples/mamba/README.md new file mode 100644 index 0000000..5c3934d --- /dev/null +++ b/examples/mamba/README.md @@ -0,0 +1,91 @@ +# Mamba-based Language Models + +## Introduction + +This document is an entrypoint into the code used for +[An Empirical Study of Mamba-based Language Models](https://arxiv.org/abs/2406.07887). + +We are releasing the parameters for some of the models described in that +technical report via +[HuggingFace](https://huggingface.co/collections/nvidia/ssms-666a362c5c3bb7e4a6bcfb9c). + +## Installation + +Create and run a Docker container using the [Dockerfile](./Dockerfile). + +``` +docker build -t your_image_name:your_tag . +docker run --gpus all -it --rm \ + -v /path/to/megatron:/workspace/megatron \ + -v /path/to/dataset:/workspace/dataset \ + -v /path/to/checkpoints:/workspace/checkpoints \ + -w /workspace/megatron/examples/mamba \ + your_image_name:your_tag +``` + +## Train + +[`train.sh`](./train.sh) is an example pretraining script, showing how to run on +a single node. Select between 800M-scale and 8B-scale models by setting the +`MODEL_SCALE` variable. The 8B-scale hybrid model architecture is the same as +the one described in the technical report. + +## Text Generation + +Use [`run_text_gen_server_8b.sh`](./run_text_gen_server_8b.sh) to start a text +generation server using an 8B hybrid checkpoint. This is configured to run the +8B hybrid model described in the technical report, with tensor model parallel +set to 1. + +The arguments in the script will need to be changed if using a checkpoint with a +different model parallel configuration or other differences, such as model +architecture. For example, to run the 8B pure Mamba-2 model, change +`--hybrid-attention-ratio` and `--hybrid-mlp-ratio` to 0.0, or remove them. + +Use [`run_text_gen_server_8b_gpt3.sh`](./run_text_gen_server_8b_gpt3.sh) to start +a text generation server using the 8B reference Transformer checkpoint. + +## Checkpoint Formats + +For inference, the model must be configured to match the checkpoint file used, +including the hybrid layer configuration and model parallel configuration. + +If you need to convert a hybrid checkpoint file to a different tensor parallel +or pipeline parallel size, use +[the hybrid conversion script](../../tools/checkpoint/hybrid_conversion.py). +There is an example run command at the end of that file. + +Before running that script, you will need to set `PYTHONPATH` to include the +root directory of your Megatron-LM repository clone. + +``` +export PYTHONPATH=:PYTHONPATH +``` + +## Hybrid Options + +`--hybrid-attention-ratio ATT` specifies a target ratio of attention layers +to total layers. For example, 4 attention layers out of 48 total layers is +specified by `--hybrid-attention-ratio 0.08`. + +`--hybrid-mlp-ratio MLP` specifies a target ratio of MLP layers to total +layers. For example, 24 MLP layers out of 48 total layers is specified by +`--hybrid-mlp-ratio 0.5`. + +* (`ATT` + `MLP`) must be less than or equal to 1.0. +* (1.0 - `ATT` - `MLP`) is the hybrid mamba ratio, the ratio of mamba layers to +total layers. +* `ATT` = `MLP` = 0 is a pure Mamba model. +* `ATT` = `MLP` = 0.5 is a transfomer model. + +If either `ATT` or `MLP` is greater than 0.0 or if `--hybrid-override-pattern` +is specified, the logfile will include information about the hybrid layer +pattern used. `--hybrid-override-pattern` can be used to specify a different +pattern than the default, algorithmically-generated one. + +## Mamba vs Mamba-2 + +This codebase currently only supports Mamba-2, and not the original version of +Mamba. However, the +[fixed snapshot of the code used for the technical report](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba) +can be configured to run the original version of Mamba. diff --git a/examples/mamba/run_text_gen_server_8b.sh b/examples/mamba/run_text_gen_server_8b.sh new file mode 100644 index 0000000..8d3137f --- /dev/null +++ b/examples/mamba/run_text_gen_server_8b.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Use: ./run_text_gen_server_8b.sh +# To launch the client: python ../../tools/text_generation_cli.py + +CHECKPOINT_PATH=$1 +TOKENIZER_PATH=$2 + +DISTRIBUTED_ARGS="--nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_IB_TIMEOUT=19 +export NCCL_IB_QPS_PER_CONNECTION=4 + +export TRITON_CACHE_DIR="./triton-cache/" +export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager" + +torchrun $DISTRIBUTED_ARGS ../../tools/run_mamba_text_generation_server.py \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --untie-embeddings-and-output-weights \ + --num-layers 56 \ + --hidden-size 4096 \ + --load ${CHECKPOINT_PATH} \ + --num-attention-heads 32 \ + --group-query-attention \ + --num-query-groups 8 \ + --hybrid-attention-ratio 0.08 \ + --hybrid-mlp-ratio 0.5 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --disable-bias-linear \ + --normalization RMSNorm \ + --seq-length 4096 \ + --max-position-embeddings 4096 \ + --position-embedding-type none \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model ${TOKENIZER_PATH} \ + --distributed-backend nccl \ + --distributed-timeout-minutes 1440 \ + --bf16 \ + --micro-batch-size 1 \ + --use-mcore-models \ + --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \ + --seed 42 diff --git a/examples/mamba/run_text_gen_server_8b_gpt3.sh b/examples/mamba/run_text_gen_server_8b_gpt3.sh new file mode 100644 index 0000000..5413b24 --- /dev/null +++ b/examples/mamba/run_text_gen_server_8b_gpt3.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Use: ./run_text_gen_server_8b_gpt3.sh +# To launch the client: python ../../tools/text_generation_cli.py + +CHECKPOINT_PATH=$1 +TOKENIZER_PATH=$2 + +DISTRIBUTED_ARGS="--nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_IB_TIMEOUT=19 +export NCCL_IB_QPS_PER_CONNECTION=4 + +torchrun $DISTRIBUTED_ARGS ../../tools/run_text_generation_server.py \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --use-flash-attn \ + --apply-layernorm-1p \ + --untie-embeddings-and-output-weights \ + --num-layers 32 \ + --hidden-size 4096 \ + --load ${CHECKPOINT_PATH} \ + --num-attention-heads 32 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --disable-bias-linear \ + --seq-length 4096 \ + --max-position-embeddings 4096 \ + --position-embedding-type rope \ + --rotary-percent 0.5 \ + --squared-relu \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model ${TOKENIZER_PATH} \ + --distributed-backend nccl \ + --distributed-timeout-minutes 1440 \ + --bf16 \ + --micro-batch-size 1 \ + --use-mcore-models \ + --transformer-impl local \ + --seed 42 diff --git a/examples/mamba/train.sh b/examples/mamba/train.sh new file mode 100644 index 0000000..3952a99 --- /dev/null +++ b/examples/mamba/train.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +# Use: ./train.sh + +MODEL_SCALE="800M" # or "8B" + +case "${MODEL_SCALE}" in + "800M") + TENSOR_MODEL_PARALLEL_SIZE=1 + NUM_LAYERS=48 + HIDDEN_SIZE=1024 + NUM_ATTENTION_HEADS=16 + GLOBAL_BATCH_SIZE=32 + ;; + "8B") + TENSOR_MODEL_PARALLEL_SIZE=4 + NUM_LAYERS=56 + HIDDEN_SIZE=4096 + NUM_ATTENTION_HEADS=32 + GLOBAL_BATCH_SIZE=8 + ;; + *) + echo "Invalid version specified" + exit 1 + ;; +esac + +DATA_PATH=$1 +TOKENIZER_PATH=$2 + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_IB_TIMEOUT=19 +export NCCL_IB_QPS_PER_CONNECTION=4 + +CHECKPOINT_DIR="./checkpoints" +DATACACHE_DIR="./data-cache" +TENSORBOARD_DIR="./tensorboard" + +mkdir -p ${CHECKPOINT_DIR} +mkdir -p ${DATACACHE_DIR} +mkdir -p ${TENSORBOARD_DIR} + +export TRITON_CACHE_DIR="./triton-cache/" +export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager" + +SEQ_LEN=4096 +TRAIN_SAMPLES=73242188 # 300B tokens / 4096 +LR_WARMUP_SAMPLES=50000 +LR_DECAY_SAMPLES=73192188 # TRAIN_SAMPLES - LR_WARMUP_SAMPLES + +options=" \ + --tensor-model-parallel-size ${TENSOR_MODEL_PARALLEL_SIZE} \ + --sequence-parallel \ + --pipeline-model-parallel-size 1 \ + --use-distributed-optimizer \ + --overlap-param-gather \ + --overlap-grad-reduce \ + --untie-embeddings-and-output-weights \ + --init-method-std 0.02 \ + --position-embedding-type none \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --num-attention-heads ${NUM_ATTENTION_HEADS} \ + --group-query-attention \ + --num-query-groups 8 \ + --hybrid-attention-ratio 0.08 \ + --hybrid-mlp-ratio 0.5 \ + --seq-length ${SEQ_LEN} \ + --max-position-embeddings ${SEQ_LEN} \ + --train-samples ${TRAIN_SAMPLES} \ + --lr-warmup-samples ${LR_WARMUP_SAMPLES} \ + --lr-decay-samples ${LR_DECAY_SAMPLES} \ + --save ${CHECKPOINT_DIR} \ + --load ${CHECKPOINT_DIR} \ + --data-path ${DATA_PATH} \ + --data-cache-path ${DATACACHE_DIR} \ + --split 99,1,0 \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model ${TOKENIZER_PATH} \ + --distributed-backend nccl \ + --micro-batch-size 4 \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --lr 2.5e-4 \ + --min-lr 2.5e-5 \ + --lr-decay-style cosine \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --disable-bias-linear \ + --normalization RMSNorm \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 10 \ + --save-interval 2000 \ + --eval-interval 2000 \ + --eval-iters 32 \ + --bf16 \ + --use-mcore-models \ + --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \ + --no-create-attention-mask-in-dataloader \ + --tensorboard-dir ${TENSORBOARD_DIR}" + +torchrun --nproc_per_node 8 ../../pretrain_mamba.py ${options} diff --git a/examples/mixtral/README.md b/examples/mixtral/README.md new file mode 100644 index 0000000..1025ded --- /dev/null +++ b/examples/mixtral/README.md @@ -0,0 +1,120 @@ +# Mixtral 8x7B Model Inference and Finetuning + +## Download Mixtral 8x7B Checkpoints +Download Mixtral 8x7B HF format checkpoint from [HF-hub](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/) + +Or you can simply run this following script to download Mixtral 8x7B into a specific folder. +```python +from huggingface_hub import snapshot_download +SAVED_DIR = "" # Specify the saved directory +# Download HF checkpoints +snapshot_download(repo_id="mistralai/Mixtral-8x7B-v0.1", ignore_patterns=["*.pt"], local_dir=SAVED_DIR, local_dir_use_symlinks=False) +``` + +## Convert Mixtral 8x7B checkpoints from HF to MCore +The HF checkpoints can be converted to Megatron format by using the provided checkpoint converter for HF format. +The target model parallel size(e.g. TP,PP,EP) should be specified. + +``` +TOKENIZER_MODEL=/workspace/checkpoints/mixtral-hf/tokenizer.model +MEGATRON_PATH="/workspace/megatron-lm" +export PYTHONPATH=$MEGATRON_PATH:$PYTHONPATH +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +TARGET_TP_SIZE=1 +TARGET_PP_SIZE=4 +TARGET_EP_SIZE=8 + +HF_FORMAT_DIR=/workspace/checkpoints/mixtral-hf +MEGATRON_FORMAT_DIR=/workspace/checkpoints/mixtral-mcore-TP${TARGET_TP_SIZE}PP${TARGET_PP_SIZE}EP${TARGET_EP_SIZE} + +python tools/checkpoint/convert.py \ +--model-type GPT \ +--loader loader_mixtral_hf \ +--saver mcore \ +--target-tensor-parallel-size ${TARGET_TP_SIZE} \ +--target-pipeline-parallel-size ${TARGET_PP_SIZE} \ +--target-expert-parallel-size ${TARGET_EP_SIZE} \ +--load-dir ${HF_FORMAT_DIR} \ +--save-dir ${MEGATRON_FORMAT_DIR} \ +--tokenizer-model ${TOKENIZER_MODEL} +``` + +## Text generation with Mixtral 8x7B +Inference with Mixtral 8x7B requires at least 2 GPUS, such that a distributed checkpoint with EP>=2 or PP>=2 converted with above script is needed. + +The Megatron-LM have included a simple REST server to use for text generation in `tools/run_text_generation_server.py`, launch it with the following script: +``` +#!/bin/bash +# This example will start serving the Mixtral 8x7B model. +DISTRIBUTED_ARGS="--nproc_per_node 2 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +CHECKPOINT= +TOKENIZER_MODEL= + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +pip install flask-restful + +torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 2 \ + --expert-model-parallel-size 1 \ + --load ${CHECKPOINT} \ + --tokenizer-type Llama2Tokenizer \ + --tokenizer-model $TOKENIZER_MODEL \ + --use-mcore-models \ + --max-position-embeddings 32768 \ + --num-layers 32 \ + --hidden-size 4096 \ + --ffn-hidden-size 14336 \ + --num-attention-heads 32 \ + --normalization RMSNorm \ + --disable-bias-linear \ + --position-embedding-type rope \ + --no-position-embedding \ + --swiglu \ + --untie-embeddings-and-output-weights \ + --group-query-attention \ + --num-query-groups 8 \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length 1024 \ + --seed 42 \ + --num-experts 8 \ + --moe-router-topk 2 \ + --moe-token-dispatcher-type alltoall \ + --mock-data \ + --rotary-base 1000000 +``` + +Once the server is running you can use `tools/text_generation_cli.py` to query it, it takes one argument which is the host the server is running on. + +``` +python tools/text_generation_cli.py localhost:5000 +``` + + +## Finetuning from pretrained Mixtral 8x7B +To finetuning pretrained Mixtral 8x7B, use the following scripts: + + +```bash +PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.04-py3 +CHECKPOINT_PATH="" # Speicfy path to checkpoint dir +TOKENIZER_MODEL="" # Specify path to tokenizer.model +DATA_PATH="" # Specify path to data + +docker run \ + --gpus=all \ + --ipc=host \ + --workdir /workspace/megatron-lm \ + -v /path/to/data:/path/to/data \ + -v /path/to/megatron-lm:/workspace/megatron-lm \ + $PYTORCH_IMAGE \ + bash examples/mixtral/train_mixtral_8x7b_distributed.sh $CHECKPOINT_PATH $TOKENIZER_MODEL $DATA_PATH +``` diff --git a/examples/mixtral/train_mixtral_8x7b_distributed.sh b/examples/mixtral/train_mixtral_8x7b_distributed.sh new file mode 100644 index 0000000..ed44d60 --- /dev/null +++ b/examples/mixtral/train_mixtral_8x7b_distributed.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +# Runs Mixtral 8x7B model + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=${MASTER_ADDR:-"localhost"} +MASTER_PORT=${MASTER_PORT:-"6000"} +NNODES=${SLURM_NNODES:-"1"} +NODE_RANK=${RANK:-"0"} +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +CHECKPOINT_PATH=$1 +TOKENIZER_MODEL=$2 +DATA_PATH=$3 + +DISTRIBUTED_ARGS=( + --nproc_per_node $GPUS_PER_NODE + --nnodes $NNODES + --node_rank $NODE_RANK + --master_addr $MASTER_ADDR + --master_port $MASTER_PORT +) + +MODEL_ARGS=( + --use-mcore-models + --disable-bias-linear + --seq-length 4096 + --max-position-embeddings 32768 + --num-layers 32 + --hidden-size 4096 + --ffn-hidden-size 14336 + --num-attention-heads 32 + --init-method-std 0.01 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --normalization RMSNorm + --position-embedding-type rope + --swiglu + --untie-embeddings-and-output-weights + --group-query-attention + --num-query-groups 8 + --no-masked-softmax-fusion + --no-position-embedding + --rotary-base 1000000 +) + +MOE_ARGS=( + --num-experts 8 + --moe-router-topk 2 + --moe-router-load-balancing-type aux_loss + --moe-aux-loss-coeff 1e-2 + --moe-grouped-gemm + --moe-token-dispatcher-type alltoall + --overlap-param-gather + --overlap-grad-reduce +) + +DATA_ARGS=( + --tokenizer-type Llama2Tokenizer + --tokenizer-model ${TOKENIZER_MODEL} + --data-path $DATA_PATH + --split 99990,8,2 +) + +TRAINING_ARGS=( + --micro-batch-size 1 + --global-batch-size 256 + --lr 1e-4 + --train-iters 500000 + --lr-decay-iters 320000 + --lr-decay-style cosine + --min-lr 1.0e-5 + --weight-decay 0.1 + --lr-warmup-iters 500 + --clip-grad 1.0 + --bf16 +) + +MODEL_PARALLEL_ARGS=( + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 4 + --expert-model-parallel-size 8 + --use-distributed-optimizer + --sequence-parallel +) + +LOGGING_ARGS=( + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \ + --no-load-optim \ + --no-load-rng +) + +if [ -n "${WANDB_API_KEY}" ]; then + LOGGING_ARGS+=( + --wandb-project ${WANDB_PROJECT:-"Mixtral"} + --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"} + ) +fi + + +torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \ + ${MODEL_ARGS[@]} \ + ${MOE_ARGS[@]} \ + ${DATA_ARGS[@]} \ + ${TRAINING_ARGS[@]} \ + ${MODEL_PARALLEL_ARGS[@]} \ + ${LOGGING_ARGS[@]} diff --git a/examples/multimodal/Dockerfile b/examples/multimodal/Dockerfile new file mode 100644 index 0000000..18f0e65 --- /dev/null +++ b/examples/multimodal/Dockerfile @@ -0,0 +1,27 @@ +FROM nvcr.io/nvidia/pytorch:24.02-py3 + +RUN apt update && \ + apt -y upgrade && \ + apt install -y --no-install-recommends \ + software-properties-common \ + build-essential \ + python3-pip \ + python3-dev \ + bash \ + git \ + vim \ + python-is-python3 \ + default-jre + +RUN pip install --upgrade pip +RUN pip install einops einops-exts sentencepiece braceexpand webdataset +RUN pip install transformers datasets +RUN pip install pytest-cov pytest_mock nltk wrapt +RUN pip install zarr "tensorstore==0.1.45" +RUN pip install git+https://github.com/fanshiqing/grouped_gemm@main +RUN pip install black==19.10b0 isort click==8.0.2 +RUN pip install pycocoevalcap megatron-energon +RUN pip install git+https://github.com/openai/CLIP.git +# Use --no-deps for the following to avoid outdated and unnecessary dependencies. +RUN pip install mmf --no-deps +RUN pip install open-flamingo[eval] --no-deps diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md new file mode 100644 index 0000000..4c7617d --- /dev/null +++ b/examples/multimodal/README.md @@ -0,0 +1,148 @@ +# Multimodal Example + +The following walks through all the steps required to pretrain and instruction tune a llava architecture vision-language model (VLM). It is important to precisely follow all steps to obtain the benchmark scores at the end. + +This example has been tested on an A100 based DGX cluster. Pretraining and instruction tuning took approximately 1 day and 11 hours respectively on 64 GPUs using four way tensor parallelism (tp=4). Training speed will scale approximately linearly with number of GPUs available. + +Multimodal support in megatron is still under active development. This example is not intended to produce state-of-the-art model quality (that would require more data and model refinements), it is merely intended to demonstrate the multimodal functionality in megatron. If you hit any problems, please open a github issue. + +## Setup + +### Docker container + +You can build a docker container using `examples/multimodal/Dockerfile` to run this example. + +### Language model + +Follow the instructions in `megatron-lm/docs/llama_mistral.md` to download weights for Mistral-7B-Instruct-v0.3 and convert to mcore format with tensor parallel size 4 + +### Vision model + +This example uses the OpenAI CLIP `ViT-L/14@336px` Vision model. To download the weights from OpenAI and convert them to a format that can be loaded in megatron, please run the following: + +``` +python examples/multimodal/clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 4 +``` + +### Combined model checkpoint + +Update the paths to point to the mcore converted CLIP and Mistral models and run the following script to combine the Mistral and CLIP models into a single multimodal checkpoint folder: + +``` +examples/multimodal/combine_mistral_clip.sh +``` + +## Training + +### Pretraining + +1. Download the LLavA-Pretrain dataset from Hugging Face and unzip the images folder (NOTE: 79GB of disk space required): + + ``` + git clone https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain + cd LLaVA-Pretrain + unzip images.zip + ``` + +3. Run the following script to convert the data to webdataset format: + + ``` + cd + python examples/multimodal/convert_llava_pretrain_to_wds.py + ``` + +4. Run the following command to convert to megatron-energon format: + + ``` + cd /wds + energon ./ + ``` + + select the following values for the presented options: + + ``` + > Please enter a desired train/val/test split like "0.5, 0.2, 0.3" or "8,1,1": 9,1,0 + > Do you want to create a dataset.yaml interactively? [Y/n]: Y + > Please enter a number to choose a class: 10 (VQAWebdataset) + > Do you want to set a simple field_map[Y] (or write your own sample_loader [n])? [Y/n]: Y + > Please enter a webdataset field name for 'image' (): jpg + > Please enter a webdataset field name for 'context' (): json[0][value] + > Please enter a webdataset field name for 'answers' (typing.Optional[typing.List[str]], default: None): json[1][value] + > Please enter a webdataset field name for 'answer_weights' (typing.Optional[torch.Tensor], default: None): + ``` + +5. Update `pretrain_dataset.yaml` so that both `path` variables point to `LLaVA-Pretrain/wds` + +6. Run the following script to pretrain a llava model for image captioning: + + ``` + cd + examples/multimodal/pretrain_mistral_clip.sh + ``` + +All being well you should observe training and valiation loss curves similar to the following: + +Pretraining loss curves + +These curves were obtained with global batch size of 256. Changing this value will likely change the curves. For pretraining and instruction tuning llava models we have found that loss curves are an unreliable predictor of downstream task performance. Therefore it is necessary to run test generation and evaluation on a range of metrics to understand model quality. We intend to add training time zero-shot evaluation in a future update. + +### SFT + +1. Prepare an instruction tuning dataset such in [megatron-energon format](https://nvidia.github.io/Megatron-Energon/data_prep.html#). NOTE: we do not provide instructions for this. + +5. Update `sft_dataset.yaml` so that both `path` variables point to the train and val splits of your instruction tuning dataset. + +Run the following script to instruction tune the pre-trained llava model: + + ``` + examples/multimodal/sft_mistral_clip.sh + ``` + +## Evaluation + +### Generation + +Run the following script: + +``` +examples/multimodal/text_generation_mistral_clip.sh --input-image-path /path/to/input/images --output-path /some/output/directory \ + --model-path /path/to/model.pt --tokenizer-path /path/to/tokenizer.model --gt-path /path/to/groundtruth/file --task generation-task-name +``` + +### After pretraining + +#### COCO captioning + +1. Download the COCO 2014 test image set: + + ```wget http://images.cocodataset.org/zips/test2014.zip``` + +2. Download COCO test image annotations: + + ```https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json``` + +3. First, run text generation using `--task captioning`. + +4. Run the following command: + + ``` + python examples/multimodal/evaluate_coco.py --input-path /output/directory/from/generation --groundtruth-path /path/to/groundtruth/file + ``` + +For the mistral-7b-instruct plus clip llava model you should obtain a COCO CIDer score of approximately 94. + +### After SFT + +#### MMMU + +The official MMMU repository is not pip installable currently so please clone their code in `examples/multimodal` by running `git clone https://github.com/MMMU-Benchmark/MMMU.git`. + +The MMMU dataset is loaded from HuggingFace automatically as part of the code. + +Run text generation using `--task MMMU`. Then, run the following command: + +``` +python examples/multimodal/evaluate_mmmu.py --input-path /output/directory/from/generation +``` + +For the mistral-7b-instruct plus clip instruction tuned llava model you should obtain a MMMU score of approximately 38. diff --git a/examples/multimodal/assets/pretrain_curves.png b/examples/multimodal/assets/pretrain_curves.png new file mode 100644 index 0000000000000000000000000000000000000000..7981a73ba1c9eb9178218fb4e58ce279cce6e18b GIT binary patch literal 329882 zcmeFa2V7Iz^Dm0nKn$R$prMKiN>>TJ9HUsMR#ch+5tT>_CDe#QKoPK`(nM6O5JaSd z1`r#df)FXe01~8yPy>YI&E_1>@%Q_^x$2+Th&G!K3feIoRRfWWMS$FNwN?O3e5jgRM%V|Z5q0p$xP z&TDNG#7U9cRLoV_m;@4 zbGNK@q?hgeG-u1;gLq6cCvHrOAUi$gSd&FfvwjcxttGsAAx%5 zV7g$@!(erZVr#=!-;~?2FADTi7g#k;+`78`m{$9G@3ISx8!D_T(tIzax;wvZ(Ju86 zZ=9bq=5x{a2nO6=PlTT>Px_OkRmi39C(X`13@ z(YmL$uh=reEh;11=FdNJRj2{&*fUG{zNvLKBQ}0k3Ho{U#<~Jt-Ge=By~N!Y^ylln zXLki=DV{8wv!{Srl1g;EqL0r=wlzpyE#@+JIs5zP`N!d!z;C;;+qc7Uw~LRfs|Vqzr|)sG6D#2d!d}}B5CjC4tw#RM+P-Vm zAbdaV*q;5q`zT|7^y;|_Z|yQ&jUcp={-U`W)5k0)Gxaq`3ycn^X;(P-t&9s2MY z`LV`I`I%dM-Hlf6x3rPRdiuD^W7IX(HCGxhke8P?^f`P)f7fQSKOToOqm@T}eZBNG zGy(zw)B`rCd-}L(XzA(cX=tw3SifEk?ocBHdidgqY90i|-@oMh_iT10xcD6N@;&D1 zA&-17&e_w?*J$NRAG%_Yz!vu%0L2rM||0 zG3<|rf4um|gN7Q&+<&0O?}VQDDNx#YfuY8qTr*zq^3u9Jz)0z1o2`)}h#B%{b}sy< z{QD6;3tro|Ru{EiK)_UB`{v*F5NGwZiab5BH@@Lr(mgg!cxk3v9?TX?3^uM z(+tnulRstH`R%J+${Q$nH9^nSY2_y; z2cOmvoFgPIZ+cQ-*57_GB?~4-pq5)zUi_y!1;WitX4fv)4*{J)c!= zGG%nop7_sv?^#0Z{9qeG%XPOMwl0n-Z+P5Hy-Hh*AFM%{N1?nG#EdwH2qI@ri|Q-m zo1Z-vmhO>`aro!))(MIkH!TTa8nhjAoaNm6Y+5cf@N8_WGVU8i!3@QARd%iyXEnU9 zJfV13r#ti?#pzIsph3XELU!u(GyDn1tImoiUhKkQLyXSNmup%2%;WRN2$Y@d^e6N` zi}0a?@}(;(LdRRjyJIEBHfGJq4xAluV*ffktt2>r*qt`Nd~CMZ!sa~BfIRZI`^41m z0{yoOcmJYj7|DUNC)i(=Fk6gmfC zA#~wIU+I4w#Z3a}1?FwhZGTIp@9*+mEsyWs@zMLAHC)Zy2vqVVeffW6>H8}uDQPO( z{~H#mzN9$D*Q)2_KWk!JK`J?Ol}ywxVt!}bf7q`71Bm%l$dEf&&bY8u_SnR`N!g}C zKX*{g(+%U5rDvZDfqW@S-&p-qU3R|Abm5AD;^7*vj#wS?)Id>I&Rn_Bkq?aR)MZW!Rvy{F-Y518>z{pGzYe`1Hu+k8ojl3jqP z$?+C~HKNd|8?r(lAbO~emYrT;@ZzePT|=19k<@8gmGLs(h&6v>i+7bwa#r7!tPodS zR`KwP#2T@IlXKQffY*Q#fBQ4DcmAL}{SJeS_7Jjl@Y^eaxoylZ9+nC0yuj$osTF0f zmU_8F#~wT)hN9fWw*0XsckIHg?Is_BN<8iIJW%sxiH8UY)Vqx^l^=o@q{ba`PFPRpnKX)^k`t0>qEpNn^94m);iK-17%fWK} zEIdDqJ$b7xqSF>a;53&_H+$7t>O5BQK$xaGTEDyeSVnR~%N@o@owZn%Y;fN-eeS9javWDq-IMY&4R_NbTqQGqWym*kr{j*N zR`S#aIC%v+o^(Ubkq#MAe-^#uNgRvMV~JUZ_UyJbmTfjju=m*>&99Kt)Rs^+Y)^u?bJzhhD z@a;|lne$Oui*m+utF-rKf{_t_qBiy0^-Gs13K_@ti~K}^hR>SAo6O?t+XuAzRbLm= z>pbs86;_Ji^uG6WnrdpKKF*`B80-67r~0*^Hq?VqbxQhH%qUF=ebsjAVcFzEai@=p zrIj{}`C(t?Q+KBfU2d4&O6-Vb487G5%WDe@9-dG#nRv~(UK?^mcjP_RF4d|0wAL8s zeUx$iX4E)6nC0?Jn#E(^Xe?e+QAFzG?en8i0$B9BgJ;`>TmqNzKR-QIcXLCG`^IG^nl+mB&q(P0MkJn`;5vc4cxw3(d$}(*^)-x|48~FsdFH^d< zPN9CBcF8iS_O_;YPq$>heY(T%Hf$wKaX+CYF^`sA=d(M4W$L424K=#Y?N=F}eT~%XiZ^jJ8=Js#eKbCF4D&7z>@zzO{?Jv(x9D;?I*Qa3k zGk@dxE2lLYiEr)bi|O1hohWb~4VQ;CpJ{)* zCu~@Uw81{`>lKC!i@2508~=12Dpt)eLKw$K&h4c8}yj(uiy zr^h?mw#ef-^oU55LcUA+cxKd=wxW(W18OSwb!vn)X|j&-0_6N%LS@2Ow22~#A>zcj zRz={N(&X{4*o9Bm{JrxL9wF!ufzmpj-~X(c$~56mbj5wU^tCFVvAmXOrrUEbo!V8M zs3fK_@{qy^wxG_#!Q07S=|x>P+^Ln4PST|#cQhh7QyWW9w3Qkq1@+ulpl36l)>+vN zoSbcF2k)2q@`SMCC+|<1Qz(QUC>f`BS%FLY#kMTj=gnHiTI;|a!{zm#Qtf(O;FZ2R z#L)f&Ez1YW_`EA}#n%43DiXJCA~y@A%-BkB#jv zqMB5(c9r=dLzhw)!s@h*4mhsc`zVJR<2Z85t1r7M&RGYUk+S-pn4jp#ZtI&dz!96E z(Z>TBwju9-n;$t_RQJ$iS^-N0UAlEo5X}owAAGPk0mG-qS^fGEoLvjFZk+LaOGY8u zgqH16HNvWj=`m%|{vgMh@LvIG- z0*V!jyR@^8g&r^7vdz$%t3-d|#-AP+YkqdZu6xYLTaMk^E<4`rSr%F5U*gnQI7{p& z4(wsY_giT1tBypV+IFiM;qyB$ns0({+`3bK4Q3toLmWSss34T>cRqQ^+E}l`u1gxszonEHGdX~4=TWe_ z#j6~}c!~DJ%f{56$ueJ@Dv^GA3%x562}4lJ%bg=7%G5JAgzb*bN?u&1KCqg8WYtxlhuCwYVXZ5tMt|ah zhcm#DId){wgjdBElH(0V*$$Xe-odnSvAy|O&I&^rXyF29xkRR%v-|tJIw2j=P6e-D&1(wR7znDmWz1~3=R$@ zlocPS>$H+!C$R?>COCx!=422>5~NfriVQ1?G%IQ=JtLT@q_F6$4TSErs7zHtiBq1B z0PF7+ywG6{$~KxfOIenG3g7HRY3-iZk=;?-*53LuBi`ww47$Cxl~+fav}TZ|v-r8{ znVg|nS)}RSm{8$J>#(6qvDKZfN=@f){l-1eeZVdG)7Uq!YVF2|_>fM;YNySfq0|65 z?)}us`-5Dj3F&dhlJNrKbY<$=DqH2~vbJ`olO~)mrzIzs`-@6Z$$Aj@6`UbTo=%U` ze;_Q~E+f&N4F8L@J4mMtCowm+J@05;kuq{yDH5@R)m4oS0S?gtq0wdR88eYOaXt0e zK#Qhlw>zlM{UlBliB=)ZJ{41jtJUo|C*J5gJ=z$h5yp?{0RLGrrDYxz=$-j^GfvfqnSEkY+mNeDe=D`yj=#b-cuH>iPrYO*5c(-EI7kFFu zh6cm3KsydyOxQ>Y4=kyAHQL*r%xpZnVoh(HH?~;RtwhZyDX{9AUthLyDS=tjmC4iM zMw>9>NC)`fba}SNaoz~9H7c0AHTY{1n2L3MpR&UIh;@ndC06?1ZXZcu>y)#m>8!{Y zjne#y>ZS8#IwCpN=jOHBh13|(*>QtuVxji}_0Hy%^q;71mAhe`$>tx4L=^?FJZi?} zc;%#{j%QcW9T*v;$qd^XNyQiujbh4#lH)agU+Q$uk=7X(^cjiI;)m`&SxHsPTz%Do z9VeO{GS%m1f)1O?BN=e3+qjm)+Ma=L?e>io(zEPbVvM)^eTaMh`twIXq3NlyW=}C& zd+#m7k}j$2b8Z_07O{dR*mvkf)K4 zQVPxrmc!Y>TKp@XP$T|_-!zCxMN z?@0>gu;uvds*uwD(+6-^?hzj1(lIo6?HZek70d0~CL6j2#Otb}mo^vCM>D%_opYba z;E$wYG_)3_4!f6)xih9&BocA5E|tDnjP1JdZX12PbX;5G&LaNJ=oc0DGg6vDf=Zlp zLh-Dh#8{nb@0VdJibAUlu2&V2tI7@sbJt#6Yniam=($jPV}5)$1OA&b1_*aJDy*qt&y|HJ@0^L?{wjTs$t!~_t8zm z1W$mP5k3p?0B8Low%T5rW@x6JXGyNG%1fZ!Xc+jA9${9*&#no6i#Kk>at8P@8h5{(e`D-~$kO6y5IRuEFnjqlvpeaGDrs zis%#k{G{{Kk1ZMl={?2q4ePMKxVQeqTt59;bkQ-=T_mPcVa`1 z#URdaasLTV5fjd76I#%sP~LE=7{9BHn>gbhA(q6C*xKvZ*b=tJSdZ@79uL-T6ZiDm zQhX)2f6bf0iPvTmH9{C7i3vA#znq*cOXvg3?IPMf`zQV*er^+$EVaJEU`>VGxiFwQ z-f`tER>G}ljb<?1){lwAu3SU?2bWv>nP#?-jAC7V&p=hl`(SkIA)B}VZ%yOz|F zc*AnT#5g%2v>B)r1Nf_s2cMQZq`~)*P5b{@hrZc8a^s&p`(_rYXAZRh|2tID|B)-dH)S2bw- z6Mm%#!pg{RnqrleO?=+X4gByeBULtKFha?<^b9d?gt&DG0g@}qE~C@Q9MJpLYWQoO zd!{xxCyiW)54k;Iw2W2p#ILtOod0Ah#);cLh&36(er|iLP-9Kgyhu@SJjx2J4W`Nj zv@GI%E#oFI5MMg6NbM2NlK6UcLIi1?#dDa!vWtmYFYRU{uIC;aCXayGidnlcxD*ZkIK zyhWnjgza`Yhu-yhb2%Um?x_Y6kBsSywn}TAmFvoK95*wLGm$Lqd-8VEx#;$XbCN=a zURRsA6nFq;C5$cwJF+prIXd8QG~)%DK1lWg-^sYEDVB7NJHX<3aGuXM7O&CoeKZ(T z#))AZs14djh`GrhCh@EZ7fQd()ev$^4fVfP8iRf01FAhBGDf}d-r6s`$})~*b-!8r z zo%cr8})!$iyC+9;-W%?engatn39X?}#=7|;LH3)fEVxD*Iz^Njeag9LNIH^&VZ zZep)i3>%zJ(v&8C39nwdG@XnHBLmUb(sV+QI$-9Jt5{oN$@x(fRO*?l*o-DOcFpGI9Mf~w|XZEi`Bg!{siT^rX-KQrc~d? zj6{*#RPf!b6)KEZRPu!g6x(?U9K^Pk3yj^VlY0;&C>u7JLDEn1ez_c1ch}O=Z;5#( zCy7v_r^M|bapTKjiM{J0XgLyD!EI>X7criE#kkYt#{{xuN%t1UE5-PQ>;?>bpRU%m z=q$UD2eu{aDmHz)eaY&0N>NId3PXo%^!GmT@x|RM0fmSug+S{%o*mbosNMlG+e_C~_pXi267to|+!1nu$E1=ktNZ6Yc~GmTh*Do#c8EOdFL zXY5ZrQ<4(IapI&dUnJb(obLYQ$yV=-k_#gOj5pO|{dgjqg0oDyWxSlfer&0&{h|SY z?W+1zjKhYjb#&s@&=|vs*Vdv{awM*?NregEq6q9ac5V1F?*+0*kFGk1gyq#vr%V8! zN*u4Ej6WzRH+vMRajW=1fee%BZ-YIyW%|YKOS6RHN!*Uag3786bDfGJ*$M;2ikPs; z6O-;84x)~L;*xba-|~p#5mFKetmEN)I-$68bzQ1QC~49?jK?Gyb6>@CUm4fJa_^-^ z8IPn5ey$C}v_LA#a$*@@GdjRK+Ffs|FDEhK`u^$2@ir2PgDl^{1kywT5@9Z6cM$WU zWDNWxn8xD;jISD`z?|AopFRead5lhkTKvQ>%`lY&2k-hvNl+Yv;ALy~(My6|(;>2U z4-50_xt~EsK!LC9E%Zeg)mwYU2cG&sh5m= z1gBH$%w}gATf&4b!FZHYvtmn&4Ip@|ZB?V&iY>P-RBg{<4~VccMPlfsN}{|tgMp(1 zLtZpolo~hc6^z>c$v*SH9rZnP1i3g{SrlF+g&3t{?PM^A38wKHS;iA0FW4=laHAA5|>{XX+<-a$>vjo4*DFit` ztK^O4iWDj(d2Jfjp*BirSGWkHZ*6MzL|$QHW+n$}mHIWoi5_%Qr<5Jb0AG@AtkL*k zvU8T)63oJAR{c)|d5({G_j-Kv)ERH^H%5@qAHflN>9;7jR^Q5yGz!ioJxLN%Jai>% zA;`KTg1fx5svic^A2iRO__NU$h*(6S*k5arP$R-6-pC*OHqIov<%z2~f*Uu6CiP~O z86yZUj#-mjt%cYHudeHEzAXTJ1!+7+5 zSIdX4y;(tf36kfZRnDM-7tPg=-MYb#4Rsb;>NO+jw7<7yX zai}bT26<|-PT52?BNDtL7uj0&Y#cUjclJ+Uy;*|7bMKauiA@NKXgJ^$_)pNT*IC@y;4f{2z712PlKZwE4*wZPCzo>G zxfx4j0>DcFuqVv=X?VJ4^D{L=a8X1ul6>CGi@VsCumC-jxc!XES?_WL1MmU}K)X4( zsV=3V<%4s2z>)2)tMS`b_Z$A~MeCR5k2ctqf3kIwX&3uUIEQwwN8Cg00Cyt9CaL$^ zC$Fx??@vw4I_M}*|KRMS6GI%20X?BgIv-xT397=1>wW^2iND$|{}R>cL<$}LCk=OqaKz0-*x)AN=^1LhE14iKf{aVg?5tJKt!6osfcK=&7 z`De!Q*YN%%#lL>|MZNzp9{-<`J^6y?;LVhGra~3hYLhaFhP6nRy{j;>&)6{-afCy< zZ3RmfRP=fdEz2@$n}Qg`QbDtTXCJ)wX|l_~Lq7eLRraL1^>pk1-k-MTO17;jyQ`Z= zs8`c07`t_f73$|Cg6*S^>O>S0%)&Xtt(=_PG+yyiOunejDshc3TQun4yy~`uP4y$j zm_aEQ6=9=Ux15U7woEz;#xFoBfPPn!cZ2`PS!OGo-#nAqN07}@=84uqZAi7OejS(` zVY=j+J&7Hh;$EwLVHYwHQrmEe#+_oqmhji3%;3~sg}U#XnzN>Ds-4su!{?YMo!BAUi-*+&*s+*{*#njP%O#kP%?Uf*N<;q-F^NC{yz zXS-~T-kG2ycz;g&cqxZPn(EI>Wrf~s5%7ZVzrddp2N~PzC?F6u(4+ z=h?|&g|eem=|Z;`=d((uZm>dmB^p=qhb`uWuoAa%{Di_eOb>31FOpFoW`$D83U_Uo zdM-5atTN8BEE7K6mg%QipBc(R@1O^EjF*Dz@nm9ov-k?QvtC5kZQ%-8C~fHSl&_L| zuQr4&+)neVH5!j#8Zz~&9nj=;K?|h9)l}EcREakL zcp_Oa2w_5ojl8~{<_HS*F3AU=i+UojwIPHR+%A!tUz5&RFZfoZ;=Bnj*oiCF%no)s zhb~^UZS`PoR;XG_Q14>~5=!!1TpkSTi-k)qdC4DGe7b=Hq&r!Q4&Ni_#GhIlu#X4sJ{iO@Q)3$h(=WYD`_7}UOTa$eEHETYbnWLSht zX;eV+h}6#4yk4q;<+gP0GiRI7RDH}zxN?i*soUh`@!L-JE!hBeZ^N;WmEmBZXFb{d zbb%21-~q#H+rYXCH60sUB#!CBwnJ){N{*mejY`eJmAmMMoZkGt{~3UmP*dHcu?%O0A1WbcxF4=nYr*%+qF{hS@V@B@x(%ivuYWT=5ChVQ#8SwD!djHI0;I~k4lR>Wrre7y7 zN0P42-Qrj>rx%3xW5q%(|7gDI5P-XzUlS%5!~0hE=En#Lp_lC(N-701hQ8JxFobV3 zR7hp*hKj(9OqVPu&f9NZ#hhFQx2yGWa^SX{!-grFfcTp}>ovjYiFLB(a2MjTK7>Jw z@N7b~lj7@yEAkb{DE8{LP}bSjF0-QDe0N2{O#~O-6az73Mdx-Sm_*PiKUySwiHKI< zE^ArOvDwVq}Hph*kHlhLeqTAn68Hh>)UH|)BwS07z_j2 zVUK$=6XAh1i^F7KMw*C@1`_rj+6GiOKk1ihN3$8{M$t`mU zSFDIYQOUUm^HO`~_Mzg7A3H%!TFUtQCn_mm|m$vz(Dt{TNUl#V4d+^IK`{h;s z^5=gABflc4Uoq^j0Q~=p=s^fw=HvHX0KZ_6UjWoE^z0XI`3rPKknvxL@h_~tU2dc? zj8li>mldp*9?Hj3)8b7wN)Nq*9ZSXB>-VpIF9M-+O3P+hk0cV2inGR;zX332F3OnkBNqV&rGcZY+}o0!wfOV+sHkL zE4T-Fjh$<2dm&tXpL?%EO%XCxW^V~akS_AQ|84(U$Rb`{I?T<4)QQ7({hA;+_U;_k zc9=;JIoNMa*CkVMOI+6ZUto((K2)Ew*JbCG=0DkrD_;;95QOBr7WNIg!?!eavR})B zkuYl4AF+qT5M;Z%x5G;%9+j&@)@gtA>q|k9o*OyYb6x~Uv+V9rMI78Nb6US;7mWE% zzF`3tNzA2{zg^413ZsWotiL^sA-#9L`8#9W>mGG~EjqhYx z(QGCohFeo93T>b3DeG}q)b1KWJe6$7-1YWB1S;1;zbtg7RLxRlF{XZFfAQEl{400a z+{b1#o92q)&spF*ov-~w8{Pq=nQg$m0dv(340+-P{Hc8|T6P!sV=0^3w-ZREn2D{0 zWb3=tLlG&!2csAIeVgF+$ZLl8=fmw{_e2XQa64|Dbswa4-)r3~S3?qyXWw2j0CozY z^^22+j4x9ZSQD2^^>Ns55;egjD!G`sd$K12b;nw_Z5qg$Y>85lz|?DgH5^yQzapsT zKH5REd3$@fIRmNts(-o?hqbJ$anFJWyLL7ohbdYbrcN(_bm4@3tWnlx+WQTZ@~pM+ z-b+R3StBmgqx5(OBzSS17vJ(`^sd9aLpHLs|e(`;Vf+|Y6tS?~{`Vxm7|cf2M#cvRiWHh>(6Zrn7n>-% z9gxailAsiTB>lzf(yqetQ?=|8(T6exSpJ2VhfK~<6!^=p6jQg;1}jYR)ul(&aM;&5 zHQrE+EuFEe_C^G1^qqcQ@&dF^;x>ogMVNYRR+7I8{?)M+qmQ@HY}(!tK4()D61LYD zh~luxgj!tVOdg*FdHm6uu<%wmn{)n4UJD$rTgDn2Gkch z8+8$^$3;zp*9&3z3#G)y)nT~FG~ou&|Bu2uc9(#;2>R+bs0R-tTeA{jb~*>=J3_4j zMc`t_$Vz!Ic_Ti!uS6T9=J~#xC6yd>B)P*X5~XK#Cq-Kr-E-Z~!(b_<6LX=kK@oq? zovzovk9NLpV~<@7C9U*2^N9-%tA6PoL551!XQq^A!h;rRbZsGYPxAD814$qjxpdy8 zWGCABb4_Osf*7kQ%kBUP`*BL(=B>i{UtA7@T&^iQ-~e)HkS*~@0pxIYRYnjf>XA3S zUkLCp;bcgm4$R#1sc7*@kh58(xkSv#~5U7zJD8erQ^3BDIbSWUiV5+!z%ne zw}yiLJ+$-9n|lt%Q_?a6nNRI-SgX2wo@!L`4I#(V}ITPR+=m zMsQ^U#;`1Vq*cV^3JZTI8Cic^={QjQ=}n^YbtrK9VM?1}N)ey8AAvz0c}~m^07A1W ztSDq4G)g+PXD{$YCF^u8Jp69f&^J0vwd8~Tpdid;?s>yIk~2Oo8n%ERRY~y1TROAjo{RVS)@${`%(RUZA|*xU|0- z{uOS)XeLyKu==9t1&I0@(Z1HlrIHWVTh*`BRKe@^*YdX0d}uG!oEQp%t)To0b9`@q z1x@ufapJ58jIdQ#|IH*EdkMpP(V)@rE$rbYL0L^3xO_K`|8E+ID3FHES{EoY&F0YJvM`UPE&|g_>*ipWRGu+7Snj{?{yb zP-b6DiWH>1N2gV4VL96nGqwUp;`2p~Y@x6w;gv)XQZqHDmm&i5UgkV>TNdVNbW^{R z3S)}AZ&JkTdQt8U;_+PxQkVTf}{O6-bwm`vj{5QckNe_6b{SVur4qX z-g(p9L&&-qJe8qGgqB2#Le>5HoHaPCQBKW?3@W*0&925ESSHm58LhzO8)l*IpxJbW z?fq5puUr@VWnyVI%oYMIm!c50w!UZ{QlTUl#6&E?`^H00hDFd9cN**jaZj6DiG2;@ zeSI(a9<1Q^r`pq2!LV1ovsw>pMsxPd@)BT1kdpr?ZMe(*3ujLjTy_kU35kQ*h@Q6| z2PM)S+Yl}Zs!sJ}=h_mGS`bM4{d%&nOgi@T9JoYDOF6*I-~FRzYX;3$V0k-p6qd=2 z0IMQcUmIg58&=})d3F`_?Wdi8yQ;@Nijr1Sz@)h2u%f%~1wd6*kekU!O(c9s%X)E` zDfur77-;{)l(xfQw_i&>jI5b?=c0l^;zq3c=D5S)8lJQ7JA!1E4C!;EVckp1rw$kg(np|*zdO0UEgxy;1P5U!0x_r52ueJ|$io#P1wbYT7>B|Nxx{TD~uj<^e5;}NCaf!_tGev7|H+hu{k#CnrQA* z7~-+@PVXUX$eiuhL~v(;HPU}AiS+rY#iZuBN_p6jZE z!Fm24jiP}uj<$#6A1?YAhX0qFOe*|UL-4NGf8&lpMX~1^?Z?|_Hg!*jn;s&OS>w{r9ROU8T9$unL%Da^(AoQLK?$bg@brMCjJG$?w|!sA=f}VvJL|1>R=}R51h)X# zqu*Wg@xoQ`22?$}R0_e&`*w6I_d+c9sJ7bn1hl)Qcw@KjFM}mN?O*Bx{wNYUXWk4i z4T7GG)z@`SG(8<$0A=aB1D(Yn%*E9*OP5J9_Vsc4_Tm+h9L;Z6ti& zh-?wxMtkD=c9W9iA7%3vRPt0=Sg`KYExUa+EAji&2Cd?OB0nk8n~cX=AmA?hl$JCgAl4%4AX56X@w!Gg$T@sL_pTnkCR-iV z1|1Bkp3@DUk$heoKN#tbSdMhd6SemuQ^`EH&V@Pfy*VdB?vjB2{FgPFcHmMrzjT{K zLdo9VZg<4bA7g*!!TOl~tUnY2uMJVG{ck})U^_vvo&Q$5_~+B(ZE`#&pPR~n#>j5a z{9!&7cJ9M&2C>^&%ORA~6e_wD2A=wtT~fQ?>~>fya~Y=Ih@TdqftLyr98GaT;u-?2 zV8&H00QChRfGdCi5Q?@zC<*~l1`-h6FmnJO8yZh|SDGNbV$Z5%3YGDC%VgV0w@Z;W zg~Ccn5ZzUdsILEAdf|U^eSin-Dm<9%pm|W)1%>~N1 zf8*Gv&Y1Vq#zt~HG~^f*{(4uf+;|2Gu5{#i&T{7c7tH}GSxhOBKjw(T#*lLTU4~*v^UHiIf zbmkjRy3t_+`nO;wxmR&|yT9B`^f>Nl*2C$D7mJb&>VC(Zm=m5kx1IDQj>PkTUE9)r z^*b3SXNz`EE{WTE=PJ3Qb)g8PqjyJSH11JEy(u=*yzk3RYlPe#>kMHgoOZm(yYx6Z<%lyD@L8 z8N&36dkP?^!A|SEnb(&^!=}m9GDISB8j;;}u(MLD>uh5xzd8{B;#;O%f)0-*+BRh` zcK=U;Ff;FIj?U>ZZW|dl##y#DZt7{Bq9+_|nYfGG!YK>vNG&QIw|q-1^9)D!>3toA zCRN3(-MO+(wD)h?Ul4IG4K@@HC$knJokEDP&r-Gqng-76n(WOI^CXs|Z#1vqy-wwB zCp5-x;mtQMrl}Y8JrVbO&4c!KMFld?>=qX_z&x54IfagXVi@SqWowED6IC*d2A`&v zpOGvM`|83H@5aj0?{{@2dU|+lm-Mt`UabgKHt13f&k7rTSlv%^*=|r@=2dervFh!z zXnu_(pBt-Tw9NBNf3VYM&^ZC@+^Zd}OfuJQPQms4)+@p}>kSbqiJfByx7p+|EpqJK zUR%nSHVh$q8M#p}m7wizfm&_Qf@RSfbjg{GjP$e?g!JsD7+?mnQ_fKB;!a|fV}r57Vo zlH4{`(k7CT8Wu!^UM;0QQ(rjMLr6Qj;rXM9&`z;SJH!#1^VCcy+SK7@dJ?QT2dLgq zFlBx?M~;}8-+tDM-{>jk=H&bK&GKh5(;Tm!N>7L1mq&CKTZSLykL<(DEsdJQmc^Io z9pTMv^Sse`5cYUpnqRbKKX1oAT9oOzLg2O(Mw-NBhvhfUf8tUzv$^wz+(0EOG%R1| z(rcw4@o2uCKC%rH%cV1;f{o-2IxWJpLi^nnO6fku=0`$GTz?ly6<$CBG+aX-Oqdb8 zg$H3TtJKq%tWZb6TE7cB5GJPUsJDTR(SL{AgwgB3?7QI(N{%+}O58oYa8k zCwtyYWi3Eoj196^I@=~r8V?}pgg{rcHa7j@k?aAe6cWwN0^2bArhzM@ zbG;N6LN-onSJ?nT1l^Oue_s@D!U8jD6scdU0kP+EeWlci%mC=5Tgs`_Aamqy+m=-} z-{;q`xK_L6c^@^VcM+J8&~V9ZpqD9(-e*3kd*}HM+F-Z&&ZJOwzd~PAYZ4u{ty-ng z34pvOhl&D#4lu4h0^2$TZlkp?(_l#{09j706@@xVQVgy0RDF+^{T@VtnPCCU{7kQ zcZ~srnTE+v8rRGWVSyy(oX6p1T*#8YaMipGwtHJB_(S=y+xusZ$WOc%@Nx>r8baxu zLg*$Azj4Y7X&szb{8bIt+wfRK=B>;OCi?H`-9G>TgYOXPANaucym|^SA4witfNWT- z_;W|_WAGdHEfZS-oGSMRT_cn zxVHPuLx`$$Ke79^(=<~KTzLk+4&<1GOk)^aykWcj;Zn@yjn(Xa*b7>E{mK((9Cqy` zX{Iq;OtKSxxC|V=9rX-{!C_=!?5F}hX>u{TrM*DE2Obd9U zFctIk;oJqtZ%|1Z6@sw#nIYo?g)#0VIcCD9j7m2&itqmpG5it1eTRd-2Y8mSP;M9+ zf6jr3&FxTBDva;i_6`YQSRumalW!q8BX9*ag8*)RZXjhq*Yyj^}rEXns>9sw0)LKLCi( z+xcdH9wqIZUYxfim8@m;I7wRs{pR{tn*U0?RCC3fgISccM}cw2W_plc%WOnF26Rrr z*s1upaHKWDBT0ql2(`l^ObuN5y_GT={os^AD9# z^GBV_cU1D<%3@4lW4-33mr9EvB3~?Tu;n4}@k-_CGr+$yH2Lem19zN0y9*AO&vdP~zM=xOkK<=qi7YQCI!+%(B?86;eGHxl+c zVGn%=w*SCZlfM^?`~kgcgJ7b~xOjq}9KUb8(`Qz2Duh*b7j8QV(%#6<(_L z(GI)&l(eU3;@nN(V)et@pWxz+F3uR}v$l6T4rEi(E^m(`$ic;$#}Rvx@{rZ`RU-YM zP!R}tm8LLyfY*@Bc&E{iYF3{(Cj&;YgspMNP^$M#2zS4Q( zyi66ZeB^R^OEd*{*+s?q1C_k;rEZfYTr?>BpaGhHa#36mScaYPp7k+E?TA_EER7r@ zu6+H$lx7%yn)}WwJNUhVlyg^^{*cA)K3LCQ2Kr5P-PgcSP+*fbe($!x^kS3)t(<^Q zrH5q#Qh-k>=WEBoFm>;4%x(nz^5{+n;XCwmV1`kT+|%=`pyNfX?;bP=&tH<23zBv3 z<=%Sd8L{(4#LnYzr7Gw!kjVm&N#&}26}RBxIq#>5Ah7j2p>hb8231O3p^>+ zaa2bEFV){|RtE51_g38=6tKalbf<(%p3*z>c)Q3l7z5I^;V}|D6EXhi+(ME@L1p`6|F|MNU`4c>=|4aTqs_E4Z!J^7D<4lvZ$3xC`6{X$c|j#Ro9VrY z1(NGMTe(aPuk3s-y)}Y@dzy}(rbeRDG92H``x#g z%Z1`o7vx$`WBKvZdFp%I-D8&i)=M5*;&n&;=E*N_tWHu^VH<=momhW;loY1ak_f-P z%jasF9y70))%&T&A?)q=6pt&VFrG~C^?V&9t}J1SS5E)3ebyXt^n#N$R~2Se%oE*o z0nX0eW{uOMQDICc-|=YF7NsvNUNnk##B&of8nx#37dI4K>+;yq7>&BSis2*^4d?Wv zpo)2(w=hpW1UX~f-HKETIu{=BW#2bzu-xe(Zi@F(+-g^3il=DnP4$}LDJIxN131$& zDstb7rx=(tKZJwPZ^6BA&@qr)w-rM8Ee64G%LX0W2WEJ$)!U~gP4RHfs$cU+DqBH8 z_wF%;leC+26=PPdI!QY(E?xH&Ui1Fcg)NKWxJ@8-ZIh5Z<|y{{CM{ukOvRj+UXNjB zPumY?OTp3kx$`}L!Dv*|g(rtL!*S7XTB;Ioyne{_gAiO7N3WEY&>_oX@-L)6cDP(Y zPCjk$wc>@|N567KoQi$+i-yhp575~!GK=jWF3WzAySe|{{OlJgkL({U$$n9}r@vb` z`$b%m{o}>iFFqZvW{G6K;GVP36wiL~)wg;8mHnbG#6C+j`^9KPb-!5ltn3#pb@vO_ zSInz6J)piU{0WUh$Vk%MXnlkz;XSHH@6nSb$QHY-8CKyfBS(s zIz2$^|C3oIkD=X2Li;KH&bXPgX*i>$rd7+pSyD=CE}SK&_J4vy%FU8dI3y;ET!9D9 z-%2)uTdpUn?1i(7w?jh_643&VOYA%MuL1_N-X;`x>>wgwFJ7$_hU2P}FPEG^Xq|RA z?HU}N1zqy@fj}MfezJcd9JLNGPVhb1jaK?kPzS{R)qHxWscAyE)YHG)N7NPOh@9C3pR$UYMW)a$c z+lLxiadh9i>MIZ6Z2QMjV?>(!4B%HhnEmUhK~h%DsoJQ9jQY^w3v=MO*wp1L21e6l z^Tfsi8Fjdpy$X!_vF-uYc`#ZjiqwN-0q4{Hw^gMs2smRm+bnz=0mn7_4{Q{K`zpvn zRTZ%c(I}73qwbQ?sAp@KX01P*xsl*(%9MpWWO%RU4H?_8)fyO744=AsJVP1PKm3aktu`fw=*=gXx z!u<$9kkp!EPhbIbwluGP}G{khmY7Sb;i!KIk0|x_%Y}5!>RA4 zUb?Jj_mv_l?eMvU7Qa;5f0Ol@QDVPT+D}#5#>MC$$Izq^r&6nfMFzQjl`F;Xt$_t^ z8POOB+WOTcOz>Vf-n;!`xESKHtvyp5OVNbH3km{(8II&pqDv`+8sV zbzS#{0_-t`lgnB}$d*seNdo!thY(v3?4-c?jb700GwVLn3|VnwMC(0_h>nhI3n26> zx62CLJR?BYnRo3$`N9)lu~ObN+iAsspiPbrd8a-)7V{*wHMqmSs&ELQ;ff&Q3Z5tG zly0jZTa5ZOAHuUe4C?9>VXOiTTC9gHoG;1>P9Er#!Cv+Y-4_Aoj>#U+0Nzs|=+5XK zvy=qEx0_D9E()eH;@S1`C`?fJwu1;Y2$90uM!~c&aWlz*^e-R*hutIm-oSWm_kdvr z{NJjo(TEC0QZ&0i?z0Lgpr+3GX;QS3O3lG3RM3}@VUDx0Ny50B;2vkL!^(!nMCHFY@EioQ- zSlT+-0FUx`pPNR7N7e7KPSeJtKJ3X$rpKdv-dnxBhDYs}&Uqs#$&5o zCAbDI@8wxggQ&y)DKX?^==pmFFARuiy@+)_aJ~Qz@mj6jyGIKitmZMhxhyOg=?lss z8_-A}m7gyR!v8&`eX@jsea2qZ-n;FGj6tgU{zu`32(x9&UWHM< zmYeJBw&Kk-u8COjB<{HM)gbN#rZS@PXlPYFHYiX}lxoqN&~Gp-%m0*c`x;#@6^_ua zGhEH*OSr8>se9}(q2F*)md}rH`_hr#V~+^^cjuo9`V&+gh;*qR68de|p9%yJRM}i- z1m&=^n+GUSyI?QGy{!ygs`eyyq%ba5fX8^j=MWs8{Fqjw5M1zOSOU|7eP+^*CJ z82QO`M|bqH571}XOG5XE{4wzmxYs0D^tS?`DrLh*QBfO|NXPP_# z3{LI7gXR$o&OQ+R>HP0h8&KS84q6A`6B$|P7CsD)jN9QB5eApvLfv}-BRxRopaV*> z^E-97C`LMNxwMZit745s_4qEtT6up|uNTd-Ee_ZZf(#K}9jWu?=&1b?%Ah6mbd>a? zEGSg8!Z8KdQ%t-05>5Wf<+ut(J~EKGV{5rB|n0yGfG?&mdtv! zY7rKY)2+`QmTZHFvQq-l{`Zr(Mx)8lR;VVsHh4^>$_k$-L=+y0no0yGBFr5}Ere_x zGJ#u-;9Q*Fd(d+D5={H_N31%smdczZ;V=((x)V7vIxsKWmRYH4m?saaVU8Fs#D}l+ zp!vCGvku{Tuo7C1%NoQOoTXRfC$8VAHlT#GmTw%y-~v)2TX-=z8lmNz`!Kl7TIxPQ zjP!S!W$oPi-hI6Lwi(r7_ltm2v~lwiz<(tED5B5=Z=!hQC5cM=FgfnIttHi^|0ZXEj)L- zOyzzW#3(#|zFrWF#E+yTj7D&Q-C0bPLxvr#p2S992X66y!c=-Q&O9s4|H7rNuFm9* z$orYD(<|;zU9Ca#Jfkpqmm&`$jP+P=1DIRVN1t`YpM%>LnJk~shuhNZHCl0p+tLmW zR33rb8XO3H9SN_?#M^T>4ZQ!F1Dy4;a3Ruz6gVDu|8U8QEFNolJc`%)`|CGmS$96+ z_8TlM*w+t_Ie6~Mst65MRf>hzanHCD2iLuDPzJ8T<}JF$4tI;f#lsp?%EY)(0n0T> zslhM2nVZYJ!4MpM$p*iW2{hLGY4cKUGi7{}?(`b7z_5Q5F$bJ_?T>!bY2+=*HMt`S zf&KO@pC+7EIB1lE+~B&TLpWHY?q5`M)DUBs|K+U4Ipp^h4Xn0se*5mq&*8{hrQ=5P z!E`?EI?8CMu>Drs_8PSgLLlMwfZkaMD;Qc_{qQ@b02G83j3KN5L57PEWPp%^B`o}= zk)90%ARrvE1mOq>P8cA;iB@Avn4lywL71YjQ{fC~OA#x9v=bkdus1g21nb^gs~j^m zO$n<~KQM;uUk{ellZejnMd!34`?vWSRS+^Z`G!<#K{D`lu- zSrxg2bZ!cNrvDZupad&?`zr0`C)mS!BZ)S!hZV?OJ75nRPqW&95WaY5vjg_9<)Vx| z>|vWdt}|eC*l)_%fe=3FvM7t~KSBx11|m2)Cv>e07KFvP^3WdOPPO7Dez^dg$19L0jhF`N&54UeKZ7#IiQZJX z^XG)r_y>I99BQ6EtSN={+#MX31@ZweTTSE#&&v1SAms*(m;uaNBbYCSKpfUa;Fd z)U_EZShw6%dWRRBYeO^jD$q)=zv`xY5>FeNDawIce)6kPG`PAWX40B)?A5S^9~?6> ze_#xo^z^Wu3mh9Yx3Y&#nl$8k7mgWY%{&k>ofxA$Wy>UBB3JkLuVYa=0l1y`$WEkT z$Gb%=KcbIe@lRow9rNW3dIn2*@t%?*c=SyC7mr<*f^nZ{Vj}D%SoHB-U3@fI6^XfD zQmL{kN|o-8ANZa622{waFKHB66^+_=#|~vxxIh1rPL@^CgH@h6l2tMPhRF z@-zhGhHbuNLU?Y&S9yvQvH9C`QcBPLjM|_c8DT0XheB!xwwBjg%U+69zd7<$k`I=k zr0b8CTLrp?OS|xSu zxQgD%DHm(bZ=O7N5t*WXbNUmQdAf=3Y@QHqD_*te@FMgZWeC`S(I>4i-Qh{-w|JLm z?@qY=;G#vR2ch4tjMd(QaNA0Fx)W>-*Xl$&Pk5WR$Gh5fsecA|aDN=hx-->{!&HvM z-J^ow&v_bJsjg26~o}B4tC#?z~D-|3RQ(LxWi| z!MTJNUgO5#6ivH79me2v91AthVsPeO-EAaDU`i~B-|)33@tNH`VpNZAVk)QImgA1} ztV%lS;N5u+#hCnBjVi>PI82S>2j2U5Aow)~v>fspB0%(+?5XAPg7M`0>XXF4t9+6M zUU(SE6ZA2NMZbM*V99ICVNs8_wVd~mekV9T#dB8aH}R-W^1M{W->D>^oU*O5uHsSD z>UjyL@hA~1t2_lfN;8|oPI#KEd)zlzfK~j?Yq1N!QxR&yUV6a`WqrmO>@ z0GPOiSrd`CANh$kbx;_Wp9a*PfysjMx*%?+EQ5W^!PWT&l-F6c;#H~NC?|ya7Xx4r z@>;Sn%Q%uH#j6_PBl9wZaDFk`!|+FvtEz6&R>G+KO!(>NdOwKv6v2~>G1m&qBdnXWH3YzS_>=132TI4_3Y9OR|5}s=Km78#b`zBk zpDn+ z&X=N=?~q4Aj0gEg#$d<4#4($n{PlFLz;l-spRJg}BwxjS_`(O1YyGHiu`@Yvyxm_7 zhb4i9jyvj|76_LGA0^0I|0^9H#i4JVXo5!>$LD5~{Z3^7HNkBC<~AOcG?@E_29HV| zw92@FM~yP)B{Jbr=J8h9s(93C{k-@ya303$-DS8+0)H7d!ec`uDH(gne!0@|p2RNH zB{xDgRwjZto9Pbl?O6T@`{@6{sTT_K)Zue}w&U{bxDHofB364E1$bd%4U+HCJ3iB2 z`yw2$*}xuWM0LhO7AfX!*TyGubmDaHNwmaVy48Ruu_2{s3*77{x4=yiF$xdyl8jZc z3K64d8VWw(-H${;hU5qjKk<{Ng?pMjtReRTS#?tO1AfQNL=5tjS#F_y+tQTs!-tF% z2%dladNVH%@4lF{RO4>%7Wai6BTchUg%6(V=zq)Mmoq>c2(wpfR|vl>%LQ3~u6J z!kzCw3{%2n!DFXmEKXz=AiM}Q|I|T_E?QKsUHu`U zW0AMiK=B%OaL9jT_v$EO$G&|FTvI82^F@ED+f``JdWNlhtuUa0)&75LXH6cNbQ&D zxcuwsUL*1&+9&uNU{uVCV0JHp8gC^4VC%d$WvL30c{tXwoodV5JI-6s4d#GuyCeL15*R_e_ zz!+%Ump&&aPQqQEJ)yP!Q${2Frr_^r3QP6sRLwdcvQ!`qca{o5c<#AG;jxH$bk(Y% zZRe$caD~lPTbN~a6oY$_C8Ta6mUXAHIP=B8n&6AIOGx_OOBUInce1ZxVay8TmO^|L zc`}yr?XVeapog%I?oD5H*`n{=7qL>RjFxE)<+_{sd8gq9k_q!usP|a#*$tyPESM z)U>{fk^rflN3&gE3|%1lpW+dJ<*uu$U>#Pu%Fh*B>gmA;vPOH)^q6aIE6o1`M&DkH zU;Aee76W9TlTjLd0O9ssn=6#MDpd$2$1Y@EXE|ujJVsJ- zXAAvO#i2WLLK>z(EH;ve@UT0BbsKrV_qILqH&piTmVg)7C%~-J;aPT~7+hY9keUN9 z>sZHr!#26H>(gXealVgF{~u_ZZPyQx%4HMVGU$Xv649e81T@Z-@+?=GWx)KEa#VT1Duh|8i2kPz$ee7%)gh6sL0< zsD=;ijbq?^`_eh5@5!p@9QK((XT>caan69BjL!Q^(Pzco*esjzC-ko!WXo{FyAKj_ zat>hjCT%k=gzutDzbwgMYv_g08l=e_0@ywl^tlxhn$KCM>3Ry&G&{cBGp$mJxp-nH z0w)CfZFn*)+#mLvuxrB=Ibc?L38T~CRzHiP=WRm*c}PGHv6!m;v>qTja-fF&J4iLW zarF@w*v|7=2Cp4pBt;KKhdq{6QC6-U8%w6Ua8I(RxcGHxshHiVpOTaXS?=Y2amPWI z&iY45c29~Sp7EsVsv*1~u-?wV4wT;gv5N=K zqQt$=-T`&+eI*Z2M|xg{0ClJ`=PFQ#zr3{r>M%aR3RyQ_v1$0;cyMZF*-eX*@3M|x zIrPpLN8NYl)GfKe;5;j}vbj{wC1N-EjKpoZ(yB>l^wKy;rtLVyi8q9g0&efpissJp zj6P@KdGo~31=bG|DkDfj<;tkyi~HE6T|$rcR;aU}Jk-aWG5O}MSgT6XOTpAI0cPr_{rfeg)UPidGYHX`ECK6k})g(siWKnbEsO^X~OeMgb~ zOs}e4!T6ihVfecFp|2in$^gHL!-u6ExO5CXVpr?U(G(cN>)-f>vSR_FBRG#ZbUP0Q zCw0d`;}8aC{0QA~4uiY%$l)4T0&;iI1hBRr#W`Fh$KXy&p}SczxVQ=j)ngdkekSx6 zW(=-k;DRnKVmt@RX?qc)1udRWWE5Y~NWXc?(&_W(S9y8-Y1ez=2Vn&%Pl-jr*Qy87 z+2i>h5lEks>t{2AE)Jum4^8j) z2_;F3I)@>e*mrl1^GcDFkh4Sfdf`V@=1#oad?r`RKYmEb#l^)DTgyOt;NWC;9>;o5XIIym z`cN(sxBZV%dmPDB-D-05Klsnu_r=Tj%yoyjoH#LFU;60L?r&pb2i%ry?d_%a?I|Qv z)$ILJ=2D!dUC2|CZeoGX30xv-ziW<^gl}ratuB=bS&uk06YW|dQ#FP!jNfhTgXRd+ z=BB2#mezK25f4|(q@yKlM>NuIRLqEHsz`4(rb?Li7TKCF8#gM3aGue-FSeyXXc624 zVv5JSBklcWl*-0*j|30-x}behZhrph>b`9IzS(Q-T|@f%Ji-r7tY1}gsjaNU?%98o z?iuoN(1Ek3+ucq`td1qbJ)eC<;w%X}G-_7jX1t9mIl8^CS7ani|I!9ieh`a;TtH+wl^%W~8676+F>RXTII`Ye|>#lWoy1N}dr64l?w#Q|u zCm!z(W!u_`iHXaG=zVnhxumM)*iUXIsT|Lef23~8sBhd!c#yl{EZLn|KGQEgmCo~h zHNjw=SdQMgam(W&YR~wW6yD^`ed^H?@Z-JLLOhiuqf5QKyk;kvK3%G=mLw)x?n&DA zlK+?~`)SmcuOVq{_C4#Ma!<4rc|Q zq^*aV?PCguLba{DlDB;3*u03xgC@;8Gl;iFl8emtk{lS8lGM3-smJn~R_?gQE;#tQ z{{3|M1~<(rKES}tMwIjQTHXiwnJouf2E#0m(?km860p3s9J_2&SdmruBjoI@_w^W# zV7txr700{v;-m+*1T}jON!C6Yw`a^hF{C;yFS#+To-eIdzSe$r78s9f&v?1hry21D zkI&-*Z0pnYKB8-Ed&aL$j&lXi76<%zCb)Y+Q1(-7G4f9O)(S3K_8wd=jfI7_k6nE% zv*E<(7LK3!@yvK>oLwGxm>0MzpwGpz)i|?tM%;F>eMTHl+jL2n2>!6W4w9vxnE?zU z)qH0a!`ppe49tNRlgPE&1O+SBx@8iIM=k94<#PS*v$EHszWLGJuc>RCTZ{SS#Flb# z7Ej}cX&34>#TI+>q)-_}^3^C3U14lR*J)5J9bFyF3$#j-Jt&!;8I4-L@tA;h5C4ig zLN~gH0wfq*E3AeC<&GJ(3VXf?VJ1~|D@b--IuowgGf{k!QZ}rP^uR0Dw|6StXAg@n z6sz=DKHpC?$Nch=D-iNJFxetLPFNrr$=Y>=$(yAQY}o{d^SIU|Z!b;v#BY9)fwhF+ z(C5z7lW@zh?S-J% zY6^q3-h=1!`!%s9Geee|Lav-v<|J86g6-IA2&R*mt>ndF&}-V~bL8n^BDR0b+94Sx z9=Qe-F_^Y|4=d%u!Q-f*&f9)d?DWfrDb(wf`Iq47-0@39owLe3vtou2p;sLoZ zho?}f*)Kl3yDQf;80q=&T>XmBsgkIm)zGS2mZjHSmn#|TWTt4D;NUxqY_(=YFm#BFja@N za!mr0w^SKuXad_*B*aSSpBA}kJ>QR!rd05)^_HZ|ikk>~JwSzSS79LixDzt@*W*aP zho>kY{0sK%7vA8B<&CwadP~uI-8P*;`FrXBW@bAn_cD-hUc5j>;7r!QXn==^9ev)(5~`|6eUi2tyA@X;*o^pO zGvJefhn5ah=Wyc~+!nmIG_c;lm-_~{1XB zBtG_J_@0+1@lg+R2Xjwi_x-~Tj&N*0_MVd`G0I~|%G8sX;R)8n2achJ%|L6!y3D}?(!(b-cGpAD>RM77^;dE2%+Ey}kXGo~c^56m>ei&&(5 zMX_t{+g3%Y#QUfOzYdK$cQ$&ek=mH(KQa4%FX9zh+aEb{5E$E1WL=~y z!ALjLm>X14O}x5mp_yo#Rlo@EcD<&n{T$?$m8&<305RK47(NQrl@C+h{|pWnek=0H z7BD%>g9IaeF3TYYka+*FxW=`8r|qM|;J%Xy-2xe& z({i%~Bsvwhkk&~=ltVun4RufAyyRJ3U6{ojbE9}3XsY)3_fC$82j;`Kb`63w$*k`u zM8Pu~mX9oDhw)l#zhf`*d)6k`Q?RE_k*{+?-dUOqtN}Q!pNUyx0_vZTrVmmRy0iYE_|qQWvfHNTHC2Gs3SRLrXm4A7lW@D z$ey1|G?P<-W5dH{FW?xfxvQQh@rd!T6&McZlFaQa0r)pM^uQL5oyJrzas@6gY?`e> z2jhJcZVdqMqIxcAak0bGCBHfz3YkQA&g5((N;vLGtk&d3p-+mA@@(m5jrSyu6t_x1 z;0A?q+i-(d5I_0rO#7>cAKKgDURPNR9zB8apDbnp)$2QiS%O?{@sFHRs4VDlZ{vkFf*reCY@9hu2cl}RZ) z`TU$g-WW2I9Ly_MV29Q4Y|P(6Jpt}1mRwF2KuhGWUf;b$hF;v9lXslk&||mXPN@Vx zhl~95CjZPK-9o`nkA?NsrzOIVYs@n;mS%hbZ}jES2;QvaVB zvXMiO7C`E0Xs=@lv(x5=%bUty89I9JJFW<+oOt#(a-n~nAt4PtK4goqgM4t_PW40) zJmMH@-2*ikFP*9?-~?7Skb1$N@RMDvu0$P&knSbl4gs?3Sch?(-U~5%agFI8%QLR3 z-4BZDG9C9k`dcZH`(E6-AiGh~d|DhxMa_lWp%=Gzz^|2yS{P>6JgV}4Dm z?;-IM@Npaklz;R1AW;6ztH==Wu^ag(k`&rLp$<*}TC_kOI0(Tc6diz>y<2Zf&mNL+ z)9tnEGBlR&d&+fYTQ|W9{QM295a6Hg!wMnwEzuwsY+4P{FTgnX5tC*CZ=lF4TlX4_ zYVzFY4#TLWYS_qz2q9IBk_L>~%=OT!0uiTtA}meLz(@$JD}SKA!Vml7uYg!c`jzP%cl88_$(O;b=2f6eNIj0}+aga6 z)kX2}~TnHaH0k3Fz&k-iL9dzu{Pa)o%ZTsMi0g4zE532v?=X=m;pXt8IOE1=){SksPlH zC=T3yYS@oZ5m58T3&Gmy&mA7c@3^8kQYEYd8-z<#q^URukn(Q5V8H~aqTscf%{qif zF{%qD83QCUdp4aMkLn{AOw|Lx=G<%wEgluyDv+rOU{0RzubJVzAb}i}?E;lJb_8L^ z*yKoH`RPiSV6yI1&^zNl8IDOj3En@!?tuqRQlY#5D}&~jOM*M0x>IEaV${9la*hhJ|IL_7ns%zv z5WL1I;+RncQ*JppsR`L7RpyFjW>{THsc5mERWrZ!!T@h$odO(>U-~MHjwU#LYfhhL zy}RcQqbIR!wW*F8RNu*rT&wUTzEWj+9T4>zRbREUJc*lR%#@{nRp!C5YBL$|k2LVZ z*g!ZYYwoP?Nn9;AZ1V_?eK5DNM%Y7^Q8l`X=~opXtYD(-2gA>!ZtS1Smvipp_1pvy3FKBWtONvTje#!|6R6cddwR=|| zq}cQf>RHST^e7Dx@W?+7@U_nQ9tZ6$7_SQC$o(v60)7GR$pr2JZn|P9OT{E$Konxg z0>0kg%>E&G8U#>|z%zN+QD*-F#2IjnbSxqvVBsJ|?Mmzdxw*MDkFE9C>Lun;nk^qS zcQS^5uPSUi<+y`lN1(^Dx3vVwfw?Y+9e<7#E+BjSSAB*mf>H#WqFmjVucYXP#%hnc zK~G|*neG#NHG59#*03dpImQ*Eow*JANYgKmvQ(2}bc51I?@5zK)c26&i#M;KsLp)VcE;7|&j6)x8 z@Xwa1{ZESm%Euw9K^L%rNXwIR{)F>$V;WtQp2Xux4;oy)sl^dK_6mNmRhkd|w@+UI z!J_9SaQ&MY*ONma`Ig$G+q5@Q?Ms|Vcn zAmW2X*f|8zO=$;nT#-s>sD*BFx1IqvfjZ?naTT};Tb1^PqkiQQt~V`6J3 zQy(PWum)8X7W_2e#R-Asup$~U7J;xpT;_G4BAmaw0AMioQ8N`#QLb5~K-Kpx^^l>R zC3aUWK+c)Y$?ljGwx9BE)e_#Xa;NX?IrBNAP?=TkL*;d(mT-=5r|e<}X^kjYPVFNd zI}r9?g4qc1+V>`N(m;8}PdE^aAS2%)=)pwK+j$o-Q#O=GC`|DPY0yv6(g-?%(dhK_0QV{ zt%Kgg#g(@D@nCpNyqlG)0L>+&DfT*zr*=9%=f<_u{ zW)T>>vJAgggHWdZG0=tuC&s`>?&i==FM#^cPZMA=CO5mQWFw$7F@3vaFf!rgFO{ZBOix+8|{5I&GCOqwsls+WWfWp+_c&yLBIPO#pGX2I}54 z!c6&$@t%{wcLpCqUZ7!?CJi1cIS69=te!?D*dKA?Xz|(@nB}5xR~*7I-P#Ujbkm~s4*D@}Q^mI+v+sBgHSyIY9^MPnXK&RHF(m7SGz0&VmN5GUP{#F1~ zjg6a?116yrXxL)}lZYT2{K1K=yL@OK8ZK{g$)hQ6~{o++l-`d8J>3~ zKpYr^+0D`SwM!5|_5Bo)801Ne(<{9t;5NShD?{kk*RJf@#?zrlZNcA)S^U!y1nH%~ z08cshvVj4KyhH3=nEFZ?l>8Z_b@08oge5E_co@vqON0*WN4*yxEnjuea{P2F?OXW8 ze`oxWb&>wrhu%dYn$PbzdVWG)t01e=3XrUjv}O2ji_MZ%`fwsN*ZX^*8&F9DO9kuU0`3vRJ}W{b^jPz!%PhTlSwO z?*b%epXS{gnm>5|eYZT`ldCSU4^uf?iZ)*q*!uLrBR?UG;3S#I%KG}Qo$&<=%vQ1u zgFyKRUGF?nkE7=p*#Y&Xre4eHP^cxL6-j_XEw7YiH7L|#6pHMILamhA0zD|yTBC{l z0)<+&-wJe~Q0wzbL@yL-eLGlqYr9aZIH~vzFW#N~L`lO7N!qMCW>{uYv_+7Pgu_dt z#vGL!=Qb*+;48x}nUfn;!Jy-h`A6D!w`4EVb7cy72yVav^lSDe+5+Q=%OpZ_Rk7I{|s?TFnZH1t?&)Knj@c$`X-s<_8z2J3NpC;qg zJ?j6#z@Tz*E{wur<5{j9V+p^^igJttf}i+1K-%qeAk<&g+$doKAIm4q?=I9|5lo`E zptOC()6W9x{l^@l&K*U#^4nhBF1lY~j}z&Hm*YLNr}dr>tO+!+m7XP~h6Y+C0OBAE zC-81~A5GnycK+u0=|1nsth*j;x%IlwxfYOsR>a3X0IIede+4L|s(^@pCRGjTXtAE_ z(M-P@PT6=8gPE!GPRIIITG38%$_?w|lMJ9Wq0>SNbXtJE3#rg|A%8A69Sw;`Usa({1)m>p*{*9YgXVy(jU>0Ik1qWo-#jP{)ot-*8*_!ulCaRn`w2?cg7^TA^N z1rVnq^i|^|G&X3^c20)X_juko1&s~9V6`)0^`W=uG&D8twG8$jRD87Pr~E~M$( zjSW8Kh=C{UPIK9TMQ%?K9#K!~?_4&Mc+9_EH2pUIY`{rSU^2cm|CYu^Y+vwn@G9Cr zXG$9H?%>7rPiF@Ir+x?!dfT$Oum~Uo{YN)D{Lf|GasHqyG~0gERPX%hf~kvrjQwDDkISkp4((--(a}4V(=k! z`iP!CmcVqjVwFRDJ+UsJax{4D!~59u91$buka@pH9F{@G-fZalegfpqmH&MY|xMx|CPDeC;E(#^&ob8XZNnQ9shjCG@31~+p_OZ zk#d|*-@3XIf*5P^&EL!uOY;%Mv9+-`du*`nu}f<`@f(#>fW?Da0!Bb*ct@x}yBo|v zV-#q$VWNn;Mi8_Jd-O!m3IT1fVBbYR+Go3E7UI7f_hU7|kAeEf;63nr%3aZC5jUOg z_vRRXfkWW{K~}t!K|DZpyc)HnFnZ_5*DN)4y+$HzN z92^L+Q1aOg>Mw8N&AW}|b&({E@*~3ArO|-UBo=|^YVFRC391x_Ek{6HBkjs~!S~v3 z`vnnFce*<{pAfi0+r8O8vA{@92Kr=BLaZWg*BF)tY{~u;|383{7MQHXi#|d7?KYP0 zgs;7?G^~v+dDC!vxokJhYOf2AO?@ekV}JiyraDFqF*QtOb$Q|a>h!Zkor@4KzY_Xx zB?tHUMy_{Jo{ZuN<#s4=c){#nHXZQ#uCC_t%1D5l$~BjSNRRhD!l4Uw5$JR2-9o+f z8zgs+W>;!|wlWst#| z>R44x1Gb)`_>AS*Dtq}`&69P8Nn)D(>@*j}ghLjU)|5(CPieWYmOsh1 zAR?ZtV#HXSc5Tr&mIeK;z6tHBO*Ssob-1kra+Zx<3wIej)U{B#|7+LQs&cq#-#)I8ip(B$SDgv@*MyfD+O`8PmEtYPvgZ;; zpU!l$>+e+0ZYKZPg5b2vLs5TO-8P%!y`ob98U zsWu+>_*{8Q=Gxx76ERd}{M^njR}4Y;4tOp;KR9!3w57(x6lw$>$3lc+Cmi>nv!I}MDHX0=J1x(iNDi2 z;h%*I0bodsqay&?9w6YU^J-Xv_2|%C`~LE)!4ugeCAzY)-Ap*a_3V++qTWgI`%mY~ zzB^ter){TVVqNsDS>#$*a-Zp=v}kiAIDflo9oBHO@O(*h_4MaXpQ0Bs>2iIJNj2Fd zrE%*m?)G}ObmdC5R-Lh*8E96gU0P|r6k(zDmzEm+|tN|l9=HFGqlJI7aBZges# zj#>}OtPK`Q^_;E^QQ3Uyoa(g^fDPF!7ZNNOf4`SMlTGcWB@$PNuP$n>HqsM!jc3KG|2~MOZuR3hC@Ik}xtY9!l>>3xw~t2nycZ! zOP7JDSv!jH!*h+Xs};V@wq-3&BVMgr<$av3-B)$(obSbAi{@Cn)_#1bT)KHPW^J-C zRy{cQYt|=~_UrT=&Ksp)^HZzb@0AC{a5%XZ=|TaN3|PIWY`npV(@J%N`O)l(Tw)HV zp6*54*DdykP`^%PKx#E18G1eBxDkEXW&eUzlHtI|ZR|mkXDpxkAPACD>U4 zEIB$=Hw=1HPL!jPWQC#A#S4c;{9X*xv1HG*h$#=`qHBc|!Fk+Km3nBaw@*{SK^C>y7g=Aa$Sk|! z=Y87A?fWay6=S_O4#p#Gt?O)>*svoRv$G9bg);7CTYuy}r3~2YKSVL*!Ykuzy4P{{ zqn!@X4M=R>Ka7|bxX@&mM*t6!p+Gug@3ffq1#92PMC$^4vcUbov*SSrVMIQ}a0%EN zz67EKp+<-sOnKC`@7o682NWrRxv( z$~fUg_a%9!dmJ7Wnd{o`)_;&h`fKj(AhK?sI`M|_Rz?|UQP%(^<@R!6=>@ulo1Y{` zW#nq^%((d3(C665xge1|jZ zuqMP75^sltcnw;rlH5!EEgopZtgOw{^y-PWyj}9I60>UOpLxD0^W#0&_^0%f5n}Bx z4*RdvIhh?ruTF1duP~NzP*m8Nt?Fkhx9v4J-yE4=l9No8Wc<-}2 zE%wTQ(QJp(?R<@jRoT5ulvqzgg z$b3?tU?+hls*jTF>(Z_tuFMm38J6v`UUGMDl~3t2f@Ik$?CS;zS4~zAIpg_+aozh> zLh%;mYItd#77Vz_tUVvNLVE?HPkKwqo%jgVQVQDsb<*D>31W2GUKrhira=j3nG+F2 znl1ftf9DGt+w6#RnQi!(eUAdzQF+(*`-vcBd8+CM|E%}}X^QmKZ3hS+cx$631xwH& z+bO)eJrHn%Ywv}u@JXAJMgBL*2crohX#L1ra<^WT$kmE*-U@x3ifMSE#j$44mlx^{}OA&2T0>temAaQ7Jb7>omA@j7QF}dE4I8@aj##WDtNNR z8E$(icH5<2do08=H`W1Gl>}PsSosDdrDHTZ>8?)rj7IF`=$#aPkm4)a;h=K8&zmHc z1=5Jc&Q#t2usfWYI6HZE3B`F-C+j7WMcv4h@HuIh%%P*?r(DzC9>*b>ymESdmW!SJ z`Rw?Sy=1QfT3-c)z0^(R+OXfs&ANRxLUXmNwS`r&_q&y;$kL^*g8KY<)e|TTO)u(bN$c3gL-r^+=7DaZ?9IonISxoXr!C`g z%i3xl&wS~cF*a|qS?}~=w~#L zhLRSyFdQFCJD!3x@~a9JdC-y_fz$v)06X)65*7MQ`DO7*6+QJb;-?D9;GFe-KCAQS z$az|4CUI^)PQ_kZcN>(RYqR6PKr8LK-)b^OI-#5?GlLllZ*(bo-!qNNe>;#-eSYS} zhq_0^pAEMvBa;rIU-KhC#K(f=l8Z!|FIDx%Qa?Z|`BNUHOZ*kr4_h1^eNp{}LxFDd z`(YL@;vZK;gu#AGKGIh#FC4n$u85e$8oIWwriHVk?aQx@xPBeYKK1NCyr=>Q?-(-2 z$w9Z^th>btN|6Gxp9Dr*HEaHurssvS@Kak&nOheH^fqw!{K@%N+h3e~%t1R@ZW-)L z%up2ni1uUIy^OTY2APVxXI2xZ;wEGk9MkeGpyurHXZ^Ip8OKSt>?m0(uJeH1WO}p1 zEVk^hE~i46$vj~xGRaW1z^^K*-;8WLc5f$xLX*(!d_`*Tkv_o_k5X6;H#) z9fE)Va90xC&~9`u0|qy+TS%J$2}4wVaC%@4AsWY7cD+4`-jE(I6e*#4yqax?nc1zi zS>6;foySpk51J*1EDC5jOxI?Q#jE4Ex|W_;Q)f6m%C)l(8TGa3rEKcc)>mC;w^Fu>*R;M-hY3{pl3Y5&`-ptTKE-m`$!dd$(M>|!W8ilRyJC?Rm zoVt!ZRCYr%`(lpA>fHLQTfzBtjN=ap*Gy-sr$#^6B{cJ&g)MFJ8Yq6)42Skn1z6$? z-M2$UBJa;AU~gE}(p0#LJWiDsrPZy7gVw4TTv1i^EN(9VZ8-{bE=3|=tgzvX(G)s6 z_XC*lLwuptFu&V#lUSX6@#99qH1oPE$&~2ob_o(GaUH4^dWQ!|q6-q`Fy8-0qK$)8VTv{?(RMh@B6)LeP^AY=NGIEvlgr!_r9)u?fZ`G`FJ#_ zx!H@Bt?(0-`c>1G&))62dc}ICEp8$e=8j5?(xZ$IVG|Mn&Rx5EI;8Ub&YRGQtr`#+jjz}f1nt@}j&E!7-E zt!%{pE$Q=wUgV1-6T1I?=-M!OA2)5SsJC-k+pb>)*eM0p{&+fw*PRP90d%FJ^?2Gj zLalF}yQS)J&Aou%2_I}?@G|mdzFo`o{xB9#Q%JMZXN9@LKF{~#GJFub z6YkwQ{=7coBQc(~_SYmK2(=vb?881|c9}|((sp%yQy?M3MuvxmJ$BG>UhSu;$DRb0 zJ2H10y-y)Gq5*o|Z|>4bQOOnmR`NgQJ=ATAs9rn0*=IHimuQ+Vx5_xvzhIfJu`2up zP)4w_m3F%rVZX7Xz1qX{a--_0teg3Zz|6l;+GtsoqI#*$7*Vglv-=16-ml%6!byHB zV|%K~HY%G^$5eMS!t1Y|qg&d4vJZ?Fh%8%py!vI+a$(7xH{kxjqH{m}z5O!&s8Tpe zcx?JWpjhVJEopB>-@llkg{$MEug>O#&JiAl&S}MUcWLCGJKaqeIWlo>b&obk!P<9j zx)=95+hPz8Q(Db8n2F>xbEJlfW{302sOMFvwca|!3rbR$to1aKerM&|42vurlo>&+ z|1e92%4J!AES%8NISs!A+iWCu^Le!WW}+BA-mMg9!FP%h|Nnan+W-5$!>V5D<@IX)>*W7^}A z_F=+6<~z6V-*&F64YeP_J)KztdpS(T-JH?T&iPm5Bmd;Tc(x#6`tUg-_HFB_MWn}h zLCf1x+wYLgHO+wp3{mZoPDDJnU7(7cV#Rkc1kH`NoSL0yNT7i)7qs(CXvClScr_2Y!9Tuvc^2hSuKQ5Ss ze)=A&UcRF_1paGgX8WhLK{tYTO(ZMK1yh9|ICgC zoj=H`iR!&WnPzko@;tROUplnSs9Ctmk9a!e6A_bWyHF_f(r`(Nc$a*IiO3603h zoy(m1cOmt1{dbep9KS2^4gOeAyf^%Y8uw9S7mM?iDm%6UJCN=MYz>rP0J92s(;1ZtRU|0i+N=&7xg$K|DZw;;RY8{ctSmxms%e)&9dRK<;uUsMKn!_>LMHFlXo94~^3mqE8TE z7Iw!Q@&i2l7s9xnvfXa{M7VqnqCGq)*5(PC`8e-6d$rM{;I%woSN z%=IXJSvpQS@a5F9Z$sJ3C<9;|MiY`QCrJMeeUks#U|Di@dD*wO`A}PW^Sx_iaY#EB z-JfEdM399hcW^TexZUFRS$t^>$VK$Bvx!14hB|Ia#pBI&&ZzmZpdmXO6?%&pN0ZTg zWx|xo-^=pa$A&OeN3m^(v+6^ZLv4@J;Xx1bgpJ~+RmTv93?4zracv>yef67PC9vV! zyx87aLCCYNOj;pp1xzm8S3F}8(oe^O+Q`N+@zlkmPSDbe^*B3<$LfE2x1(*MIY zapbn1Nb=ZnRK_+gTaPBGCirH>2Y_XNFP3^=1mgw`X)LlZMr=7)k+fuJJ~sbqMpk*w zqfZP-Hy&YozDLOfiG&v$=#O_=3w(^&5yi<*c$&+$U+!rGL0D2t7I=n^?yaCj-qV{5 z5j2&f|Hrd`0bKslvb+Bb@azb;S)dv474UKT3v5d|zK>)nlCN%G|K}Qs6r->r+401c z9lJDFzU$cq&jU9Fxc@926SKJR*kV>`PJRHiyS-pV^^7WDvJSW|=-P zRCrUT*1k-_%HyMZD1UstW4Dj4{Z^&cIQT||OYFCv@cVS5&uKR%ntw@l?A{*&1hjD|8gI zD-Ps`iv02$!Bs2)A*~pKpnjW5fg7w){;B5Efo8?;1<$wfkGC5{0+3bybKCOmjSz|n~{l{IPi z_g)_NfN6=E-rcrz7F!3e_t3^Fr-o{so+-3jFm6`!>NXMcv^QSidL>fhe20a_sKbWD zt9~x_jC6`tKd4MFs3u)AqmZj3-$+ES-*Q>2lfd*I1dd&4X^}~HW}7Qj+9|LZx*_F# zB_F2fro{Ef5e->|Ac_B7>TDI37v~zjj5e_aRPx~dl*z&(Bag;;wb+k+fj{){US-w} zOfdzbRUPkZIsJFfaX$&Q7OpFf-_TWHO(wU@qPU$mmzYZx}A5KiAN!0Lm~_+d1mp$<11@ z@~f0*wc@S!)z-`9i|p-919(BymVM|9V=6_n3y9||x^1Xm+7TwI>#8yIPqeXZ3I00z zB@bhWZs}YmeeW#OU8+eaG)qRt9fkM4;yV&GImIe68R`kOi1Hd|!%38L_TUf)8r`=? zCuen6ZH`J1&KCkt&C*%qEzh<-ZHUSbsXAdM<5dei@m$&bM^msGEqin_Cg__V?O=lY z_YUEQZ<~2TSETHd84f62XmN}6)%yK3X=eW#w_n!LQoN zJNTV9^d$DbFz&GM}49wA>OS;M;&q;-b)VITWm^rT7#LdQmGe!0}lc znm;EgR1n}&&0IpfGT@WqNR(hVIW$`btSN#zQ*J_Cz7ZI1R@0)xH8*&*=7pu5A1+r@ zV_Vj|&L)%q7U@7%oa!^pmh*QJ-nj#i!JL#a=g8oIu%gJkmlm8yQgfWTS!q2E+r++Y z8iigixgZu)nV{opNwRAO#&8qu=ZEh!E%|sPjXaLTHq&L)uQn)l++4!d4asT!a0Qy| zM55V8&o;5T;BI^md|rR19c@3x|3>FaG7yxVBy$Kl%r*aVXCX*3e_oP(r`jSIkz&RGy-L-#|3ZOF;=d*qKY1$24o2O?5R zll@$0jr$FXLibIXoh=3Pme*>`jY%JH-_2-_1^nw0cxMrxn8bLBZ{wMUujZ2JpUi z_^&gRTqHCbGA6}B$D1F(cU#{lG&1~#fQZ2P_FR+PdmQ-gO!F&?kuPr+4EKIKt9Oy+ zsRH;`<)2*008#Q}O%|18DFr_9Ynj3&oaz$-jFUBe#E4#9ih!rQ5{+hK)+G*zZ4uBc zb!A<$gG!#x(2EQ|RPDj*cKkqa419b=QR0yjJwsBsZqZN3Mr8eioAsRw!Kz83pW1tkzA$SvbvXnq~oo47ttc3vAiGG^akjQEa zLNxtN@=a45=nq!C*s`R@@r6RrnY!c69+pwtgsgP*-YD$`?D+HxTdI$ZyPpsnQAhE(TPfSt<-2N&S8OZEH_c>q?S(?Lfbt(_ez|Q~ zrD(cyvMjg3nA=qj|L%3M`U4}ZC%<+ko=<2)QEqT2S>gwOkIm$^giLcJI?|++;ybz< zUG!LLWV7bCh@qZ7Oe~)0^Ldd;@szUW?I#N7p7$&w;6qgca_&cy8XD>%StU$mGT-vL zC_sw3`FCC@(iuMl6IJH(cdJu?v{jdCu_Z3cHri27Mo|FVM*v0PYD>w1;WF4DPzo9V zgIEA3a_SGjU_XTNvkx%PqHs#2W&;MNC6uh$|9Bh;csed06Ke`0;GtFh@iX)#G9^}t zXY{+zNye?u-?LGAgg1Jl@g?y#+}6|khj)CgNj8@QX$So340HUi8zNv=K`GLi5$J|BMw zg|e6CF|9W_!BaUje!23KVm8#w$}$W6rANz)UJIJj?3@?*hr0>CNfG|+e4+A~Ay}k) zI3woL$h-Wfmy9a+y}zq&YxvE%@{3;JdrF}+fJ5l(3SkZ}4p?Z#`}3l>9%+$E5ZwCt z8cp?~y`*fe+ZL$X#i$1|Dk2-?VR>}r%VUNb^3m!QOy9@_YtkBemTnM*VQHS5gfse` z`=4)Y1*>fwL>HJOktzMje`cc}yjL7}4vGT1pqRdDF6Q?|qO_-9oFy9Hxc>q8le>)0 zWkSIJA---xjIC7h-lI}?vE-{~#3wVY1pi&bAwm;T`I{E?UvU&SxHh0ldC~@n9iS!I zz?JeBW;`C-)-?!lMscNt?jda{h9dLHDxMt{0vS1)*-P~iDA;fWMP&2@1(4yN7^nsA zH30+p_vk4?zz~u{P3Ely7|4R6XRrXnFNT)QTi|%cnkw70T&ArC3P1S6a3O^}ZrFRM zBxz>Pp#{rzy*cGnVEb)~A)Kg;?xE)c&rJz;Rr9rDltOG-R z3)PZnl~cw&_dl-po7W+pzt70?{TX1)nP%_~s(@AMty=eC2fB^iLjj#6ASOh8!-GEXPE@PjjTxQd=ZqoOq^dxFl2B299-6A zkH%NUxhnnFTb|ka@Qs`vWL7QkOM|slcl@B5j3!48ni#u!^Hq+y2g%kDti=y^b4T7RRIjATVR*P|D4Fk zo^U+qE^(PLDE?RIi4F?GvnkqmGd>=CAmi5-BCk4{0%~ZS*F`;${5!}LHZLjs}Oy}BLDS7{5m4PG0!`)w@Ni@sn{ zt$?omta+@Ih*!*|OJ8uFl_~b^Ed+roPCP$RA%@FI`1dW#tU<}~VeVZPu-*h$6dXT9(!RrW!^ z{DdB%>CII%?iB*?JI~)qqX{vlH)qm=BIHH5#0#oF{5~R&XYP4(CT% z@q3oK4R^K|H_A;%1cmoYmZ2HZGU1ChIWg!W6Dvq?S)pl8?J6-+{s$A_wLNL|e=UO! z{rG%*=z+5ZseV*ZQFXHP2Q9?h?GkMvp4_8d(f%PQyZI~A@f!bZIS>iMhWZQ@*^fg3 z$_xUU@?ORJj4;Alh2=N{V%yRGN$}cH83iVqe41Q0e;mIxZo>B2+U{4&`AK z2%szF+CsCzrnEP9PiOndGMS3#WL*q)b?H0t1+W% z{r5me5zIlZ;{m$QrB-qzMRf$+wOTd8)YSQTragWL-elX2BZ}zejrul30(Ck~x&AWO zrbwv3?oT-mk=LFMxiGwY>`@0Rh^6NblkW6!&UO3nYEj2N1YScY)>(KPxps|vmZv~ z4H5AR1x&_$N%Bc*h7FpJ7&RG>b?UbgCJa_{UlaeJ_Qr``=LV2u%lLlC-)EfBySuzB zNa!BdKGvY=twiQqyT3%9aUW%cU-UVnF&OkCxyL!25Bra|*-TB=at4N9c9aOR7(FOq z^Ko+}?Ov{T)U!fSHfT2JHge*fzNYq)*{J+I5G_VUB!2{d0bnapLNe_JTgfNvh^(k2 zjAudDK@MGRyRTw&L{=ixB{*z^_1#}cTC@J$F-(P({Oq7!(-qv2@y+7BA2`YDa{bbNR|_Dst8aO#ej`~xPn~0)8@Uf2Exz7Ir{*t{bZ`U(W)kY zYAmI_3-Vi^;ua^PL{stKQ`}mn7euu_)j|M_F7;P|3Z#%Ik)Z_A_C!E6Bb|ao!HZin z)IM9zn-_5H|Bwts8YkFVN71LW5jb9ik1c>G`%B1FV1PHmydW1JFheo(54!l|Ok4s= z5?tE1HI}iK%dySX9C$A$3AQ^Wt*p}ooaeMYZcACq5Y}Y|TdZ#f#e0_uoFF%*mf@tJXJxOeY?v%2ai@{ve(F?NO@vM!UCZlMrb!5p9Fpv|d}S0Xvmb*4Ku$3owr`hQr0WHd#Ecnt z=lfyN-m|~r?AsP1!tN#wwUlSs-PK_1^<+PhLd=GgC@x7EPeUH@2V}E)KIFVl%NGU7 zX|()FIX9q>L9UDT_-jm@LdcLLI8&2=?SgC9$33ZiR_ltHKPfqCGEEf`uJ%uGYKZ2m zhfyYfhFj+74O!2npL+{;?O>M&#qC)^Ouao$Lf2;G(u2Z&*9&NE{4%Gf84 zP4}M|n6mn?l(WuQ`(z|c+6P!abCV2!g{~Wl+zg_eanTq7I@q+{cFg;+YxHMf^I5a_14@K8qz*w5{67O#Ys_=ngRO&) zPUx;DxeJaPkQtX?`FkZWiW@JI!D50gN$;ehTl)hVrCsz&I)TIGQ!eI7IS=0rweXUu&m+B02(OvC z>eH2@8;HoRTMa>e@`QDhQ6?^(gl5>!5Rq9?P~=Jp!bs;E|7os2%hqT%1@5=#%QMLC z0XYmEpIplRfK=!^(~Ag_yT(aQ-&rpXkDbUr3ycWOA3;wfPhvZ}Ub~B0Lh*iakWCsF ze~S=8g2Chi!4vMTwe2LK3ZgqRrnojZNi?8_Ewlr1e}=Y#)W-l-&#}WuOKrM^KQf5B zkw|`}i@xhd=Lb`p_odmWF8Iik4E9nnT!*yYW%UlDY-NtMSPz8b|UJ)rt!(H(&GhAtN(}WE49!o}&+YQK@1N{A& zj83F66xxH%C-X2=ajj zBqOH|udEz6K$XN%oOZ3f?*C4~&Out~6N;})=3Tfbd$t}0zrs9zhN@D~sv*F<*cU&a zLQ2`%*K87Kwi!I{E}ag6s8xtuhRYmX(dUq^2BAZ)qp9|`9xen1ha1V~6)0o2;jr#rW3YFpu*=>hR=U(xZCzSN;Lvflc0VleSqA8l!= z{kqD-8CId&j(omQIXds)N!b7H0*K5>^Bov(8wkS{5XPH)0oe#>l9WRpe6>XlLu*R% z!@UuhlZhD2Dj(Fh#(O^K0X)>|ML+ISl^{n0%@Y5bO4pQJj;esI_s@GQs%sX#3SD}0 z#Z%P2m+&p8I!xtVQZ{neW?BDx2ao321IT;RJAVdpuOSP5viwnYkvKx-bR{WY-%N^@ z`3((Q042oO?+z2G-*-{;-JFvzV=>%Lyrn2W1KWk6i{V48cRPiQqF*~!plpp)GubC2)S?>UMnQ_esc8vl?s>- zH0JjJ^*)!fPAti(&;1QIli3L;F;mYC%nKVdwZt)17NnmHhwWxW75U+fUv!%GoSFKH{c z%l7~-xx4~99s)omO2+c4RuM2flA&AU1CgyFV_{Y$4}^M`0J;eW$rG1|>|l74s|X+< z(&|EqrZI#pm@im8#75;{Wi&*q}a??!4WLzTa9410|S<#j#-JuAlnLnw0`u&R#mOI1$m>3`QP9jO@CMTmODJm1*q z0>~B|En^*&WzAgQQphrGb(r zjokMhygN!c?eVZWR{wjBnOVuQomWaW%~U@+>ysuD@V{YtOEjs`pQxjiWtSeH?1)U$ zDjmC)KQrX8&?W?!{hPMwHbtrZpcafSbciORj>Na$4Cw)}ZQoSKrLa31=6S#sO18VF zCV@ais>a+B#hkv3S()^|o(*uHRrIyx+X9xIaBf33@&=y=D6xhEJ$G zmm2^#TL3Jsu6@^ddg&6|9G4?RURTbA0` z7h)fTRmO}S*DWTxi<(GM4}PlF{_ft#0FgISAO`k1HR-6A z9T`o$Qb1EugFTgU5&#;@H>hn{U$aK-daI{a1$`JQA^8hJ z7B<;K0=Ci5!Nc5tpaV!}LA=4{_szO~xI9>(Z9P9kjLGAjr(|R7#2om(I2@D) z|C~m4^rc+OrXm~%*!d{3JmYCuphzdQzcYK63&WMPD1aMC0(E@~cROk)>c8ZuR=DO< zhe4)AG-F^1@UQ7E8fh)OkQhJQhlzb?f*qJo;cm&pp{=I$OIamdVMBF#_&gp1SC~Zx zDkGa7SpkUQ`P_R1Mxx_GtADk44(}mzM=wF>yNXArLTwx*&C=h5Hk27N->%jw@v^qI zO_OqIBkVn1ERm!k797q^@?(Ex^qzL{qV!JPPuBjr&Nm=#wI$tCXty(!v~K60{YHsF z#@3LcJ6-J;{o2rozhJQy3)C8U1mTiCV%|S$B=+V?ywtk!dj=fN5tT3SAtca7Ytb7i zRw8K;2;UO=Z8HXrhLmel(Ya&-5Co`b$Wb>Q<}D^*AEL>|{S5F3;`q^RdgKT@RX9})HJD}Scz>cw9AIZ&`$=<+41 zkyW}u=j9SHq5nW-bQfiqZNcgfEgS_m=bARCSVX7fyO``EP@!7g!5hB!5%P=`XqPpK z^MOCpT+2ZGTcdwH!#+*tWQN?oKjahdO6`_}ug-Mse(5$>R+llT6q&M*tOi{WK}6&3 zb{Ps+eEtxUIhXq^E1|?ZwtQ#ec{QQw*mo4pxUg9xe4|ic2Q>QU;xpa=f#i;(b$aFc z$+!+lP-*1TttvsKhDOs}x`opyBQgR>S}2+39S8+?t`pkgS47%RbZ2ziI_Tf@egnPC zl84(2+MIO;nejTF|fp{GUK!#x@yV)%cXh6}i3;$!&m(z>=*_BKTcy+8C-gg&k0g_6^uvkZ3WmDzh#vCTYg)?!`JE_i|S+(WZu)i*)NM zj>c+__Fneh4+;&OBX6AR(y9v)Kh^GM3Q8YedX1dM_u!a7)q`7SRZJSSAHi zRh|Kv`#Iydxkn}4un_|MNnj6Z-v)qf?l78}7{>$gxGn|s*2cUP-vz}VErKj&+y;i` zw@3*{M6&I@zv+ML(?Gpo8LUQET6OQD7;iYyN3I^>rL`jmEsd|{0(#`21=5~PNPKGx zqRMICCQDr*ezd-BhJ|m`^!51SPv*Q0%katQo$xC}>uL9Et`07L-OSTeSr(+V_&qYFsrX;odk8;m8Qfr94BCTUhlN8OlCQKo zUG&9-ItlmOqbo9x#&tM?HIu}AN%>?NeW+kJ!qg?1YuO)Nn;Amp!dpuTBjajPSdC=e zP^&VE+Ee`!9%G(ysMNW=Z!qK}v+YI#XcfT#yVoUPv zX*#lA!wR~DTR@RY^piOv5=fM1Gh{cH56FCoYJK76z~y(0oN^F-oj*c5{AH~xo|v~B zu*PPr3zpzJcFBhLt0(GX$>_dt#Nac)hUo7~cV0aaBulcx zxJWnc$prjZh!y@R7&u5q{ix{Y)evwxE|vDOhChU_q{;7NBnI$mfzMtLujJSYESP^5oD!DxRQ(-Xx=J+r*XBKbpW{9Y!dzVV z+0gNU;399GbQuLcFsD&o*X!i>X$hF;(NIHf_{y|W??6FCUvOq8lDrbhRsR-<@(vl_ z>YU&W}yA7;1rC|;?7{Gc!9 z&-3{)zN%t{T>vw@Wxi^wRpwiwjMz7^A*_h^4G(9p$9@0&C9pk*)vNzYeRVUdx)1#$ z=38m^7%#wyms87{d^U{h@ABe&nw0`oQi5nuxVUyoXB7~0P#Z_Q50lTGlI$A2$vG%z zXJL(yuoLn(nb1(%KVAI9&W}*xdI;=8n-ViKqb1vLJQiKx-;#bFTHfP%CQ)wqnY<3x z^vo0sM2gUl`nF2-1~VpJO4O&~g*z|h&H&s;ScSpS3mopfp(YOvc%$rl&R>RHHVjB408V z1h{JY-vJISB&No8Q~ka;69XKHlK}NXty?-ZWcZ5wc$MjXe+)k+lZ6KCQQjFP3irZ; zw3~^hLrdrxgpzOoZseWp-84yQqK||&zL-`tpqK%{IM);<^)%aD!0qLFMh`u&};B=JDpB{P(sOCNh5 zc0b|h2NO&WFZeGuG3gieoH*7Q6yzK?P@%e#UO09)N(m{sF#elPj7cI9!`Fzu4>4gV zGz3}CboLXbpR?US`#i@mQJgwYjXrOid+v%Z%1YVTJH86ZSIBt-?u8+l0s|3pF{slc z5=_(60awbA%0Wiaf?>*XLTVovgkad?=KB8D1^MIE>A% z>GtBNw4-8Nd{F#DsJ0Mv%kT3GWq?$lj$h6Qboe8!*16p=k2BqKmhLG>QBFnz)TbZX6>6e+K4~}rS}KT{kZf2W#H@DqS9NtfJv}|sn|>>y)4X`#La0&!hfi*2%CSU#Anb%r<4vbWuVgLYlfFoB15>g zfZiUKjRCw&BqN%WK)p&o(r(G^1KjQJb7vvRo~f(3k`C@?j2Z|-VUohP!B7mzy<=q0 z8yfp~$YLM+q@986$+5VSt%DT&I6w9Rf5MXC%MFB=wYlymoy_9)+0R!=X2w={ z=A(8cD;>y!1Hc|y1OMy)OdQ`P>Yg*RjF&QWu3GdZp9HL6Zyh(K2(^lT8$ci(?#s9w zm~iYOF3`8$>q-6A0w|iz(5Bxr?Lo2Z$XyYC<4z_-d77;Oq09oUZ1(+nMOQgz#FKGiaKkInAv5CgDCO{DC1L#stq1rhE43906Y|5MZz`2wFz_tllDu($8ai zjEt1PJ;)J9)p_dB-}LnpFdC{y!@zn?3Vw(Pi4Lzu7$Q*$pT}^Sn$2q=AluZ1z1+rZ#T7WI`-g1&2;J|r*cRHw{r6nC69!^Y3Dx;&L!{fZ8(c9mD zR(}t~dv&3P#Q!a5D<~<+GiY#8B}SpFsH|jmj9PdSL|~R0en>w;UmRwhS**$NmwRZ^ zxXO}&u}YRrrc?Q0vSt{Hhwm+bkWE2KlPgt%+xD-am|Gj8>jY$PjDuei5lFqixk6`0 zwnx^b2ryRYyBU39%#-a)XZ(5x5khhG#6V@nEfCyx34NnWx= zUa=O@2B4$8=?`<4udyD(*>PQuuGOnm?xnlP(_yFg&8>9NmgW8JIN z;4#(yJtP&Bu7P5FR!QO{?9j&t>T%!dz@*0v?_!znCg(TctDk|mw;t`a#d@ViU%k1) z8^PuHmmQ;dmC-(VuS^3JHn(TI?*y;}zuHlxXUdBQR+}`}oHR4MddZUJpFCYK^<#8#Jz% z6N!8!i}4@R?KxJ?_1gElye8#04wbWha$jsdj12)WZG6tChlk5hr06sBGXkI(o2ldu z?q!{@iq&0wB9UMILa$8n6E9h?WBR#>0J~IHukLiF6tTo%9@(hKVubs;XCn!P2xyyt zeel3#*;{spYV#F_T4Y8uyM;88Z2CYx%mm|RUI2>TBl5oNDIy>rLUZ_pW{P3B?WQk+ ztT}*TK|#dakX#?@x+=M`YRfDSbzV;dXuJy4)S{Q{?aP|S?pJdsNdc}kfWsZ zIRS)%>3T9qW5DTj-T+TBc{=Dkfx|azBfljl3O63kr>yTtm3d?#Npl+#w1w;K#z)EZ zO!OvV^w-Ltc#3h+k=Aq&&ZBrXUJjQS+R*?miZmbx=4Id>Y10HL4b8^n{F+_(ux|6R z1S;pse6V3e${kPqcq&2H{I=|xx$)J8wVs}R>2zVR$_$Zge@6uCIi_eDq)|&V%0fht!u)2{ydpf z8zc?tAV5&~zy;Akrh6n!C7fs5(d-*AAtd5qlvf>}48S57dH&Q9W@fQiapSLjl@nE3^>@lx?~;a`g%Md~5lvu-VPEFBN$3$k)w)WBJcVhI#B;3DhqepI_vCQV;T6O75`a5RCG)jJYxEcLcb*;3ot10_-5|KU#T&k_v=p z&iC=Y%Iq)UyMwybjz@b*qzkxtUqZ-eH}>;4S%KK39^>eM97j%(B|7t|ar03tW|?5c z-2?Cne>gV^<=pkp%RHfYizD8ls=O$dg`4`UU77u8A}*%Q=C4(wcX|uD;$QH8(^~VX zWh(1>1VCaYD|f-NMu4(D!JiH}e>xW-Mnp*SN?#F?DCE^EA0lYNVGN1qnK)9#hlUd? zpW?%>M$koGlX<+VuI~Tr?2K*$iuV$}El)&-f|V3xDrWs-Zbc{NDrG0>3y4Z6_vTsv zRaM?vhAi_YFc3i3fj?jsUca`rLyhcar_MaUdx*}!efz29t^P|Cs2E2u&@1Xh`jLz7 z1>B>Yqkk0vX%`ylb$-!fsX#+RR|iy8>Rr{{6jrUvHSN95s3VN&yaCb%uEkovn~Dwl z{kq1{{O6IMK}=k$$u~n2{OnM1HRkgv3l3rzV#{xw>STNy*ryWPEY?3E;97hU zi6V7jHoqLacD(a*^@zxN#0O{+;-Ty#M{^z5NM}|YCA{DI&?}8~;H%brcjk)^tSF-& zEDf+V%bStjgWCPcQFA!@ybghKSCgaGfBEm``2Y+Pg4;E*^yAxRBI?g7r!MK9aLM(@e+S|zJ~QS$BF^i-SIAG1K9_6=TacF# zz@kAQ>lcWetgEft6A<1Q|5B zkJm|shS!Fy(sftH5X`wF;=2p?wzLGo8oTG~^A4}SSd!v@Y#|UXLxTRA$3c#xC+FW0 zU3@uHgnr(`v!I`C1bbOlDn55Edfq04y``zMXinX)g7>*5nD<~^C-JMH@<^>}nHklT6!KR}}1 z@O`~YEBbzq`gqii?{wTrid{>DFM3y&SKr4=((iJ3!jcFD{4ltA zJ?}{&FW860P%}1YaL6Hinp~g1@xwQ^wuRwR57BdPc;$V5z)AE_A=$Ru@0P)|w;U~m zHdr@&AD}2x^k-f3-%RsVfK8C(@6}$m7CD@AY^a)t#W5#_@HNP(u}I8l%z-m z!NP8D#@}4)d=*4Co!V+iXD1^$GuIhF?`V)SdYqPan4#YH=P4Jj+ZjLXWOjcBy+Hv` z#G#NR1S_d~FfLAlFsER((VNMp#Rhi@#seKI%k2;tz)u$4ClpD4pd8E}{f*;YfuoO9 z=Vv@Ew`c)Hfe~w+l5H0n?Vdj?E=2P1>L7r-FiW5;05lnP(!aRqqDz z8w>e89s!AW9y%H6?~YNo?bB}^GFsr$NusyE zp*pQwTcml*nE<=bN%EspGqG4xJ@DNKk%2&MgtkSD!14v{ z%(W-@=C>>8euz-Z?gf!#kuuYuFU9VrZWs=>o?t_jRag$Fe@5-7!@H0qTD0?Ti;Rp zp?y-XdZidfMlx7JeBdK-EsT2fTr9Eu1F~N^lE)Wwm%<)q9m7)5(_21kuFCy6&zdeS z-n1^Ox(h&b_y3v2kVWLYbGiX0F_MNO^UFddy>Zd+74)smtPA<6yrH8%O#$>4Go-$uzv%bNN zX*7nToMr9|a*DevdG;_i{Gy-{y9A5_;v_f>;X3`mnnx&Xne+UIWYtd#Nsf;=&|qNp zwH6{`2g5y3+CzDX1O?CF@0h0LMH*{Aw=e}_*C9v=N^!f1bJNr0n;~23Yyq;JBUdxc z+f*US)K3Jfa<{{qZw&Zbl$Q4R9Gb z#s(PNj+`hFV&!SnSdzkQ)UA{e=FO2&-Up_LqLl^CEdSkJlm;^r4pVnU8& z`Rfe9=|I?@CaCccw@{Mb2SA6J8;^F8z?}J54nUhG5r`=LYjZBh=iC6}Z;!$x(TVgs zHtS93_qc-}p5LQ^+TJu%pH;(rA}T_t-dm3tKT(PzL9&~<)-Aej+F>~o2+Dj z^0YWbAYjT#sQ0vJ<1mJjGYA;SRv~J7Pm4CrYnNUv|4FNhyEj!<;+B?-nwpx=_<)Cj zHG2R49af{W9WT^$>Lk$~D%PpoYDK3Z7VY|ksXqtseb}7^OXB}oAqY*;e8tW~(1QSz zDr&V#Dxk2X_F2tJJ#trh^q!;^IA|j6uf?X$+tf{BBELFP@Fw)#%2Bpr}G1% z-D&XYIMx)XmSA1kEBjTruzj)pWi}el5Ux^$fTi#(>6@#f|I#hpV#;izLkkEZA}KB1-8rn{oU^a`Ven*WpR*(8ln7OLHyOZSyXL|$o^nhBA)%C~ZI2AwLq zev1+4;knz!`okl1S)1Sd6jXmF(oZ~*n6yYmN9SiQ?hUepD~^CUxKapl#!Wq#1*;cR z?7&-QCT-|F%?>e=;^OQduU^j39Mwui#8XvLP?1AkHM8`17-jf(LIh42@Ur+a zu>x>OB1k}(p+3a)c%e6jC+B#(7OR;Qb>>D9ZLPUodo~ZfB+gg@Jdb&RtOQ}>u-?Ll z!@NSBN=VJX)C-As`;mOQfL#rE05nE-T@Fq0DM{~MWRW^;Wv6$YksJo7r zD0qX6uBU_UYqSYpqNCUln$$ASDK*9&<)-)nGJZ%+m4{ZCg{cc_?LxX+s3^ zkB~s1NeoQQ&r)Q=C-M;ieK`+ZGy>~0O*X{6`1H48=Kgih3HJ+%9PSa7r#!3w4oYak zk6BA)0C_MsLM-QH8E`|8p0$z}xUn7~ZuSyTL4YdaN%*}O0f>N;*8G0MZa9&Xg04(0 z|4OdlX&EsstexxWQ?h*_Kut8MS-RF}S1J`?5~x0CAH?eK(isH&oeBHa7fyX{LSK8Y zlh2NQF&iUJUyZzaM9q}&3m!+NmafK-QFZLiJ(SyryT9~`%5HpR33jbN!oFlQuSnEQ zyvDf&c^r`JpMcO!EA2;UV9|M8u=B+~V$U4|6H&kCyQw!*RB0|-&*-7+TpE>ZMNBPO z_Cds6WlXxxubOoPhj85;^3N-Z4R9VwV+ErimQ!Oz3D!&f@D+13%o?}T2oDY+`b&;B{dbtdJOiI;nDV#u$5-^KpW5o%r&kidLhODm-K zmZq!sNul)qt_yx}eGoV7{ zdu5`#68%p`vVA}cI{fR|s`$cK`D4F1+)ch!aBf9wNNX9r8MAx9xesSFXs)i`PVYQM00pNk;rCZ@)(rz*NEFhdpaOcd3vptET}xOhenGkbuB^Ok@++b6HoxMsGS>S=RsjK>eW)|UITz0a0`l->O?53vp?^KJ z#gNZNKA}?x&>!UIOGz*CQ6>ncQ zoy#mpq$X7tZp3W_DW3#{n;pvVbRDns+}1f%DO%lP^m;G!%IOkl3tg=k@M5eL!&bS=z&{D2WUxVRJ+D5v02QSuNV{Prpm zzuP18T3e{d;KAh2w2RaVwzutgZ^m|}>n4Mx1ej0RL<)U+TbHoW@;i7n9hU?Bf|1g-!jc7ADqCfhvLtrFsj zFqFAGNTQH@o*tUI{?bZ^{-fIPamkP~LY6%nAL9w%y28B+*xtiiRML}`2%_*_pZ>Ls z=74szr0zO1ug8q;@?*7BuR3K6oB`5-ho7R_^L~HqVz{7yP$CWCu}G@ohqCdPZ|WnlzHnInXQg*}z7Fr9x6_msh7!$X!f+ zP^Mb9lhajX>Q-bq>uL3Sin?Qx-&u`Ec-A~0tBJ4jgR-8yW1GIW$Mu9&VBq&8K*W`| z$;XY`S4v&YU}$0D;%_E9iekurt`sWxGTYZAo?f@eB6Pc;u`hjN_ps`(sZF8C(?m(N z>OHKD^me;SkRucQ4$mXctH7tw!bHXIYrI#%wp46=D9^Hg52w$KZ_7vG_FA{(0 zAiCnEVpN>T<~3V&&Wkr{WWAgWyBN*FGsl`V#N<+O_F&Eo_17amaH)+|yffFnjHv0FM^{d@@pJFADkf|-(k zCd_|z)l?H3Z{x{w_v`NZe&R}e4-4%WN8Qsp-1sD1R?ZYPv^JGiLC0K5w@zlNe)9x& zb!UErDu2}FHKg8j9Tny0*oV=Vws_)tUQ}Z+%d9i7Yr{L=8X7JMblhT?|7s=!G3WhS zO0n=k$0rQL))78Ui4qq(_XQq>Izc91@V`W!uZ z{J6M-TYAu5M5wHyB6fXaJu+K+ql%Hy6^StePK>7QvhF}-x<%PL)-q@sxx);th zyfKqgRgION@ZvQtB`XN&o|G2W2KUC|RbEF5q4aSMyC*wKvCoJe*L}t|!%EXC~MWzhzmRrUQ35bXOyS>55u*gkGfnv!nb>*dX zw+>p|RLlK>F^?kvj||=y{{mNlOaIk`ZOZ|JE3yVx27`T; zhiWgc)beJPFG=69WZeTjOWrJlCo7PbXXN6uap&Wey#{J1YO*BoaHA3J} z5&EK`hQ5HQ4;QFuK_N~2TBvD{KXcU`V@PMKT-Z=!;WTN_*?eG+frHd%8SQp#PlcCi zn3xTxl4uj!;ew!zQargQF^IrC8SC9Sz@|5RXQ7u{3hs zKj2~MHc)SRZ>^MMMAAt6?H-N6R96M#I=HOzHX?QVJmaDZlDP`Crx{Ey90}46#a^50 zO0PsZRX#eZV**FeV7mJUeA^DXs~XVOZ}p$h?yGa;{1ch=evUHygrqeNm(nttgKo~= zsJoouj!*6<4SD_a=22(0u(0+vl~{w1s|%1Av*)e!F~t_FA7UDD6h7*efs>z4@&_Gg zYjd8i%|!6+EBw{CVHS(!M?w3K?BzM#IB>3BN;PCD^kB_|--`7PsZDA0hiG2PvCZ3+2$oFaTI zsr%;|)pM8jKy?&Wb~zQ@WGKaWMI2}w4s8#UO*xzx{_Wy}6k3)r@)tmach$zr3h8QY zB~tKi$<~Zp#GVH{Slo*h6{Xje*h<;M+OKX(Xm!gcgA2gNbo`q+w-w`B5>F@MXeW-K z*O_mTv-=9xWx1IG!8ege^#iC{sU})jPDp#ZxRUZBKa&N}&J9OnB+`X{x5f}a#fhKd zI4`#*J$ms@(KZ{}sLt&uQElE6j=Mh1$g!KqK}Z0g=`9#tbRMS>a2QVf<m#R zitMP8I(O6i>rFYUac7AjtCWTRBx_t{s%1!XObU?hX&t65K_04S&ML$E`G0!V_lm$z zZoF!_O)Sl7aE4TKe}y_!5H^OnbL^MCU8pO%;3E6*Agi9P^rJ}SiAA0m=7Ig)BHwPM zH7JJe_uSmc&?ASSVILNN;4gq>J(z&3XqYWnRd^}HC>z?sy%q%g=gvW1g!3sbwJ+u< zXfqhOmK0VgS`%~NZg@1=O2#V82;Euf2{!qv@#Dp~yj1}kGILxDR<``kR2mtRx60<3 zGSf&5I)-5IMA*`t%DoqVZ`6x-u6P90gl+8!8wxF&-&jb= z1XBlP@qe8-yUcT8oLgOzcQ8GF?itWx9KNCcvt9+S%?^CUE1NM*RZa`?Jqg&Zbf*04 zjT)6E%M3TEmYUBtuE0M*3Z1S^m24mhDKMK@1D?{ueoTlaTsxd~@C5xaWAJ!}czvMW zEZH4*WdKKC=~`|dZ;jq7IjB@GNq=iKnmdlTEQ%vIBPIYhCOkR4&0z7ET3Jy$Zu$(N zmQh$jPL5kWOkB{yM8Idax{(^fd)*bN?=(W1_ddT>xjKSuANLas=*%oXX=Aud#T_1(Ub^=qY9#=AaXK{ z@Y>_$sfI=%SrLx>GT6@~&RrAa8K7&1Gaxs;?%q&+8L6)|I5%)KacmS<`1Q>%DW7Ps z%WBsKDb({H40>c2)9(&4i{g%+7XN7UPsV>u3~ff4jB>ANClP!BAGa}IN>X1OFBb*6 z($II58Pe0jmM|>IR@yW*HI<%+6`%!5MIqlY7Md|-nF(QnT-0%!3eJki@E9|tjr&I* z4Rf(xlQJUr8p$-V|L*2HCw$+EK!|uAuRUth6~Cxfife9ugC-uKGJTJY{=w z_#F{LS=|lEdpSDX_jtM18vnUreUx5>;VYoKC28!|jV0Me8V>_s(@U%dV^By;@uahd ztT0?J-t81+hl>TDYz*RibL13h1E&pY+F&}oYl&WkCjp0?!W>~gF5=%REVJ*GZ2w~G zu|~Tm*uzTv-J%bvBDM%Z1eC_Ig>%j|jT-d@Hq9&0-UUPkt!or&$1QP%SrF3fE5y z$)#!`eoas1R5SLYnpxRx{aoHjY3X#;L<~P%iSEo9pe(60r*&*jx8u{Af^u?$+PNW_ zl|!}+(s`9qzer4guxkC?FKh?9mQ!^+nN-~nTr1~Qba-y z7AS$hHdPzpcVWE@!7G(Qk-w)%cGk?#jcv7dpvprgJ7x;FpFZUP0o)(Wy{ZH1#D5K5 z$z3t%68s7F|D5Hm)#rnNSsJT4>i{jZ60Z`BR17(9crj*e3a?#NQ5OgKEs?o~VxNI6 zE7!v8^)k_dCtX{3joTEd^~{p zrJSo|kkz*2+-Mrqm}kwlcN9f!59M~6c?E@bRP2A^E;(&U+H~l5KR@S9d{1};z5kV* zo9)_7kfGRi?iMW(5IS}DJY#MQ(9pE4${sBE0nV55cDe6?%iF{eNBR`(Oec{;(+Gjo zlr_FN($LAKO10)GOZ@ITImf~?;Gk}qngzOu5L)V#ba@0G_+Shj|~BR3D}odW(< zeMTN9E;w^y^|LruKezhQc<)lepHa3B^7gZ{dS>=XgD1Inf~2K-&GL%iuTX0|M0Ee za^Dg5yf*fK#x|b$$w;*o-k^Asm#qmz>No-8|4dM$=A!AS`{S*wI?hWCY&VTZOlSHX zorZf=pQlX8zho=T2i+t|DlXq?Vq9EY^o)yxD&Ti!RwqUECrvUV5hl_q)AhU-M_0C) zo3(9*3nhU!ub+#E@O@slQJ>D58C$i_sjk+z{ysi5#C-i-zi1O%;p(8jLFaP&gTwN= z8&h^-e8z9?Mr)=SGt;$o<%e{@WB8km5CD~SyZ%abhu?u*G=Sna0G28)KmzF! z?7e7IC>-r}NG-L6JAxWb0?uB^%kRf`0%jLR&AW*bifQDvNWl6h_U*cH$-EZ#t;vEE zlZB$79KfnML36bR*8+AR|GQ1-r@ZK;qcx@8R{XyIQR&tLn%1_}f>VZrGIUp52_n2; zO=UDog}*^2O~Cw#z{yN*M3JHlBAXLQa>*~1M)>#|&E%6e?JtC_}bQVB*dIAN+-u{C<-%TNe0wYR&(B;f@H)jf^*sr=*pxT%`6SeZN|ZO$-^{}#hZ z5rA7cB=ht`<6v$H0A(i8!|y>|Hm?JKDn+6Bf`sbvRzblOojB4AVwEN;&L4@ALy9@y zCSI5A=pbild)f#?CFp~})IW~H{6|$JTjp4S-eXDT#awGeXsLsp*D~j`=P3LHiiceR*9F8{^kV^1$kiX6g!jYdk5vjoR1KB<3`mE zr@Jlb^qTOHz`(fmGEk7WlAKf)fIXOiTnd65tQO?K`JntYH6C>!kM!n;Q#o4dKr+1? z&^1>pfU5i-7J!|T(BbJNTcy1!^FjNC=k7Fb6r3#Ae*JFl)4KQpF+iFNKWTogGHPZP zwxKY*P18al`SMLW1fYeAcw)iOKsZc*TO9QIHS$nIPO@CN-L-sJ4svleevgusIXvCt zn%q-{RCZy5u=gHq?7XQ48wrbj_Ew}z?(Z!3b^V%?`}LZG7g8SVkTuLmbP`7BRP1*n z_SARPQq|K(eFG|yYQbHS2Xy4~bw343#R5Eys?CJtH_HT3fR)5tdV1TP`Pv2TST(Q< z1n^rMtC)1Nxx5m(cM)fAfte?rgv(ZY?ume@<*nz_4bVT@CRqAYX_VwYTRscJ1JhB$ zrOuNs2N23PIwodk-Zr~{O*wg@q$o=%#Wc`2d*>D4SE=ePz`M%%hj_|iqCC+5hjTKW zDZ`ty&q+rtf6ZM)kg{)D5dCDmqaiOTY{@#cQ6UvDZ~e17D~q8g70&dQn&04a&|f@M z_sc$(6ujC=m>Gz!e3#)fup4Mw^j~IMrD%4;XX3Jz3z&)%E4f}F#%0GVSOtuvRmvH= zv;5*rCazI^{Av|F6OY6kGd8A0>&iB)YTiDjssaLYL}6VW#mR)S&k0(2vKoK+UkCJ* zxN!DA+)HPCB^>!s&C>_r|CD>laKIzr%#||KryC>-Ct2&215mun*{MW;^W(iXCLG-_ zEa$<_^pc^AjH%+*nhxi0)KqU_s{|UI*82Cc5M(oO!W9gY11z%DkrDiEOb5aWc10}^Z5*!-MXZ1A!9Gw-VV_Y6pcH~Ei_v_pN)6> z$yprxPQt}b=XDk_CP>C6!sQ8f*Bi;!4)hG|n|om=>FB~}TRjK%rf%tPLuPpvHmW7a zW2z#)(TK@d_roF=Ng1LYr_CwU{xD3*^!f)Q0f|}US29uNUR)SO+W2SM6ctMhpLYGv zgn(r(13;je@ephSp7~XX7LFwK|w=DDo|Ra--2^D zz&=TZ4%**7Ykx7<`XcVTZuOkWTfjnT5Y+Z?+ISfo7lujG9(mi! zSO6|ZNgVB=m_|-F0kqC=x6>#5Mw&|2%M7{fnJ{7B)ZU;yXZ`BhP+HaX!Pj ztn>CQ=Qe!pLFJP_p#hR4l>SYT8ia&3>uw=Zf8Px(P; zG#3cDxIN8{=WK~N^wYOw!}&Xv_qHjE<(<3;n-{s6WOr1)jXsUu{Fs!=umHx1oEdT} z-EM2FYKZ0I8hh$8mbC2kA77p}@9sCs8PTpA)Gx0mHt!=a-g z9pz}1(We75ir}g3UM3@ttGoL=H}Y=EFb`Aa5iDu&fIIcsA5Kr2xfCw7m9TCYT{V^3 zz+C%P0tj#>zWA|?JQKJ~|8JMg)ud0}^H1)V1&_S&L$z1T%$YBfoGjCz9UgV&Eo7Dh z!C>5JX`@!r3QS`diBa#_&u4DXuOC*r3BtE?qiyx89lvvrzQN?jc=vod@Hl3I>a(ty z__mFPbgyv7p(J1ec{I~`Rre_37VUEmE+@r@?mz*X&Bvwwf4E8m-5d0NU8=cfn>jJOesiw;i)WWgb~Q zqwculCqNQzGe#(dfPItiLYk4sW1EJ%x(hG_Xr3StQ?>K(D53~T*ZyP*IbY&z_9as% z(Jjqx6P7bWU`>O76(VRa#?A;}VBb^58(t||-*Oaebl;!Gdc&pQ9 zqK7J(fR&GUR%=;dk0X=@cl9Zoi%?sfd2gGEgU=T#l&krN-OoT_RYF*gxd|% z9C=p7B^QtX=b)VrHC?Q+-$O9XwRpNN{q~x`1FY_W{`L$ZCB@FdVL0t1{MB(jzO(Ij ztdu*O8J(#DD`&hbKCTLz<#liDul4 z{G1#{-kon*F3YuOD(nNnfHSit8{_SS#PPLr>jz=b9YZmmaPfG`j7rm*cIl6vdEP7Q z<7h;4LxnNuz5<59cSz)AG{9XS@?nAGq6ekc2uyT0>i803iFVh&j7MQ5vKuAo`+-N%}?RvSEMQg*C!A#V>4;6CJA~k z)p^L&vQ3Fi_O6H^1HgLwywRYJKY`NM}-h z3gnIqyuQ6Yf&99%$%7PMlB+GHd!cp+F&a8_eOJP7PJ0_}r!+~@>h^H+G()`+2s-}F#o7iX|tW37Hv^uh*Rao9wB{8MKG z@CM?9h3Ah6B!Hx%_QB?TL^vbkIuVU}_sd@N^RK@u-wnf?jru=huSBr}ryyy8s6J>O`aX#leZPKt+sljfF(7M(PC#yPgx;DAeyAv}pRSf*GVZ?k9i zCeGIi_VZ?I-l>^heFa2kHQ$V0AK$%?4_MN-!?46cvtIR*3=!AxpJBoO;V?y&#QhD& z;YVw<;0uKVu?Y1+;i*RD?D}W;?3=tz%8*-m3=C$e&3C77v7^NQKHPm9&h`MazJ2sF z#P2n3J{k-&r+1;C!(BF;3P%3fd7b7?^I6@8)wj{UfoK`v!pCQUW!O*BXhLApXpKpV zD^0)IgPp+IqXbI!R1C8A_%_6fSz5zJ((Gied?Q22_*hHo-8UmU^$xELd!t@=*1D<%y8nQbJc%# za^a4WcgVT_yuRfgY~!<)IG#Sujb!xu>AWM99{WTsL-A~D+&i5JXzZ_QzIqx0`B!d% zI+afs{zJ~si`GGx`bV=2(5|Ee2y4dF-bvSmz#iOf8((LxQ68I6>~rij+F$@41?LzH zOq~sK!m$iG?8Mn!^f`q_Jk!%z0#NXv4PLe8Iq0P0V`*AyFnOw8i`1m%l$A(GnJ|lC z5|^6r8Ll~XLq-0{p0kyu)GTQL?PSt5o)ahs1L?Z>wt!MpCz#NT9M=BrHAMwRD;XS%$8O)=uf`N+LUN`h~0;d7QTCQ96sRbozJ=B6$F_2xYxmc>v37nsO9jgg5FM1 zy(61%89Vi8L`8a+L#jU~O5T4GS$?rdo1Cs<>{+4Wi+CLNj7G=o8aN#Hy>a=4Hb-Sc z<32u50i+4stEc$Y8dG>z(aXu9l+`wm9~R6tUekP#as*E1ZQVTDdPg~ZoLKxl{oCaB;ds8r1#}+a2aVcZY`fq+G4Qn=b)E$1^f3KWf69XVMtuEv`6e9s?C#E z8Rf)RBxOg$3ZGA^1g6{45ChW#ML0ImG*|cNayj!8&QX7`sdsq0{-V(3q~K4gA2Hot z><|@)*Hgzm0aJwV%HY}E0b->lYulTl=3TorR1pwz13#ez5mqD24#H04y0WZ9D=SmG zQU|qQc8zC+`o@ahstqU45t^kOv2RtiwT1iL+E z${MQku>C~0^ueJ{FBa3!9<4C_c3cS0jp`jr+T#1oVA3@{JlM6!&<2SHBHd#=69I7O z|6ueNTupXkC;gtuGk$fzLX&}^+DS1Q1k%p;sWZJk?M(dZ=U(BHhNy^pF8p|=NPXN&iyMU==!?kxk^K)=jiM%+VFZq&Kpb(VWzW40kg08xN(eBDfkL5{wl2*L zm<1~Ex|}b`9Q3cgIyuRuJd!1kd2wlvo)`4kIv--31qi*k)7v{%0c3;DSgv3~w~wHw z{?xZ_7VoHk`hCXjOpKTsnka;xM&u2VoLGKYzgkkvJk!WC+(UWV*$eyX!X2xv7+r9i`Hm_tv4(JU5h zlOwnJ#3v3mu;6qIbppCco(+e>eg^9$`fw%>H$U=K;_c{thysi%5To3xxWi z3+311Q&-D*G+gm*r~nx?vu8UFc9i=}2EEVsVm@}lM$x+&!>-Jyh#$sRd|`u{8An0W z&dRk`^R?DShe8apMQZ8BX3)RJ zH@i)@p{q`Wvc=*~q4Q70w}mhGPRAHwpa`be#7tV8-{$vP+66D|peMlb^t+kl0@hLb zM?|~Wg(OzFfM#x?(^o|Xxh9%C);6DPs>_>e7lq?v#=SkEN~xo>rd@=#7u_b(p_l6N zy&D|FDfUO0D=We4|Jqj9?H*ot?Ap9k=vq{g#mdwd* ze6!5z;GZDih3(7FfyRCAAMC2lFFza}a+%9i?xNs3vk$ zdahWP#6INYjf!u)M1BgYuJt?R9^V!0;`_*=mf8FlpQTvmC!X0KkCo>3SLbAr4Iny2 z1mB+18$u?oskVZ+Ms2bp7DJfbO;G%F(@3ZzM0@YhSGD9<#V?~fpF*6cfo7z?df|gX z2L>UTg$rYtD59U%L+TJp4vP}QxTHaeH?h$1NDP{d$^`LCy0qEiTXp)x(EN7xdX)bc z!|TjD3i`%=3*p3pqpadbO0>9i)H0<}oL!Pe3NMadI~_NQo$^LihTO=tNQ!#<}Iflqpx&_J9gyxhzwG}zf_B#27cAk-pC!`); z(=Tn)xLYkWG(d%*jcVdCa_NrqtSJl2vRHA=3m&t6ZZLu)I>M z%yB!WwjkmVo23(b3DnrPxw~b{!1&USH$)RS_Iw&<%JLt0$A$NM>ngM+-{;i7Ime=# zoD{3O54(HhHKubcUY2T*vg0Oj2aG-W6l^uTG`>=rnP;;I>`Eo?&Vue9U@JP`vrLOA zsNk#Ej6ElxY-RS!&d{&ME4Q(QR<9Eg841aiSn(e>18>z!JcuXo)vSZvRG#T|qMMYHG`F$Vd( zm>-W1aL4q&;9$@!$@`;oz*Q3P$2bBJ(TtcCzbyorN~j&?M)T=5Y*``Oh@)@DZ11|H zW+iUEw9A?5rnC8x;K8QM+f*jHq>>TmTtXz{*NZlz z7_{8zbJLk+zjH#-%tu@9_cepfFl>9wIRDjsui01!TGxy22(*+8s(K10xY?)fDvV0d z|97MG-;E@(Z!x|M-E2ve4!7A-C7B-a?*K&^7hee0LZrYfz{3WCf%6@B;&D~FzWU`l zc{G4Qt2#nhmuWsTCAV=M2FT@rBQ8H{DYFn#$W3tR-8Q4c&^Sc5>`$t3^#^Jm^i}fk zjB%Zbz}_vQIij&g?;X-ZFC!tf6(%7>kxY7ao_Ie2EZL+^v$r-?_P|N4d7%3OBrATA zeyv}B<>Z^g!;$P=;R@9#M>Bn_wk+%EJ@*Z$v`K-ZqNmbJ9S6B{qje&y|{v zn*%eRd|{{^-A3OIj)H!qyxi^*85a750{tDDAS!*ZAafxAQRZ+mLtJ#Qw-+JI@~MeI zeEc&*`(WBq5-kH)h0L->S?o4k6M8IF2qU2aLBCuc=2W zQ*fPlNHtZAT}yFAUa$QekRT~!V!+D{x-MYhKQe|vW$;$+Zm**AwN4+M1u_I4H?GJ* zR~ql;W|j&U(T-!UIzHJ^ge}QnQP{oFl8L#2twy{j0%+L%OgbRhs6_Z&ro1!*`U)Y{kfutJ%$e(Z9>sO!SFO@P)pY@>n(&7&gY(%r(+#>q8n@$R`qO26og0O#02L7O+Ax(AydsPwXad! z!35VRP9BeYPww=SSd7Af8%Kzu^e=YM5+)rO0XFwv^~Agqrf`DqDZG_>1`l@hf2d9) z2G%Fdm_Un*#bW+TP3LbbKZ@%OPHC%>9;)7b_iAdwovHiu=>;I8BOQ36g zI@qs^PWUX|L(HqY)RwJ;BzXifKBs)RsnS0hDN2RNt=g)Hlv1;>KHfZZIr>rWyy0W- z?-_y~9CCY|$3frHE7e>9W__esiYu>l{78Nx#C^%j%jIgd0FMi^<9}*TPd82^P`_?| zk6g`m@%I-UTsJE-dbZrMCLrkc`gw+gdeRGLee0|CvX|EV9mY-XborrO$qV90%00*Al(DO&tSVY(q|e4PeKDdjy7d+$O}XQtOYk6CZj zhPU?f_4_HVGB!-l3X$4)Mi@o-yerf?YknHp=aa?)qN~QMQO+1%nTacj8KT-ohq@u2 z-=Z)|DA0nb+@zTFZw4rzJA4tr8$|{e;}T{|yc63&!Xk2iSUVtPVjV``x0UfuJ=ylp zJNWDC_wjKSK8FD({vb7r^0Vgq7Y)A#WlFgWAEp)aqfSyF6%01t7t-&(OI_f_x(dc$ z&bV+N1UH04y;zB&79p^`hbzZv%&gnCT4FP?v-_AD2lapwp7e?a-7N)dKOSDX77FULt|0V zAJfk2>^63CobubL_2&rI6yqx^by{#p(Q#)Lx%AUt0tDtSB@BSUY1^LD*ayO}yuu?q zCJxR85Q~`$+N=PKN?^dB?#Q)`>F@7i{;QcTv>Hek>*OAvAAuzSU} zz=w6kz4>efva*hKHt;SWs#NL-*E)Aguf3a0v~PHP?AU^_`w&kqgfOXW7JIv;dD3SgV#!(!sm%{_h@Cgat(M&^!f zM#&k*y1ebS8=jycbXa&!l{QwdKDyj_k z8ot7v-OYwm<1MX|L9<3u(vV1sSD7_J(G;0I#EY{`x{P*(NK<%hRVP0=@rw}mFa4|T zUBix6wSj2*wTf>mITpKipI)B?gUJV=2h{IP1ke4G*y(%SUR%CG4p8oS^u_E#{bg_N zLI@3HJ}EM}X{F2(kbkh<^I0Q6)3Hc#3bIq!-Y8i2;qTAlcDoid@y(@uVb82`GFz2$ zJWH7?Lt)knABX(VL;XegsW9+26*lwE4X{@VZ%##)2z}RAMHiChPJH^tG(_q=Ul2--M>N&fL9pnB8#dBBySXqY`t z%bmMb^dPHtkq^O@zj8ei_eO%a;~;5L&Mx+64fgoiFQY^2 zi}dY=efTK5wkUUJ8JWL++z#79(7xZauA|BlZjL(^=ef-#{8OUv2R{=JQ>>Aq+`zeY zJ#glf`&0Ae__}BNhOhNFR(?xrz`;DSUFmYFPD1_tlL5qaCb`GcN3P0&SB`~SRKH@; zoeYj=#02}o5D*Slsm8b-0-5a(qhf9FtETyEd?$meC6zZb=Q(PzVKe<8to=~$--&zU z${orbUnJXHQjgrWW67h4emTku8s2PCQPB6K_69$8h+BFSdpE3UU*dVcpzx6X80+hs zlu&;f(u~nN+B6vN7Y!tmvkZ0m%@7s2B>)fhu6!>){xHd4AN*Cg=i^nH;+7Q*yIqPD z>sjR_K$&J@%yQkfj2z__k2&e&CNFkEj{bG%EAe;uIdNE*&5GEzq;K~^Iw9gsY&Qp8 z6z9o-SA=cdr{)mv%XVegxvrFbuh9Tm2WARad5lHP_RaW*l74-OY4_Ir!V(e^(s+ks zI@aj*qDcF>_NtB?P8Ms`&P|m}TJ$bfG$A-=id&!%us$3i1(X$7yeoAN=j^!ONAY{2MOQ1hyzcR@byrSidXfbZScvdKWH)y!m0WAaU*;0!`^ z<8%SaLQ((DG^n}ciBosoME)2o2%k_#fM(QGTv+L^x5nQ}a*rlbwh?)Sr}4#_gr^8K zMYKoy%y5#FS|{NYU|GNs`jf``NmQ+xnYYm60~01)hJca9OV$5<$@1@#)W*Y$1d2v? z`o6l;EKJ0t(8RlvsF$H}Jn+j?DTS|d??3GU5ngFQLzJnGNiKH`4~z?S9jb;fKkaMk zYA~RT$pgFLWH968?dwOVl+3k>ZBh^;Cjlb7bO(JK^(}YUHrc7 zun&@zJ7ovIyS-Zm)4lfAEAG#3d0sK(^rgE#8l%4E{%U-?7)w5Oab_H1%x;?ff` zud0Pbzu`ah5HR=#YQG`Y@+lv zZx&Bu|AmMs##hm=jR6*d)p|v$;tgllKlc7*4G%ub3WKosB884bi)vyDHJMNOBw=iB zr^#PPzOpIs-mV2=Ut=iTW1^9%_$7O)cN9Ge*r+PebEBW=Stwdb`g?gOC#X3_7zr2+dyy6e4_bzc7uR?_qf_68dsM-2G)tj-Fm!V-0_*cF6Bd>)N#5}Uy ze?Kpk(_*uJ>PE4#LKnDw*{bxy?TNP73eqhguI=7H@Ig6f$OlQYejYg`s*UDx_A6dW znSrtOz(AMRI`PYX?Yd0}Nn^eQdUfQLeNgnA=9fm=2gHf}`0M>LQ7bXQwMh02$EtPV zAg}8!>r#W<;XY0D5W>^@urF?_FOs%(qYT3l47%uuiq4}39SYVdsZptoa)G;h`kN=Y zlxFoj)?>Y{h4#Bx+SRmyheZ#G$u4LI`U0~hoO788(N62JK3_jjjBEs6U{vjS7#5mR zb8lh?dS5g#QDREdiAj84E`0yle=DJaD)4L#<&t!jVa$VYAVQcZ{D9qoS3N7ytI^`W zu8LqY?dq3aW|ejmfksbdr@yZH-AKgP#u4_QtEAs9p53QyH*f=BE^kTNy9?Ut{o}hM zTvrKD0l24!chpt>UF}q)S(k7w;U(6K)1}_~nuE-=hY&hk)8BXgQ}6W|{T(ZdlEG0VsFjyIDFjopJy{O`;3Ejd>B*DbJ;&(${qA*OGA_31CR z6R=JO7rz%FpByaFcn7{lDN3?9Lvn&&NnN)sR9CY*$a!yp8Cp9x0iAiK7p&L85{*HR zc^&rf)&9carxzg^_?nVX;{0W|t->tjuH;1xKG!q$fTPE{eC^&Zap(~o&@}fS;m0X6 zDS=o!#MOQjzn|tl6N2-Y_h1TMMLABUjc-z`l%-C@7IIlN)QF7CjT@IRsA#*0RL$Mj zMIK@?-vS5Dl#?mvWZ1}5=fL%uicxG@Yw;HTrv68mVKu{l@%7eGQATaQ zFb#r~fGC|(N{Dm|NP{%e2nY-<-61V4-7z#0Qqm>eUD7esFm%i?aBiPxops*tTkD)Z zH)}w-+4sJ#UtK$|M)?|bu||hsd-1|%T1{1V4qN%Ae(_NSOpRJ!4-b0$V*lD;+^c#U z3*XYm{|ei#3t_&w@Q{&0fwh2Min76uTVpmRy(RPTpkw_6>Z7D?6FJT%0X)WiY4sp{ zZ69SF?eo{4;3EN^Tjqj5|Bl5NN=M#?_74htfrbL?M%ti@FyFU+hu@k$Ue)@07H+ndix=31$ zO**hqDdR!b9pIK8*mF(gB+M?&HJ^p*PJdJMoB&NnQ4=}DTPW5Y@NfH5zZsdm8V3mI zvDsdhoNwBQOBv~541N?&qgNBAKB2yZqfBL;7>x&xpAd@%p!~KOUNT+@V0V~yXvAceKL>x%|VmW%h*YnvYOFlme`o?IcVl4f3;nz!0mT2Z71=?oZK$Uq2r z0WIAxs$*anM-=S~-WLEm2)@}iy$m-+pS61V6lk1@>|R8B39v3OEpoH|x+pLJ=&Y_X z?lb^ezfivNvJb}3LjP6-Fj?IiISZj^3RslCgxtUBt-J3ObmSJ|j*mAkFm*PDw8|Z% zNL=?cha~Hy>m(9pQk9hp4>_cP`R(Yy@)=# z;uI7(9dL1c>2naym$k8)Yv1~HA(Fz5HPvaaxhqUzE6ol8>q**vk~bd0p*qVE(Y_G) z?T+mjdSuWE4m<4W<|aono631i>vsD+!4WVWD? z4`Icoshav%wOmMmLPNG-#<@A=U9+_kJe?aTZxZWO@OGEx9y(tkoDLDQ>$%m!4^6E0 z;&(!-nk<)R-@_>Wq(_of?!C+@E3@;yT~%ATzj{VZ-&NiI5QrZUALMkm=0gORK?M$& zcaIzqDI2-Z`)LRZP*9b7d|t8Fpu#f8-lh_TiD`DX&2O)|eAOqD@VS-=6;hSsJo}*c zORAfB;a*zAVTx|=99*HMWV_PJ10f0c$c!f8s@M{3I4+;{T^lyz(DJcl>|k|%ERoYn zec9nJ#}EI+HEcud>!7-?y*=7+kN)_V28Cl_zIaPU^2Nzl#8OdA`WQNo2r){cr9K%q zp478%D)(JjvrU>TX%{Yj9X`i=?NyNxl=2I6oWz<@VM)cZ1~n^nh&^PxB~@$YJbgo~ zj8=`QSiM>73e*zraH8;+(0!>!Zw-}v^>4+aJN5PpIoGFct^6JyN4?iDE(y7sRjlbJ z`50c$fwxt8rGKsHyfsg6nrXx6Gu-O%7Aej@a6AD3PZx17rDX)i3Nrhs37^4O`R;1z z*%H(XUtrPOs}rKQ8+`MR$zY^&iOXj9$SwVl(;U}j01yvosI|<=smL|isA~1EZKd^) zPPw*)@1Iv-NWtAyE1SV7${?mupsbNR$QlKd({5XBwd?tZpop(WFJ-%0u}| zo9q`a{^#<--xNT!Yci21ritBQSVQV~CFUk0S!#c?+z)5h14Qjb%JE^e%XJO5;p6=v zf5Dyk31d!r_`0%zl+|)zAzlFM>mSJ;ON#t8`G63o-OC(qX-J>)bJcDDhu=bz<(`LY zF&=nbkohmYQdo@L$|)LM3SoT-+igZ&!9|_kqHj7I%c8ZfRn|Ke?6BbV)ZZ&;eWm3j zhFm`*y|4#|r<;Esr#BczBu_TFP@&~n#A>>)bV>j)GKDbo5@>Aw3j{E!oejX!- z&lA?Uq%V@ievqk^QjC|C@TB`#|74NeJ*>SQ)4tN5TWsrFc8m1^7cKd|{kO_;nxATK zgq(hX^qaMklag3vD$2?g7JW;a4XkFSU=b>%(i2%w;a%W$-gS zNqc*JL95|ba#GYS&b4TIvwl>Xggf< zujCI$TO22BaxXGoDw@wFnIi#?&bId2i zjPf0;S$84}P3Fym&6{>ZeucqA~cLMJB+hxW70j54((&M;! zHN6xqY%39!U}KBPvYUUA~b+yK47EW-hb|IP;axa2R_SXcC$bzyb7^Fr~^`7J=b zYq?PW+9PH%yTNN8i@kJ#%M9JdSKA$5>j9zSq@zk@yfVGy- z@9U46kXBU=0fjQx+(mme&?^KHj3r4isPA-z_*6@)4|ek^XPqyR(>@QFr1{-5_O(Ha z^3NB)jQLb^N+~EPw49xNCFQi>wb>T`?IYiQ`3l(d>~QFwRs1OiO4TCRvKJMoV?>fn zI*}@?IW8dVBX=!6j%;S5L(VkgkLYy{mY(|MJD%xI1jM!7m833MV8!k|n3^fL_fyb1l}K zix3uvwy&K8afN`YCpmnmU5p2Ef5xrTW*n(5OUED%)!iG6*B2fx>>GBh;y^5Kvce;j zgPl&CN*VgPzB)TpnJm5@L4Lo)^%M~mq2NeF;vi#j7%~Oyra7_`rqP%eqF}?>Ec;~D zuc$+Ex77sdcXL&KW-ph8QJt*?bB{A4R5dE%a%VZnLdy`>X1Wm9ljUt5$eo{iTdUSs z$Hs?FY2ndk<10=MGwec7P0KWKYq{g5xCM35(GKO@7qlAZ(Sl|I4d*EkA0eA7FEp%Z zLDj4S6A2y1FZz8TDlbLB1h3pq6}RzERY#1k;nAJGiaF9n=`9IiqLJm}Y#!e-D9_T0T|u$%j=+d@t4yAWcEwsfyU$7u$l=+<4)22W&0m#K6`Wgb_m zmZZ~PMltN3Q*#3I*vz;08}@GAX~|9P*6xRMLi#2N+K4U5 zaP!J5iHqqf%kgeS2f?t4>&)Vmd#$Xdn@jh#t-HyG4?k3)!5$hWO0|OXJSjq`8b>S%HP~M-p)Z z;Bi!#w5=MI?gckW@+Q+jyMvq>njH^pdSzyh>F^pXC8&D2{yY=zg&R-YNv>SB>{DLv zkp=E7<8MXt`+`o{hD80#b3X~yT4nS3;+wNouuS9O_sZVRmd0CJgtI*x4MUgXg28$S zEU-r#tMRCeojVE)ZuO-761lMqpci(_;V0H8wSOdM~~7 z7fMKG7R{RNqO-4t$Ytvk($zi@?q1tPhyk#UoM{Y$ehz6hXF1O*g$cb4>*X|Me{Ftq1)Wt1beT$q*FTDca z!KlBnP2}G%+>-7S?nFDw+pp=1dW1g-wVi~10xiaF&z08QkgynrhN09Hyue%mDkiVJ z;vS&=yicoFvK={kC6NUs|r?65zS>x}u^8j20B{ci`*)-1dKMG$N ztjLd`4-8`|bZ)*e=EghVXd5TUGNccEYVX*1j{5NMmsnbq^>vfb3F!5QQLPFsCaY z{-7YB!IQ5FC_+1U9o{7;99c-|Ch7{92+9*F05;5W9!Q{+(k(y1J%NExDNkd6(c`uv zYfbKJ-fQJH=fchPBV1;AV1(f6po@B0DqFuda|qQLr)<<@6nDDBj}3wk#Ke|Y>z2vC zw22bX?)IUBESdTX6}tkWMKc23LU>m9``-x@N|{zFng||h-5L|$5tZQ0{-81H);dMB z2bsU%(h+8KBg*-1Sy(amwwWfUj+I|N-(yfhu;YB5d?xEuFzjqDx3$wh%tO8fPf#r1 zybd$&w!vz7>^dFa?)2I!djA^^hSNvR=bo5Op(fpbWm@P)Jv842;hT5*CQ$$>0{zsh z3!UM=_WN#X>HMe7_f2ZR|4zaRHhf1tMRh#*G+ieG0>YUboTZ#tm}0*_^uBse{Wq8F zhulGus^wMh+%S-8eKK^{@N*^6V=u{eVxCQz0h98NaT+2`Es*!)skrpn2f`^4m-`?1 z;CeL`Gys4gvI_$o2$eQeA8|Bp{rDa^JW7>`V~Aeb{IdxRV;J^cJb|fog{9-J#gX`* zs#7x$7+~M9N}_hzxToKL=hfad*3_!IfBsJ&J(h0oxVgHC@Kfpfd1VEyNKwcrX*d@l zF>j)0G$g&53xVY_O>~ybOds?C*ugk$2CvggjhzCkJIuHk5}QUww+NKRJ+>5yF+eq? zH6Z=02B@Y5a~D>lK)*WhJamyH7(dkd>iz|9ci4T6bpAENkZG?wp<%ghBBa$mp|Ev3 zyU#nBs>h}}MFOKLx7AJ7a86uP`|4rFvd^~Uebv4)2R4Ft+;Tzb#hFBLI58KkE=Tit z>m@FNE_S76fn||Mr}p?Nvx8AxpfM1$^d-jeQhlS*xR-%8R7jSJ=Yfs-tr1zEw6m;D z|K0Dy@y7^8uOr&2+Ng%>UJ>>k*ZxCIN1FiK+8z2wX^_AnVZtOb0&D7e_}{PuSkzr$6?dtty9;(86JUamcb{wVC3RWUd0$1i z(7di(u#iO5gseo87Va;_b9;6z%Wqq~Lf^Yb@r?F()th|V`15C=dL9emwBx6s> zo*}!zr;?@DVH4Cd{@40ycntLTVyKN=$-!dfpEeXX}EhW5~-~*VaJKC?>GGv!&N?qys#- z{*|H*zDM4c>wJkH=AA6)M6DQrZ=b-dHzvqLI~H(pj4JpB}o z<~R5^0OosZWuraddq~7*m->X_L*T^}rxIrTCohQ7H7Am~W^QoiTE}UA+X%QUiRtDW zSUjGUlC_Ah_woX*qDq~A2w^PpHuu`YqQD~^>gofa)?7}cc$A-V{?Wh^WyFu5-jO%ddJ7bj=_&aTe z4W+cL7*n1zY!=(ar^L$lQS0~8Vi)3UG&(09`nlJeo%YK+UU6EqSZ^5@Ct29MG$ep_ zRk9|Zc4pwG<`e$kmet^^6jSNx*N5|-IRA4w{%>hs!F2tu%oM$cg%MJ1i)m(C%kWGU z^`4$8-&`)cF1P%?3*$+5yMjpK_KP1LnCzFCx7(Um>=hc$#iWF9zTf}Cpe#ssZ4QC2 z)>lbWQQ2NaZDKheZ^X?uIx_uXru|smRFXmY`)!*_$ro!!{WUD38IuuW59mW!r-cvcm)ST_lry{xvw$37IY| z0@*Tf{qs$EpqBOy%2BK!Oosa8Etz1|=04QdF|x~Ai5at3Ywr5o7Zak*HIRHsuF5KC z-NJMjoW1pVF=$2`(JZp3EIakE9*RaGKknunoiTmdOn9Ju|Jy3F>Nl4yX-Jd;BcCe@ z3d-gHx&M)|mw4s8N_V8tZrH`U7lG8C1dDVE31I&heru@ZpS{}!*J%fF_)UGUo@JZ% zz2Un2+OK^guCPwajAq#8q6Npb;>}Uc{rv!l@#=-+iw4Wk>Db1rI`6KRB0ilAo zNDYCPr$u@U^Cw|uV zt+&KKWFh+uzIuG&rX)xuYzS6=?#y@@<+V>PE9L#&{^Yiv{l`9o@c^GI+Ft?J zOst0E8?_@=WvS}OdXn99!!8%nGS822bbh+`!?%t!sTb!85sQHkF%60(%n5Jsb?KU8 z%i<>fPmWmgho&JHLrZx+t4`QQ2~7|y6))yR>-8Y@uK3HM^?)y(9{y>_-<^%kh-VGA z1BCFkxMCXd_PkKP*E2Wc=_7NM7yha4zqR}!_KD4wfi#kOb7=BEA69HzX!d9&-_P=x z;uBcF#X{VP+Hr`~7PlzXBgja+Pj0g$_;^ z{_j};@Bh+7Hi_#<#wLG!s5rT8I`ni&xW(jOnZr!{{c&3T9T+-N&w~ig^rlKr6rmz0 zIa?Pt=2NR0#O1ep9&g4GnJ3!6HNSUq9tEsM0)AyxmbQJ(Vn$!HSd zD=+tc-^KA`#dz;!)}3DS_q%n?D6Iw*;_XiVtQ<#SyJneXz0@LFJtQs>Iwmo>cTZMl zbyGV>bo;_gtgwO=x@4DX5sw9^9o%a@N0jWqIwIk8z6t*=LG+6@e==M-ne{az`&`Q# z^`}n4lHzv)KbZxGiTO~8pB(?XJq({cM3}%VCH`j4p_N4cAf;|T8JN52EPrm)Fvs)B zqsO8A?tvye)9($;;|JWV8&Eo)3w6)d4cK1OBe76O>oi`l@ljMCqk8C@WEH~5x9hoR zH4D{N=q8eDEC;MQ*3e5a1$jsoB+4{PS(||isie>;cf8hD*=iJ;WU3Q9 zTEiS0Q8){k?9p-*q`p@d8g~5w_Mob?yZ-<$z%2RuKJJ(x#7)}l7biTZdVRInS6itvNoAp1+g{=EOdOdjQmd%HwaY5(P-kKVF1`5#|r=Wa7C#^<`TB^eXj zv63o5)&-yN@8qiBmdPogQ66jI; z6aL&w%gLSDF0LLh|EW{uryYOng2ShSXzpvMnR(-=s`pp&XmNShG9fKGa!@Em9KD_gf+k}YKX9_(@$^nA{1K3+6 zLbX=7Ou)V25L)mD6W5w{4v0r5Sz*l9Vig&7Yg66{z~eQi7WR$T#w6cOj)1);iE%nS z!R~)e$v_ukTzDIk<=HtX^skVYu6(ZH&SgsJS=-rLr&8Ti(z6jM)D9EfUAITpel zBvi+9lNA@@Tl5yWOhojON8^F44B&qvSz)F{a34reCF|j@x$Yvpli4#kU+U!dN6pZ1 zJgCRlQ3YZK1}XQs`(B|@Vi43`oi7(wcIc}hk$2v^wfKO@RS+=343cULo_#UteM~M< zSwY{9sqtD$g0wPA0Q16etniUH6F&3z@%qq@N^-gQ&g3M!j%FMJ$0N|#_lOU^i#__n z5fKvj`9zoZ@i}roNb3(a!q>pj=Mk{2XabCe3N*uaES+MoHka!ImyG;$S=r9ZAg{2q zjpp!u;-z80h@&aZe1*jzxB~6Pgts|oVuq1bNur|xHm`)(R)1p|jowkrhnc24) z{aJ$0`SG)Vb<2^ZU!@^~FiJqTU-VsW8Q`~I(sqfc1THH5GFIq#=t|xnmOfs4v;E10 zg-vz?hrj5g(8t=k-_HE%HcRck`{p5JHDy?2B*#QpJG!k&d52&|HGU%Mi5py)FLAhR zJDZv9vi8h=Y=R8n>_iw zOHyhTtOSm?tKrVy@FOHqUtq=SDd02gX_aa;hw~5JrrsYD|73hu51H8iF3xJ>(YWA8 z@b&Rlco>1gFX<9!!biJ|`bwS*cERA*b%6=lMCvm^HGIr&>FD7@p;oQq>e#8 zBSv|l=fUe@Zx@4-KeEocXs7K1ZOs;u*$?7FngA`Bl1)IJ!lT*x1G?X8WGe@lJc4mm zDNWql#X=1U9gwztn%V1IdmZVakvfjS-?fI<;oJoBvG*_M>{$Gl6+(G~)4F!{Fl}s# z%D{v2h4Rg~y=Ge;CRKh;G4gazN-a@~%QYGoqY^Tge5tNk!A=z`*b;(@*Z4u*6x^+)~zGCe> z074#TzOzZB;QbP9LC=1e3_>I5bw(pzy1p0}kdcz~A)6W4x29TxLp&-!pR*poj6rv> z+^I5+kJz0n#kgQ~;c`VeH%W@5aiOi!6f`8U8|u`vVWuQe6y~JZrsr&k9L&X!9I7Uw zooBCm6dh$uSYL7`7>l~k;CnuFi>b3X+08V;RH(ZT&xam8ilaTE*RW4KF0}la^>F2CY7i`y(q1( z(&!BEDJV|^M2`4o9iK~1_=DZU(Fofu_5|_1E?^Z3tBE$=8~xpl1@=dY|Ml{~77PQ_ z<|O}<%9-p@@yAyti0$-H4{3leRxAfYRdjj%$D8-}NbRiHQO|KGYf&$j2$L?wxkmZg zaw8IhtOVD-dlGE>gS@W&OlnZg>mlOD5D$qif6tQ)tB9Kn{}n0t_nV{Nj#L#o$X(`D z2^rWK$L~iaCVR}BfZZ5I4H#pWu{&xO?`C+3HDVco>_EtNl?s}cXSmNDU{Ky;Y5VwiVD|Fi>WdUhJ8(-9;J5U zeiS756}mvMOidz%qWVeGPOf7~;6<+Sz&U}MamsUVG3#6u{!0J-7_F;lVB!Q{uuwN; zlz*SfMA>d40TUmWvTW|eA^MknQJB1T(Dd~(8a}%n(Ez97T#hE-q{QL-mllEXK6ahG zJvT?}!=>KOWnRGVy_QwWcjP~1wgf zxdyb(H+Tg1ND+b={skDrU7M;F2&Pl4)-w9mg9-_!pkz&%O@D{#_qy$o=)Mr1UsM3 zkJYzJ74W_8fSWzJ>(J5RQNlnO+H@cnwfd0V*aRlkE1VZ!LO-M%Q-}ckU7z($iPQ#-|zN%87^Fb1BO?c8KwtdC2TuM2=kg=F-U_2LfVKOmS=!o_%qSHP=o4caw0Ao$Inz_gqjPeS`DA!36n-Z zcaX3*_Vh#A+YdUE3%LP#v0FiqNCgS?<`pLDaT-|YR+I*H875@(!w!amz4>tT3g^74 zu^l>-BrctxUqT17E~|f5VPSy$pA6Y`gu!8sWH~T$p|Xo6@Mte!R~41xOF;`~guh1n zqifv~o?X{HhH0P{yE+-Aon&T&9eO(v87923eHxt;Wl-`DPLe8W@-kC0=3Wy1^@j2L zKO|t#^6xelZ4}oJ-pQ5%qS^d9_22Gn9c5-jY^X#NY2OJ|0I4bv z|M!3ZPK~H*zcrEyP3A{M<^4=2Q^qJS+a%h;`eeQSH?iLfJmQ^+(1)w}8<(UTOuH`R zH!C)cy2UU z?Al$luj|OW!wk#MJZJwCks-Ru8eX{us_;)6+yZl;miCN#iO2(0*ft}0{{>Kmxm}D2 zfJWIkqsUYkXp~Krvpvv_>b}J2|8~k}=ir#BNa5i41`!-@?XHm?XCorDi|{t(aP(29 zwm6;;MN9j@#R7X9)k`?U~QRxFDWNMqk$V05}7G5{_ zkK?v~olzACNiHfBZ>%=WC(M_i!|DIJf&f;JUOK{6K{W zWz^a|`Qhp(LMlQP{7SjQ?|rhMqdifVco?itdIa+$tIp~#5RIygBinM7v8c}Ub*4e~ z3}7wlzoP7XRHxQo_n|`mxjlX;P;EfLlAlv|xl~=|u^C6NnPT^D1dx{lq$ekii6 zn*J`av5N^$>;lc%hTlSdG?iXI+Gl@bj70{9X%)1Etv;6|94#RO9I#)rRt9)FDf*mS z{ZEG9$+3NHV|)`!O^*WUo{EjXPW$2C*BVTlfU9o;_z(=H&60)>?X6>zxBYk9Ndk%h zyWBwk?$+_@m&&duc=zR4Yr4*Zu6#(Jl0&aU71^zYTt33iGHS>d@s-%CSdY|09E!cI zz*)e*suS6`k^8=P-@tFom62N7zWl9HU|1Nj>G!+V)$nv?E# zx5VCdUubnYVYj0TW5o}b%anN^d&%R7e5&UMS`CDo1e0}(v#lIAUZPn&gke^=*3(hd zp;8$HQ1MO(NwPIZH3g(=bmnEtCKlT@bOBfOLF6v{sugiVN|>X@u_Etlhi<$UPSbm5 z|D}iQDMjXbWL1Q>(5FwebR9#U^6z8JM~`z)dwlFd9v@RrslTFsB19^0+6}__P z>4qw4ky=Ei!v}$sUD`#r$VH;@_OUHezM_I!Rc+-T));S7h1Wk7FowAfmYA?nYQNFf z+IOWX>^qB{JoU%SU$9TZTF_gc%_@(gx}14%k^+g$kpIQ@45JJE-o9ll^6&?>**I%M zN@*$9d@hMYqFmRga8)S6moAr8zP)a2H5JS^2j9x7zdLrTTuCN&w&)w1C<98e?k<`w z3Q_HVh{dusACPSY>l99iLR;#$NdjD>Iy_VYYCwL6unD!frfRYsksN`ai4;#%VSu`j zAH=>w<4*ab`YuhCvQ}Z^8&E5XbRMB@pU&1vrqHK|W{VvFc0B7F>&uvR@in_cH895~ zj)nJd2ASJZ3790R34!X|nefR~6oP)Y3^CzPOgtBUIGiFdLP7R-b`-XjXu6f1Wh? zDCZZ4=J3b~d)PyzBw9OTJsksjPWam8hwJ9+i;-B#&qwwPH~w;pA$KMn(sI$YahuI6 zr+2-WPpx$-PT*>w4%2{8nJQYgbW?Ol*9N;}jhWLG*JTBX9qZ*wZqIF&MQ+dKi(w2(L2N8F;BEU>B33Qx zX^P&kULFY5>seD48gPEf@nMF6MBV~*XWRHI03ey;dy=-(hTN>dVWhE%r)Y28nO z&+rmVb+M}7CJe!JQ0dT)J$VJGf0h!B1rw(`BT)8E)eT1>vu~%WRE?!L!?PfhDdMM` z?J9fIU;R8Iz=>ULZ)HnNg`5TI(l2IOw>r+zN#;(a8?}`lV6-UFJiZl?P8bY_yu6U9|hyjIuI^iU7fTbNdR zd{|6YDun)aq9`lh27noC04w#}!`{3#!?i+b+&fa~Cfgd1quCcVLRI1XPJ}BL&8du^ z_6XsjaMoLK)oJ5dnpUOq)%nNKxtrD?F+c>UOmLM4JdFZ#jkJTG3E)BaSCHfCeqZ=U zP3Eh}2M-*s;1FhrKVK=0Vko88Us)~ zG2wA@FystF;Q-wUmzY#ppQ($d@4Rvl<9!hyCuM#&R=UJ*tN7Rt~v0jZ?33*2uOsWm0oJ0x6 z>2*Z0lYR`F2$0uSrYg&T1*o9nR3?>DW!I%1Kr*~a@+2~k!phBPv+|35@AS0Wm}T*< z>plhv{#W52zf0D&2NZq|7g9s37q$5t$~n5|uNLEV_gH_A6WOr}{(u0Nee>_cu;Kn< zj9&a0|7GI46!3XGUQ#<2ERN68-MA;br`)FBCJwyGHN-|m5lzoP9V~{TSu-<__X?k` z3<~y4yju^6Sj2Iu;x+yK8U-KZ4(+SWF-`vgS`dUlXSbkB=x>7tRqe#YF(&`<>8&o` zJ5SHd;63$ZC+=bhA83MzO`uu3;Pre|FR?1DPr^%2@;kLwpnsv5(Y2-G;VHj=72Z=o6VhK-2Lsss7tMmQF>ljxkZc)>#-x0Kx0${?ks| z$abYKvQ?*{{jFN~{J>ua5(wusdDoOKlS`i7dl*Ohz;B5!fMZPGrG{50nsKYERqpY9 zQ#dZ5elsD56qbdob2l(4e}zsrP4s$<@{B6%<6C(sKx^uSq9$+F9+vEt;X?=I@W6xX zDu2IqR_-`jVDbi5gQ$ek*yAB}$HLJFGAyhOpqiY2ETgDeYN9ax^t#254CK+kUnx>A zkzlK!&AGZW9l!GT?#2q#CaD&lGf;5^lAb>T+;pn%%yOe4Z>Qty~XmfD-kZ zYys-gnrsSYvOceNPQq2E0Ea}#un+AjE^?#eO@dW{*T6E%JXOaxnfK{GutXOJfN4zL zYd6(5R51Q7tkHnZS*21uBykZc%%-<% zsHybklBmfCWoiHfU}UUQ%964lLozzmz^gII7d7;A z@?l(=Oc}iewBmW1GM*I{hjC5DX;5YFrxHK{?SV3pMsoW5qUDmC+TpBFzf_d@8e+97 z_o$WJj`g``+514uO7VdUj4A3@kLv>(bUIbnJv5G9W{uz{2MU`r4J|4;lrv1h{U;q` zo|-oy+0_)$x4YXEjY0kqPk)r5;M%rr$v~qUbJyMdMAvQOmWS4gasx&je06>%kLlq1u2b#R%|xa zXXnINE{Nsp;QX0NE$9}U0_)MC(_=FUkDQ==0?XZa9_7&%t!4hkx|G96yZWwU8`uN( z=UEs)Jil;FklE=Hsje=vn|K0R9h#ZDefdLMS{jgXJ8*|C-v4*v)4sy?;S&r>!v@2* zaKJF~(%sdK^#8MT{Z%Y6g)ngf#5Y(*MS|hD| zy84j7b)_$Ls8WO)PDf=SOw}5~Fu?8u_N^Hyt)|MS5MQP&tvPzS=MjuAcG%SZ)~tooYZ`yA?QZ&^XfSCU$0c*_Qt+t_aV0pM>^Az@SG)Rmb_bWYMhzDu;%U9sp7 z&WeB1)dH8b4qk_Qs>8E{r^A$>SwOwP_kALo3;3m}shtaVk<6fpBdO{_S+BWdb1Dn41R0~KXe=m>6v*_ zD&S|#fo;cUZ+-~GsBU|xX*9f@x5GS8-81ymtcDmhx;Q5G7&Pnp@Mk|{vR0g|D>~k zZwsKY)2sgwPPkrQJTRoq^V9fiB`oQAfw(o-y2RZAnR)Vwz6)Voq4k?{HhZMCPJrhJ+Q~k_n zeT*6~4~^muVN^ZeaU=oiqs>v;t(05-ahAw#aZ7J7gK0r2Rs8WVJukz~K2!`#Zuj`Y zk#rEqc+BFacLqAJdt?LY18ABtF`m=2y*lT^&j#V8aVQ{316HUbzsCT_A3eo)U`*HBY{8DOuCBrMa(;Jx1d;(;NsBm#Cgx3+uDh zYLdjSQBeO>CR$f~ZOMxMRdwLGmGJTU>JpjfxQ|Ga`H2X=yaL?Ddpz_#~kkP8Sy9l;Gymq)c-n7P_rDq`LF>Xs3IWWB->OveV(w?Xwn6v!++&ADy^^ljfXbf*qFmS#2_Gb1MY zGK9u+ldE{pbBIRquU?YWB2w9%xlWGFq6z zg%lG)T50|663QP=b7slJOP<0xKiDmT&}5!VUB>>Jn!_Dm7;mo7+I1k&YOdTT(BM|v zNHauCG|iMx{J#1Ogs8QaJ4yxG$_U$_YWGuY!pyR@T@=#AiY81+_mbp=gmma3w{l6* z>X74-jn^?P5)nCX0k4+^x0Vf*n~#Rg&LR{T8TyDrx=(v@^Q;Wd3#Ik)%8PGud2i%O zUtfPVxXG$)%*2<~Jk|7j7#DRqv0dr1v_{eWVP*Zrf|!8jo8NY}vpGN)_#w%(H@@tO z!j7HRfbrub7Si7aaJ?(wo$2BFM?C{VtG}QPn7AJaaC4;P*FQ4ZC~xeWxJ__?2EhsC z@8O}%k*ru84&FL%I(P51;Dn+T#cNJE?8$?;OSqD4Um^{IYD3r>JC=arjxQh1`-wrB zj#?WSpnwqX_k8s)vGvxkKDOXj*-y=601VjD`Y65o#AO4{N!X_qgBbv!O=;B7 z;$3%G8@F_~doVF&q9r^=X`wvb%9x9QDu|g0^4mx`1k-2z*HNw(;OEa+=DW*?E6ZPr zQ?bwFGM2-=Muff>xT^E>);%VllfS(iiPN|WKTdF&im@(Up#ZEm%j(RMj2l>A+Nsuvbw@box2nkSBi2W?orY8g2Nk_arLod zM3B#7kcdxa54n#=FzgP`qvD-nRy<;7?Cc=FWc^v8@efeoW@53N$1=A`;q$-N%bT5> zC>!I|{X+FdLv(S%%CXOEB~B9xd#yu8^5KvblMHQp@d`B+731{z%2~(Wroi04)~I69 zE#Wz;DL!3_T@fhP2Di*wQO+Sd*O<>?P-PITsFRa=jk~7#!6%)=_CLVl= zoxpqgZS?yFyArm_e+&T*6So31CC#X&MAs@1)7G{iLl5g9=vZj=TEBKkjJdUF|l5 zfL5{W?T}cUHFKY~^z3gSMxY1^+Y@b)Agd|x>B}|#cm%1^f%85b!;&=}L48 z_ie^pGO&&~0*1MU(Rh1GSMLPCE>`}@YBi6M=MrQwLICKiY$hr9mkC-S3Hi@ZEdWuu zhyiW^;;2H()BifGO4Yvmt)KnjOQ~q9t$4lk2IK}e@;Z<|jJB~I!NJB%n0VUATBa(3rOXH8-<6x@Bic}|^J#JT++q9MDaNN4W zPa*4QP}bW7({IF~x#$J|t-#}C1Tmw1M&fC{R#0$q};kb1T8fJA3v3<_```k>R>9T|19h6v4WNT#q8n!4zz^8oBfk*Rmx-DPh>}w@UoV^JUyj7n{<)e zs$u!Qu%u4+WLrJkVbgDc9}iCbN!(h`N9_ z!SnCoN$E!LI$Q0EzcWjkV&Lrf0ZLDCGToFT2+3LyV9rs`)VXQHJIgTw+J))0ru54z zf+=9+;g^-aevwssn3VSnoL(y<_3g(3YNg3dk0NC4-+&xr(QgTd|3+zjVs}*#z{Bk) z#ato|{?FO<^55CTBP-<|Vk#|_A;yITwBKB#A8WFKiElNl_h^`!oU#g))WHsQi^m1`tZ_-K%nF$S?aOH#S+2no=6oPuxKd#s0&J`IQ)*pR~bV zYd7Gw`taUSbB8gU(RcU?QoaW?oGFGEdiscTD%fhGJKB<&VmZ63pWr26Ay#W>6BL2^ z)&*&iT|pF=L-8)SLcbrF|IOX=GtkliA2PAaLXOGGzcxiC(ttT3MeTrt1ieX}q*`1% zmz;UA=K8Ejd&Ty`EhH++rf}|11qzzm{|tEEZ0L0&^#5bHDag_ETA=|R6`zWfE0A%J zN)vL#R;ZzORM8}&W?cM8^ksZR3J~&@>PVGK0E`@6cHxi*Jn3;J%sKjC{GoRqBY1VQ zTNN}Kz=hLt$^5gW_;{Jxs|r3~DjhR9C^o;H&yAJ#(qcx9O3`7pL6hnxjFXv}83f#* zT3*invcg9beJ8c;-()W1OqQu^r6BxlBhX+rto+)_O7`s<;3d{}xxH2-BQM*e{i~`? z%WWiQE6E=GZ^+HoxrSf1vx&%uIhqESaa)F~cvbV$(fM~+*9z8-m!a4dK=rqR?cD_8 zq6mI+gUoBuH_$Gt2X>QOEvX0NubphPcW?T=l{w9d`xqn2v-Lv1Z)mc2U=h!EnFXwS89-v$bEP#2^<=t^@d}=p`W$+ zfugp;vN&mV;Q1K?D8~_!GW=64m>QIoEQ4i)-WGXa-n)G&pA@Zn$3fB1uL3AMiS z#YH(XK&AY8F#GZ$UnEv{cl)c{pcFd0ihoVIbj{ygWOd0HQo#q2LrUoXc*L z1JQP3=yu`pN0Z(<9qYzT;{#FGDn{Fu!v7pZC!_QJ*GZ<*5|}#&NDbh*ckXNt{qrCH z|Gx6^-&g)@mm&ZCZ917P(Ea{>hW3B4_m*KjBaL)}O6N<8w1~Vk zNJ*n0UD7B>Bi$j=NOy>o#48N~XFjgI_V?}Yti3(kJhfIvWVc@NOq3f1ETs$$UCKJRxN(x}=_UHu z&x-D`yrV3O@i+I&CHVXW#MzgF&QZPF8j zI<=yG!Z_|jz*1@9%+zvMMVq`}1@u^#&k_dzkAp^6R5XegTc475e%15_h*VT%h%rtM zsUy34Gy{5Pi1QTwR00s0|KnevM|cSMlV<3VBj%JOHS`zxCAdBVv@bGU9fIZV(g{Vg z-g2!{!4E1K(cF9XXuYXKT7$)vyNT$bWkI7W4;2R$kT{3Ebz1v_SA_!lNd6|-(s_~E z(s|?W`N4fgZ=#E}D0Hd|Uc*+O%&vATC}rxWJochk2fG`lU|~V;4Tboa zzw*o}%s-9ZzD6OJUY$S=F4mwQeqk!jz*GK8{MUX+Q|Cpd*=m<2U14@8W&? zAEeaRJw#1D0CjDL;twCr1RyL|%3YrvbvuqG%#V(|{oFNLV87F3!BzMA#1`DG&=89C z)DODui@2`?7+0>mfnNWARccQmr8dHLZ5TL3*I?foZZrJSiOV8#rL+j9fh}b4y6rwg zoeL;GwV!XGrUZ`#HO)j4!T)JDZJkW+vnK%IpLeRoVuijm9o25W+7Hx5vEMPxrBP@> z$w1U~3^(NvF+BXsfph3p>WYmgb@>Yq0djs#J`bvx@;B9{#j55qI3p&9s1M-|zP~8S zw-Wo^g)W!^LY5yoTtR`)vdZSDAGq@9znXne)BK&xb8w{T`Zqf=K@*nO@f>7ln5m{5 zqk;>Oj+vC6UB?t)m~L!;>+R~hoN~I?fX{y^BPTF&{g-xJZ?AOTCw_tBFwM@=2qFQ*%60c}|D`Dinmbc$ zielg?gKZuEE%0=T)uxegGKMd_j+Y8NoxB?g4r+|d`#SDCLsRLMkK$QM^4fLWq(>w7 zz^@2dmv=l%ge1H}<(nZY?PQEHEmibGV62?q^;Wxp3zcaT?u%08gWKDdA0^mZKLiI- zz!lC0A%Tu!u9?9cz<#KFcDj-Tf(WB|^o{=~Sjxrx8^<_t;2U%$8{_WxKogeG%~C0V zZ?JQw8{&(9sm3EOg#*BXg$g*%<3#~aD1~|WHc0<|y#MWD5hTMjh7!(5fyeyCL||DL z1Ct1+6hV{kJ#PT0%7`c=oB(Del(+C9Drm;l5{~JCCQrUg061x>J%nFiP=YHVZ;}fR zfFvA2a8WgA$b}p;0}vx)cmn~4Jhhh(jjGe2O9-Y$m&$5@a>D=EW(TgoMWf(hX}>^y z3tsitNSfFK^Zu1fpaH%iWzKcKT>>fFyllg&`OC0P{jt3FICDNlvYBa<6b`6czkkJ< zaejdVyJ>#G49zGv*^adc%&6^d*O?{=)Y2;PPid2aaRq9IyJkZMXEdz5F%!%T;~nu< zCp2fFN0up4U?cHm7{^1ELGR8iz0dzr8i3^3?bB<5OHLR*rw-r*n+CWrDL)DU2jIV& zDp0d4P7OEEG$7VWStcJe+ad;VUzi3|Aj%X`-e*c=+8aRiJ z_pPr(caa5D#0uJVV+sUnMI2^BhB59*d2uN;;P<3k@=z8B^O%JJ3>vT;`*6oeKt>^(fXp2pTYHm9F3u zFkqQt<{2#L+VV`LGGj_ec%`+f$z;H{Qjg5iYM|NwdDG%kmW~x?s53R!spkT}MDUY< z1feT{oLPMvv!=fezKkm%ApWZf=>cgom5r~G;Hgcu%(G;m*D!sbbnTacMfpZ`S0G>Z(fWUbU8902jN8K*y!tFGTVJp>N|;K$Si&P zicggl3p~(!O!jdYjaYy$xnN^VSrR7Qm=iI9jdFt`S!#@Lix#{|rPO6}3%uD+XIoc= z!E9F``{|+|>LTfxYCSo4aD(FASakJte(BrdFkUy3>{41c*>P<$?a3kS&ZPQq&B>U0 zI@=U%R=dY3S$(TPU=WsNRkC?E1uA8^za7yL3w*6!f12SOgj37eW@zHq7Hmjj@`1Oc zly;&s;k=L4?vppcw<8o{0q~*yYJW0#Q7@FN`4u$i*LgOUjG3jhtlTStufdmcBYeed z;DNad5n3%;2)!FeJKh7Z+bA3c2VkPw64OsI+JdX^4$K-wv<1)J)HATug-ILSwBF&6gGuj%71{F?mA~^5p9m^)p zU};Q((SLj+XbWDnV6^%vM+gS3kev7C4H!yMN5p~x*a~vRFO}u^pvRziX9`uYN7Z!J=-2tXHXAW9kzy3h}tT~L8`Q*om5J$L}#)xUWB z-uni{^gAIJH*PR29Si$6upk_Jt<>6jQm_fE<<&i}E+|4xEy(0Pr{M`6B4WH;D)eMKEZ3 zn`CWmFqCpaULSfOijNsozkI$A9^BIJvgZSP6v_0&;}FbWgOg>2KLFYa4gP))Na8|X zJ15J6H!D6RJFtT;o)A3s<%POnGWzmC?$@t9Im|9TVzGPtez|Y16Aa7fRh3pS5HT(1 zZ#!l{#6EnQW@-ns&SQ3Qrv<~+v$KC?17@umQQL?CX3btVYi$VjG@NUK1Z)Wqy)8)e z!0g1K*_i~^HX=cy{?x#|!V0{oZT9!U`ukx0eX#yMSbtlrzb)4Pqb=4$ci+p?wY@Lu z64%7L$!&|cx=76}KG(MetJFlyngVl5jy?L72Y`bZ%!F4ID!|SL_$M3ALHHu_^Gj6- zUtnsemUDnNH`^?7m>_(?P|9Qgp{3-P5koTIK{Ccf&U?sq-;5SXfx3uAPVtfj4~PXG z6;XP_x;dMF5(Z&okjq7o68QW?bZamZx5dy(w`EEN@K|cXq?rum9Jl}oz(IfVGzr-J zPl3IkaG1bPw@RvhaDaYV&a#6EfS@6?0wz5Y@TMz_=Rqt4=?HaWiy?raDkJxp4m?l| zv>nj`=%661J53epqQ_6Q33_mY;;wd-_O4_Lm)FVOUOS$Ij5^NGZzaB0m$0l*rW>f& zar@$AgK(0Z$ZA_5Xh>SnV+o)kEfEL3U?P|-Ix2r{9n>+!8-f%`@QXy5$reV?ItBG8 zs6rr8(NwjX3H%a20omJ7s0)G*iaG$GzP(H3CXfWWU_uo*4FeBAC%X6a8&Mr`+JkYz z)5+1XN%61YVBDS*UkW5bueQVlIBAH-AsYfYe41Z+|L?&`()_|ZwgO;${Y-XeZ@}7t z4J&l0^b(mT^8q3(etIF#(G<%dhAViZ{(&ufq{(ep!ZGWqjQC4;_4Wh?@jAWSV&!(;?u>UZ$jDGNwou_wQ>f(M`zv(