Introduction to Artificial Intelligence - Homework Assignment 05 (20pts.)¶
- NETIDs:
This assignment covers the following topics:
- Basic NLP Techniques
- Recurrent Neural Networks
- Transformers
It will consist of 6 tasks:
| Task ID | Description | Points |
|---|---|---|
| 00 | Cipher Classification and Dataset Creation | |
| 00-1 | - Bag-of-Words | 1 |
| 00-2 | - Bag-of-Characters | 1 |
| 00-3 | - Cipher Classification | 1 |
| 00-4 | - Dataset Creation | 0 |
| 01 | Recurrent Neural Network | |
| 01-1 | - Linear Layer | 0 |
| 01-2 | - Embedding Layer | 1 |
| 01-3 | - tanh Activation | 1 |
| 01-4 | - Recurrent Block | 1 |
| 01-5 | - Recurrent Neural Network | 1 |
| 01-6 | - RNN Training and Output | 1 |
| 02 | Torch Recurrent Neural Network Comparison | |
| 02-1 | - Torch RNN Class, Training, and Comparison | 1 |
| 02-2 | - RNN Short Answer Questions | 2 |
| 03 | Encoder-Only Transformer | |
| 03-1 | - ReLU Activation | 1 |
| 03-2 | - Self-Attention Block | 1 |
| 03-3 | - Encoder-Only Transformer | 1 |
| 03-4 | - Encoder-Only Transformer Training and Output | 1 |
| 04 | Torch Encoder-Only Transformer Comparison | |
| 04-1 | - Torch Positional Embeddings Class | 0 |
| 04-2 | - Torch Transformer Class | 1 |
| 04-3 | - Torch Transformer Training and Comparison | 1 |
| 04-4 | - Transformer Short Answer Questions | 2 |
| 05 | Final Evidence Collection | |
| 05-1 | - Selfie with Evidence | 1 |
Please complete all sections. Some questions may require written answers, while others may involve coding. Be sure to run your code cells to verify your solutions.
Story Progression¶
Thanks to your work on homework03 and homework04, the police were able to fully extract the text from the kidnapping letters:
"v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx cynl ba ybpxre 69 ba gur frpbaq sybbe bs phfuvat bgurejvfr v nz va erny gebhoyr"
Unfortunately, it appears that the text has been encoded somehow! As you try to recover from Thanksgiving break you realize that maybe you could treat this as a machine translation task. You could pretend the cipher is the source language and the plaintext is the target language.
If you could figure out the cipher then you could generate a training set of (cipher, plaintext) pairs. You could then train a seq2seq model to translate the ciphertext into plaintext!
Unfortunately, because your professor hates you, he's making you write an RNN using only tensors for the first part of this assignment, and a transformer using only tensors for the second part. Use the dataset available from the GitHub repo for training, testing, and validation in this assignment.
import torch
device = torch.device('cuda' if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
letter_text = "v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx cynl ba ybpxre 69 ba gur frpbaq sybbe bs phfuvat bgurejvfr v nz va erny gebhoyr"
Task 00: Text Similarity¶
Task 00-1: Description (0 pts.)¶
Bag-of-Words¶
Your first step is to figure out which cipher was most likely used to encode the kidnapping letters. To do this, you should test a Bag of Words and a Bag of Characters similarity metric. As with most of machine learning, we need to somehow convert our text into a vector to allow us to compare it to other vectors. One option would be to use a pre-trained embedding network like word2vec or GloVe (this would be like using the output of convolutional layers as features for MNIST). However, for now we will just use a Bag of Words and an N-Gram model instead (much simpler).
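For intuition (a hedged sketch with made-up sentences, not the graded bow_similarity cell below), cosine similarity over word counts boils down to a dot product divided by the two vector norms:
import math
from collections import Counter
a = Counter("the gas was the weapon".split())
b = Counter("the weapon was a knife".split())
dot = sum(a[w] * b[w] for w in a)                      # only words shared by both texts contribute
norm_a = math.sqrt(sum(c * c for c in a.values()))
norm_b = math.sqrt(sum(c * c for c in b.values()))
print(dot / (norm_a * norm_b))                         # ~0.68, since a few words overlap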

In the cell below, write the code for the Bag-of-Words comparison
Task 00-1: Code (1 pt.)¶
import math
from collections import Counter
def bow_similarity(text1, text2):
"""
Computes the cosine similarity between two texts based on word frequencies.
Splits the text on whitespace after converting to lower case.
"""
words1 = text1.lower().split()
words2 = text2.lower().split()
counter1 = Counter(words1)
counter2 = Counter(words2)
def dot(counter_a, counter_b):
return sum(counter_a[word] * counter_b[word] for word in counter_a)
# TODO: Compute the BoW similarity between two texts
if norm1 == 0 or norm2 == 0:
return 0.0
return dot_product / (norm1 * norm2)
Task 00-2: Description (0 pts.)¶
N-Gram/Bag-of-Characters Comparison¶
If you can remember all the way back to lecture07: Markov Models, you'll remember that we've actually seen these before, in the context of a Markov Babbler!

However, for this task I'd recommend using a character-level N-Gram model instead, since we really just want to compare character frequencies: it's tough to predict how a cipher will modify whole words, and there's unlikely to be any word overlap between the two texts.
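As a quick illustration (a hedged sketch with an arbitrary snippet of text, not the graded boc_similarity cell), counting character unigrams gives you the Bag-of-Characters vector, and the same idea extends to character bigrams if you want a true N-Gram model:
from collections import Counter
text = "gur tnf jnf gur jrncba"
unigrams = Counter(ch for ch in text if ch.isalnum())          # character frequencies (Bag-of-Characters)
bigrams = Counter(text[i:i+2] for i in range(len(text) - 1))   # overlapping character 2-grams (spaces included)
print(unigrams.most_common(3))
print(bigrams.most_common(3))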
Task 00-2: Code (1 pt.)¶
def boc_similarity(text1, text2):
"""
Computes the cosine similarity between two texts based on character frequencies.
Only considers alphanumeric characters after converting to lower case.
"""
counter1 = Counter(filter(str.isalnum, text1.lower()))
counter2 = Counter(filter(str.isalnum, text2.lower()))
def dot(counter_a, counter_b):
return sum(counter_a[ch] * counter_b[ch] for ch in counter_a)
# TODO: Compute the BoC similarity between two texts
if norm1 == 0 or norm2 == 0:
return 0.0
return dot_product / (norm1 * norm2)
Task 00-3: Description (0 pts.)¶
Cipher Classification¶
To try to determine which cipher was used to encode our letter, we can encode a test string with various ciphers and see which of the encoded strings has the highest BoW and BoC similarity with the kidnapping letter, essentially transforming this into a classification task! The ciphers to test are: Caesar, Vigenère, Substitution, Affine, and ROT13.
Task 00-3: Code (1 pt.)¶
from pycipher import Caesar, Affine, Vigenere, SimpleSubstitution
# Sample Lorem Ipsum text
text = ("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.")
print("Original text:")
print(text)
# Encode using different ciphers
caesar_encoded = Caesar(3).encipher(text)
vigenere_encoded = Vigenere("key").encipher(text)
affine_encoded = Affine(5, 8).encipher(text)
substitution_encoded = SimpleSubstitution(key="phqgiumeaylnofdxjkrcvstzwb").encipher(text)
rot13_encoded = Caesar(13).encipher(text)
# Show the encoded texts
print("\nEncoded texts:")
print("Caesar Cipher: ", caesar_encoded)
print("Vigenère Cipher: ", vigenere_encoded)
print("Substitution Cipher:", substitution_encoded)
print("Affine Cipher: ", affine_encoded)
print("ROT13: ", rot13_encoded)
letter_string = "v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx cynl ba ybpxre 69 ba gur frpbaq sybbe bs phfuvat bgurejvfr v nz va erny gebhoyr"
print('Kidnapping letter text:', letter_string)
# Save the encoded texts in a dictionary for easy iteration
encoded_versions = {
"Caesar": caesar_encoded,
"Vigenère": vigenere_encoded,
"Substitution": substitution_encoded,
"Affine": affine_encoded,
"ROT13": rot13_encoded
}
# TODO: Calculate and print the BoW similarities between the encoded sample texts and the letter text
print("\nBag-of-Words (BoW) similarity with test string:")
# TODO: Calculate and print the BoC similarities between the encoded sample texts and the letter text
print("\nBag-of-Characters (BoC) similarity with test string:")
Expected Output¶
Original text:
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Encoded texts:
Caesar Cipher: Oruhp lsvxp groru vlw dphw, frqvhfwhwxu dglslvflqj holw, vhg gr hlxvprg whpsru lqflglgxqw xw oderuh hw groruh pdjqd doltxd.
Vigenère Cipher: Vspoq gzwsw hmvsp cmr kqcd, gmxwcmxcdyp khgzmqmmlq ijsx, qoh by igewkyh roqnyv gxggnmberr ex jkfmbi cd hmvspo qyqry kpgayy.
Substitution Cipher: Sgktd ohlxd rgsgk loz qdtz, egflteztzxk qroholeofu tsoz, ltr rg toxldgr ztdhgk ofeororxfz xz sqwgkt tz rgsgkt dqufq qsojxq.
Affine Cipher: Lapcq wfueq xalap uwz iqcz, savucszczep ixwfwuswvm clwz, ucx xa cweuqax zcqfap wvswxwxevz ez linapc cz xalapc qimvi ilwkei.
ROT13: Yberz vcfhz qbybe fvg nzrg, pbafrpgrghe nqvcvfpvat ryvg, frq qb rvhfzbq grzcbe vapvqvqhag hg ynober rg qbyber zntan nyvdhn.
Kidnapping letter text: v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx cynl ba ybpxre 69 ba gur frpbaq sybbe bs phfuvat bgurejvfr v nz va erny gebhoyr
Bag-of-Words (BoW) similarity with test string:
Caesar : 0.0000
Vigenère : 0.0000
Substitution: 0.0000
Affine : 0.0000
ROT13 : 0.0000
Bag-of-Characters (BoC) similarity with test string:
Caesar : 0.6023
Vigenère : 0.6518
Substitution: 0.5527
Affine : 0.4254
ROT13 : 0.9198
Story Progression¶
It definitely looks like the text was encoded with ROT13! Now we could of course just use the ROT13 cipher directly to decode the text, but where's the fun in that? AI is the future, right? You're using it to replace your ability to do even the most basic thinking tasks, so obviously we should use it here too instead of the closed-form solution!
What we need to do now is create a dataset of (ciphertext, plaintext) pairs to train a seq2seq model to decode the ciphertext. Luckily, we can use a library called NLTK to get a list of English words, and we can use that list to generate the pairs.
Task 00-4: Description (0 pts.)¶
Dataset creation¶
NLTK: Natural Language Toolkit is one of the most popular libraries for working with text in Python. It's a great tool to have in your toolbelt.
In the cell below, let's create our dataset of (ciphertext, plaintext) pairs. We can start with 10,000 "sentences" and see how well the model does.
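Before diving into the full Dataset class, it may help to see its two ingredients in isolation: NLTK's words corpus and Python's built-in ROT13 codec. This is just a quick sketch (the sample phrase is arbitrary):
import codecs
import nltk
nltk.download('words')                        # one-time download of the English word list
from nltk.corpus import words
print(len(words.words()))                     # a couple hundred thousand entries
phrase = "hello there"
cipher = codecs.encode(phrase, 'rot_13')      # built-in ROT13 codec
print(cipher)                                 # "uryyb gurer"
print(codecs.encode(cipher, 'rot_13'))        # applying ROT13 twice recovers the plaintext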
Task 00-4: Code (0 pts.)¶
import math
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk
import codecs
import numpy as np
import torch
import string
import torch.nn.functional as F
import random
from collections import namedtuple
class Rot13Dataset(Dataset):
def __init__(self, word_list, device="cpu", one_hot_targets=True):
# ---- vocab (owned by the dataset) ----
self.vocab = ('<PAD>','<EOS>','<UNK>','<SOS>',' ') + tuple(string.ascii_lowercase) # 31
self.vocab_size = len(self.vocab)
self.char2idx = {ch:i for i,ch in enumerate(self.vocab)}
self.idx2char = {i:ch for i,ch in enumerate(self.vocab)}
self.padding_idx = self.char2idx['<PAD>']
self.device = device
self.one_hot_targets = one_hot_targets
words = [w.lower() for w in word_list if w.isalpha()]
self.data = []
for _ in range(10000): # choose your dataset size
k = random.randint(3, 12)
phrase = " ".join(random.choices(words, k=k))
rot = codecs.encode(phrase, 'rot_13')
src = torch.tensor([self.char2idx.get(c, self.char2idx['<UNK>']) for c in rot], dtype=torch.long)
tgt = torch.tensor([self.char2idx.get(c, self.char2idx['<UNK>']) for c in phrase], dtype=torch.long)
self.data.append((src, tgt))
# ---------- helpers ----------
def encode(self, s): # string -> 1D LongTensor
return torch.tensor([self.char2idx.get(c, self.char2idx['<UNK>']) for c in s], dtype=torch.long)
def decode(self, idxs): # 1D LongTensor -> string
return ''.join(self.idx2char[int(i)] for i in idxs)
# ---------- Dataset API ----------
def __len__(self): return len(self.data)
def __getitem__(self, i): return self.data[i] # (T,), (T,)
# ---------- Collate (pad, optional one-hot, move to device) ----------
def collate(self, batch):
X_list, Y_list = zip(*batch) # tuples of 1D tensors
X = pad_sequence(X_list, batch_first=True, padding_value=self.padding_idx) # (B,T)
Y_idx = pad_sequence(Y_list, batch_first=True, padding_value=self.padding_idx) # (B,T)
Y = F.one_hot(Y_idx.clamp_min(0), num_classes=self.vocab_size).float() # (B,T,V)
return X.to(self.device), Y.to(self.device)
# Get words from NLTK and take a subset.
# Download the NLTK 'words' corpus if needed.
nltk.download('words')
from nltk.corpus import words
word_list = words.words()
filtered_words = [w for w in word_list if w.isalpha()]
dataset = Rot13Dataset(filtered_words, device=device)
train_loader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=dataset.collate)
print(f"Total training samples: {len(dataset)}")
Story Progression¶
Given this dataset, we now need a seq2seq model to decode the ciphertext. We'll start with a simple RNN model that you'll need to implement using only torch tensors.
Task 01: Recurrent Neural Network¶

Remember that unlike an FFN, data in an RNN loops over the sequence of inputs, allowing us to build up a hidden state (context, in my words) that can be used to generate the next output. There's a really good cheatsheet from Stanford about RNNs that I'd recommend.
NOTE: We can re-use a few parts of our FFN from Homework04 and I'd recommend you lean on your solution heavily while writing this homework.
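Concretely, the recurrence you'll be building in the RecurrentBlock below is the classic Elman update h_t = tanh(W x_t + U h_{t-1} + b). Here's a minimal tensor-only sketch of that loop with made-up sizes (not the graded code, just the shape logic):
import torch
B, T, E, H = 2, 5, 8, 16                     # batch, sequence length, embedding size, hidden size
X = torch.randn(B, T, E)                     # embedded input sequence
W = torch.randn(H, E) * 0.1                  # input-to-hidden weights
U = torch.randn(H, H) * 0.1                  # hidden-to-hidden weights
b = torch.zeros(H)
h = torch.zeros(B, H)                        # initial hidden state
outputs = []
for t in range(T):
    # the same state h is fed back in at every step: h_t = tanh(x_t W^T + h_{t-1} U^T + b)
    h = torch.tanh(X[:, t, :] @ W.T + h @ U.T + b)
    outputs.append(h)
out = torch.stack(outputs, dim=1)            # (B, T, H): one hidden state per timestep
print(out.shape)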
Task 01-1: Description (0 pts.)¶
Linear Layer¶
First, we'll need a linear layer again, which you may consider reusing from Homework04.
Task 01-1: Code (0 pts.)¶
# TODO: Implement the LinearLayer class (Consider copying from Homework04)
Task 01-2: Description (0 pts.)¶
Embedding Layer¶
Now that we have an output layer, we only need three more components for our RNN. One of those is the embedding layer, which will take our tokenized input and transform it into an embedding. Luckily this is extremely similar to a linear layer. Therefore we can inherit most of the code from that and just rewrite the forward function a little bit.
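The trick that makes this work: looking up an embedding is equivalent to multiplying a one-hot vector by a weight matrix, which is exactly the math LinearLayer already does. A small hedged sketch (illustrative sizes only, not the graded class):
import torch
import torch.nn.functional as F
vocab_size, embed_dim = 31, 8
W = torch.randn(embed_dim, vocab_size)                       # same (output_dim, input_dim) layout as LinearLayer
idx = torch.tensor([[3, 0, 7]])                              # (batch=1, seq_len=3) token indices
one_hot = F.one_hot(idx, num_classes=vocab_size).float()     # (1, 3, vocab_size)
emb = one_hot @ W.T                                          # (1, 3, embed_dim)
print(torch.allclose(emb[0, 0], W[:, 3]))                    # True: the one-hot row just selects a column of W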
Task 01-2: Code (1 pt.)¶
class EmbeddingLayer(LinearLayer):
"""
An embedding layer
Attributes:
W (numpy.ndarray): Weight matrix with shape (output_dim, input_dim).
b (numpy.ndarray): Bias vector with shape (output_dim, 1).
vocab_size (int): Size of the vocabulary
X (numpy.ndarray): Cached input used during the forward pass.
dW (numpy.ndarray): Gradient with respect to the weights.
db (numpy.ndarray): Gradient with respect to the biases.
"""
def __init__(self, vocab_size, embed_dim, device="cpu"):
"""
Initialize the EmbeddingLayer using the constructor from the LinearLayer
Args:
vocab_size (int): Number of tokens in the vocab
embed_dim (int): Size of the token embedding
"""
# call the LinearLayer constructor
super().__init__(input_dim=vocab_size, output_dim=embed_dim, device=device)
self.vocab_size = vocab_size
def forward(self, X_idx):
"""
Compute the forward pass of the embedding layer.
Args:
X_idx (torch.Tensor): Integer token indices with shape (batch_size, sequence_length)
Returns:
torch.Tensor: Linear output with shape (batch_size, sequence_length, embedding_dim)
Notes:
The bias is disabled as it is not used in the embedding layer.
"""
# TODO: Implement the forward pass for the embedding layer
# Build one-hot so LinearLayer can do its usual math
class Tanh:
"""
The tanh activation function
Attributes:
None
"""
def forward(self, X):
"""
Compute the forward pass of the tanh activation function
Args:
X (torch.Tensor): Input data with shape (batch_size, sequence_length, hidden_size)
Returns:
torch.Tensor: the tanh of the input X
"""
# TODO: Return the tanh activation
def backward(self, dA):
"""
Compute the backward pass of the tanh activation function
Args:
dA (torch.Tensor): Gradient data with shape (batch_size, sequence_length, hidden_size)
Returns:
torch.Tensor: dA passed into the derivative of tanh
"""
# TODO: Return the derivative of tanh
def update(self, lr):
"""
Keep API parity with trainable layers; tanh has no parameters to update.
Args:
lr (float): Unused learning rate argument.
Returns:
None
"""
# TODO: Update the parameters with learning rate lr
class RecurrentBlock:
"""
A Recurrent Block
Attributes:
W (torch.Tensor): Input Weight matrix with shape (hidden_size, input_dim).
U (torch.Tensor): Hidden Weight matrix with shape (hidden_size, hidden_size).
b (torch.Tensor): Bias vector with shape (hidden_size).
Grad_Info (namedtuple): Cached hidden state info using the Grad_Info tuple made in forward and used in backward
dW (torch.Tensor): Gradient with respect to the input weights.
dU (torch.Tensor): Gradient with respect to the hidden weights.
db (torch.Tensor): Gradient with respect to the biases.
"""
Grad_Info = namedtuple('Grad_Info', ['x_at_timestep', 'h_at_timestep', 'h_prev_at_timestep'])
def __init__(self, input_dim, hidden_size, device='cpu'):
self.input_dim = input_dim # This should match the embedding size.
self.hidden_size = hidden_size
self.device = device
self.W = torch.randn(hidden_size, input_dim, device=self.device) * math.sqrt(2.0 / input_dim)
self.U = torch.randn(hidden_size, hidden_size, device=self.device) * math.sqrt(2.0 / input_dim)
self.b = torch.zeros(hidden_size, device=self.device)
self.activation = Tanh()
self.hidden_states = []
def forward(self, X):
"""
Compute the forward pass of the recurrent block over the entire input sequence
Args:
X (torch.Tensor): embedded input with shape: (batch_size, sequence_len, input_dim)
Returns:
self.outputs (torch.Tensor): matrix containing the hidden state at each timestep with shape: (batch_size, sequence_len, hidden_size)
"""
batch_size, seq_len, _ = X.shape
outputs = []
self.hidden_states = [self.Grad_Info(
x_at_timestep=None,
h_at_timestep=torch.zeros((batch_size, self.hidden_size), device=self.device),
h_prev_at_timestep=None
)]
for timestep in range(seq_len):
x_at_timestep = X[:, timestep, :]
# TODO: Compute the forward pass for each item in the sequence
outputs.append(h_at_timestep)
# Stack outputs along the time-axis: (batch_size, seq_len, hidden_size)
self.outputs = torch.stack(outputs, dim=1)
return self.outputs
def backward(self, d_outputs):
"""
Compute the backward pass of the recurrent block using backpropagation through time (BPTT)
Args:
d_outputs (torch.Tensor): Gradient with respect to outputs with shape: (batch_size, sequence_len, hidden_size)
Returns:
torch.Tensor: dX after BPTT is performed with shape: (batch_size, sequence_len, embedding_size)
"""
# d_outputs:
batch_size, seq_len, hidden_size = d_outputs.shape
dW = torch.zeros_like(self.W, device=self.device)
dU = torch.zeros_like(self.U, device=self.device)
db = torch.zeros_like(self.b, device=self.device)
dX = torch.zeros((batch_size, seq_len, self.input_dim), device=self.device)
dh_next = torch.zeros((batch_size, hidden_size), device=self.device) # Gradient propagated from future time steps.
for backwards_timestep in list(reversed(range(seq_len))):
# TODO: Compute the backward pass for each item in the sequence
self.dW = dW
self.dU = dU
self.db = db
return dX
def update(self, lr):
"""
Update the parameters of the block using gradient descent.
Args:
lr (float): Learning rate for the parameter update.
Returns:
None
"""
# TODO: Update the parameters with learning rate lr
Task 01-5: Description (0 pts.)¶
Recurrent Neural Network¶
Now that we have the components of an RNN (embedding_layer -> recurrent_block -> output_layer) and the tanh activation function, we can combine them all in a RecurrentNeuralNetwork class similarly to the FeedForwardNetwork in Homework04 (you again may consider pulling heavily from that for identical components)!
Task 01-5: Code (1 pt.)¶
from typing import Any
class RecurrentNeuralNetwork:
def __init__(self, vocab_size, embed_size, hidden_size, padding_idx=0, device='cpu'):
self.vocab_size = vocab_size
self.embed_size = embed_size
self.hidden_size = hidden_size
self.padding_idx = padding_idx
self.device = device
# TODO: Initialize the 3 components of the RNN
# Layers:
embedding_layer = EmbeddingLayer(vocab_size, embed_size, device=device)
# For the recurrent context layer, the input dimension is the embed_size.
recurrent_block = RecurrentBlock(embed_size, hidden_size, device=device)
# Final fully connected layer: project hidden state to vocabulary logits.
output_layer = LinearLayer(hidden_size, vocab_size, device=device)
# Keep model layers in a list for easy backward and update passes.
self.layers = [embedding_layer, recurrent_block, output_layer]
def forward(self, X, eval=False):
"""
Args:
x: (batch_size, seq_len) with integer token indices.
Returns:
logits: (batch_size, seq_len, vocab_size)
outputs: (batch_size, seq_len, hidden_size) final hidden states over time.
"""
# TODO: Calculate the output of the network
return X if not eval else self.softmax(X)
def backward(self, Y_hat, Y):
# TODO: Calculate the gradient of the loss with respect to the input
def softmax(self, X):
"""
Args:
X (torch.Tensor): Logits with shape (batch_size, seq_len, vocab_size).
Returns:
torch.Tensor: Softmax probabilities with the same shape, normalized over the last (vocab) dimension.
"""
# TODO: Store the input and calculate the output of the softmax layer
def cross_entropy(self, logits, Y):
"""
Compute the cross-entropy loss.
Args:
logits (torch.Tensor): Predicted probabilities with shape (batch_size, seq_len, vocab_size).
Y (torch.Tensor): One-hot encoded true labels with shape (batch_size, seq_len, vocab_size).
Returns:
float: The average cross-entropy loss over all m examples.
Notes:
A small constant epsilon is added to Y_hat to avoid computing log(0).
"""
# TODO: Calculate the cross-entropy loss
def get_accuracy(self, logits, Y):
"""
Compute the classification accuracy.
Args:
logits (torch.Tensor): Predicted probabilities from the network, shape (batch_size, seq_len, vocab_size).
Y (torch.Tensor): One-hot encoded true labels, shape (batch_size, seq_len, vocab_size).
Returns:
float: Accuracy as a fraction between 0 and 1.
"""
# TODO: Calculate the accuracy of the network
def test_on_letter(self, dataset):
kidnapping_letter = "v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx cynl ba ybpxre 69 ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr" # ROT13 of "hello"
x = dataset.encode(kidnapping_letter).unsqueeze(0).to(self.device) # (1, T)
preds = self.forward(x, eval=True)
preds = preds.argmax(dim=-1) # (1, T)
# x = dataset.decode(preds).unsqueeze(0).to(self.device) # (1, T)
predicted_deciphered = "".join(dataset.idx2char[int(i)] for i in preds.squeeze(0))
print(f"\tInput (ROT13): {kidnapping_letter}")
print(f"\tPredicted original: {predicted_deciphered}")
return predicted_deciphered
def train(self, train_loader, epochs=100, learning_rate=0.001, verbose=True):
"""
Train the neural network using mini-batch gradient descent.
Args:
train_loader (DataLoader): Yields (X_batch, Y_batch) pairs of padded token indices and one-hot targets.
epochs (int): Number of epochs for training.
learning_rate (float): Learning rate for the parameter updates.
verbose (bool, optional): If True, prints training progress four times over the run. Default is True.
Returns:
dict: A dictionary containing:
- 'loss_history': List of loss values for each epoch.
- 'accuracy_history': List of accuracy values for each epoch.
Process:
- Shuffles the dataset each epoch.
- Processes data in mini-batches.
- Performs a forward pass, backpropagation, and parameter updates for each mini-batch.
- Computes the loss and accuracy for the entire dataset after each epoch.
"""
loss_history = []
accuracy_history = []
for i in range(epochs):
batch_losses = []
batch_accuracies = []
for X_batch, Y_batch in train_loader:
# Forward propagation
# TODO: Calculate the output of the network
# Calculate metrics for the whole epoch
loss = self.cross_entropy(Y_hat_batch, Y_batch)
accuracy = self.get_accuracy(Y_hat_batch, Y_batch)
batch_losses.append(loss.item())
batch_accuracies.append(accuracy.item())
# Backward propagation
# TODO: Calculate the gradients of the loss with respect to the input
# Update parameters
# TODO: Update the weights and biases of the layer using the learning rate
loss_history.append(np.mean(batch_losses))
accuracy_history.append(np.mean(batch_accuracies))
if verbose and i % (epochs // 4) == 0:
print(f"Epoch {i+1}/{epochs}")
print(f"loss: {loss_history[-1]:.5f}")
print(f"accuracy: {accuracy_history[-1]:.5f}")
print('Output test:')
self.test_on_letter(train_loader.dataset)
print("-" * 30)
return {'loss_history': loss_history, 'accuracy_history': accuracy_history}
# TODO: Instantiate and train our RNN
Task 01-6: Reference Output (0 pts.)¶
Approximate Runtime: 7m 31.9s
Epoch 1/100
loss: 9.22425
accuracy: 0.01539
Output test:
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: asua astxjaejabvmyjs<UNK>r<EOS>avbesbamasl<UNK>s<UNK>r<EOS>lbv<UNK>l<UNK>qs<UNK>r<EOS>ljlrsmlrs<UNK>r<EOS>lksjrxo<UNK>srdjaejsrhlljmmyesvajchyahljc<UNK>slal<SOS>mma<UNK>rla<UNK>slrsm<EOS>hhlrjny<UNK>achmfyjfjammjyyajcvsja<UNK>s<UNK>r<EOS>loa qabl<UNK>ajmmj s<UNK>r<EOS>lklabaomsl<EOS>oaesjmmbaomcjsjmm<UNK>r<EOS>lvboauas ujjjsj skf<UNK>xsj<UNK>r<EOS>jdavbsysl<EOS>labmjcacl<UNK>jaanrd
------------------------------
Epoch 26/100
loss: 1.78745
accuracy: 0.80328
Output test:
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: i lo fd sprdering yr thwisoedloyd at thw estate thw <EOS>as das thw psrroct sdrder lhapon ie smre ill <EOS>et aaa<SOS> mith it as dpdl hopywully yofodm iigres out thw corcination og thw padlocu adcde on locucr on thw soaond seoor og itx othwrdiso i ag in real troufle
------------------------------
Epoch 51/100
loss: 1.20777
accuracy: 0.86253
Output test:
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: i lo ld furdering yr theiseedloyd at the estate the <EOS>as <EOS>as the perrgct purder loapon ie sure ill <EOS>et aaay mith it as <EOS>ell hopewully yobody iibres out the coueination og the padlocu adcde on locucr on the seaond seoor og itx othwrkise i ap in real trouele
------------------------------
Epoch 76/100
loss: 0.97005
accuracy: 0.89484
Output test:
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: i lofld furdering er theisenfloyd at the estate the eas <EOS>as the pdregct burder loapon ie sure ill eet aaay mith it as <EOS>ell hopepully nobody iiures out the coueination of the padlocu adcde on locucr on the second sloor of itx oth<UNK>rkise i ap in real trouble
------------------------------
Story Progression¶
I'm not going to show you the exact output for the kidnapping letter as that'd spoil some of the surprise, but absolutely do NOT expect to get perfectly clean English out of this. I could make out most of the words, but it's pretty garbled, and unfortunately because it's not perfect we can't quite get the correct padlock combination from it. Let's see if the built-in RNN from torch does any better!
Task 02: RNN Comparisons¶
Task 02-1: Description (0 pts.)¶
RNN Class Instantiation¶
Let's use the torch built-ins to make an RNN and compare it against our hand-made one to see if it does any better!
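If you haven't used these layers before, here's how the three built-ins you'll combine behave on their own (a standalone shape check with made-up sizes, not the graded CharRNN):
import torch
import torch.nn as nn
B, T, V, E, H = 4, 10, 31, 32, 64                 # made-up batch/sequence/vocab/embed/hidden sizes
x = torch.randint(0, V, (B, T))                   # (B, T) integer token indices
emb = nn.Embedding(V, E)(x)                       # (B, T, E)
out, h_n = nn.RNN(E, H, batch_first=True)(emb)    # out: (B, T, H); h_n: (1, B, H) final hidden state
logits = nn.Linear(H, V)(out)                     # (B, T, V) per-timestep vocabulary logits
print(logits.shape)                               # torch.Size([4, 10, 31])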
Task 02-1: Code (1 pt.)¶
import torch
import torch.nn as nn
device = torch.device('cuda' if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
kidnapping_letter = "v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx cynl ba ybpxre 69 ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr" # ROT13 of "hello"
vocab_size = dataset.vocab_size
embed_size = 32
hidden_size = 64
class CharRNN(nn.Module):
def __init__(self, vocab_size, embed_size, hidden_size):
super().__init__()
# TODO: Instantiate the embedding, RNN, and linear layers
def forward(self, x, h0=None):
# TODO: Implement the forward pass
# TODO: Instantiate the model, optimizer, and loss function
rnn_model.train()
for epoch in range(100):
total_loss = 0.0
correct = 0
count = 0
for xb, yb in train_loader:
rnn_optimizer.zero_grad()
# TODO: Call the forward pass
B, T, V = logits.shape
# If yb is one-hot (B, T, V), convert to indices (B, T)
if yb.dim() == 3:
# Assumes last dim is vocab dimension
yb = yb.argmax(dim=-1) # (B, T)
# flatten for CrossEntropyLoss
logits_flat = logits.reshape(B * T, V) # (B*T, vocab)
yb_flat = yb.reshape(B * T) # (B*T,)
# TODO: Calculate the loss and backpropagate
total_loss += loss.item()
preds = logits_flat.argmax(dim=-1)
correct += (preds == yb_flat).sum().item()
count += yb_flat.size(0)
avg_loss = total_loss / len(train_loader)
accuracy = correct / count
if epoch % 25 == 0:
print(f"Epoch {epoch+1} | loss: {avg_loss:.5f} | accuracy: {accuracy:.5f}")
rnn_model.eval()
x = dataset.encode(kidnapping_letter).unsqueeze(0).to(device) # (1, T)
preds, _ = rnn_model.forward(x)
preds = preds.argmax(dim=-1) # (1, T)
predicted_deciphered = "".join(dataset.idx2char[int(i)] for i in preds.squeeze(0))
print(f"\tInput (ROT13): {kidnapping_letter}")
print(f"\tPredicted original: {predicted_deciphered}")
rnn_model.train()
Task 02-1: Reference Output (0 pts.)¶
Approximate Runtime: 7m 34.5s
Epoch 1 | loss: 0.92368 | accuracy: 0.86896
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: i loved murdering mr theisenfloyd at the estate the gas oas the perfect murder oeapon im sure ill get aoay oith it as oell hopefully nobody figures out the combination of the padloce abcde on loceer on the second floor of fity otheroise i am in real trouble
Epoch 26 | loss: 0.00013 | accuracy: 1.00000
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: i loved murdering mr theisenfloyd at the estate the gas was the perfect murder weapon im sure ill get away with it as well hopefully nobody figures out the combination of the padlock abcde on locker on the second floor of fitz otherwise i am in real trouble
Epoch 51 | loss: 0.00001 | accuracy: 1.00000
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: i loved murdering mr theisenfloyd at the estate the gas was the perfect murder weapon im sure ill get away with it as well hopefully nobody figures out the combination of the padlock abcde on locker on the second floor of fitz otherwise i am in real trouble
Epoch 76 | loss: 0.00000 | accuracy: 1.00000
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: i loved murdering mr theisenfloyd at the estate the gas was the perfect murder weapon im sure ill get away with it as well hopefully nobody figures out the combination of the padlock abcde on locker on the second floor of fitz otherwise i am in real trouble
Task 02-2: RNN Short Answer Questions (2 pts.)¶
Why do we need to do "Backprop through time" for RNNs?
- [ANSWER]
Why do we have two weight matrices in the recurrent block?
- [ANSWER]
Task 03: Encoder-Only Transformer¶
Task 03-1: Description (0 pts.)¶
Self-Attention Block¶
Transformers are really a direct improvement on RNNs as a sequence model. Our hand-made RNN struggled with the task, so let's see if a transformer can do any better! This is pretty tricky, so we'll only write a little bit of it.
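At the heart of the block below is scaled dot-product attention, softmax(Q K^T / sqrt(d)) V. Here's a tiny functional sketch of that computation with made-up shapes (single head, no masking, not the graded class) so the tensor shapes are clear before you wire it into SelfAttentionBlock:
import math
import torch
import torch.nn.functional as F
B, T, C = 2, 6, 16                                 # batch, sequence length, model dim (single head)
X = torch.randn(B, T, C)
Wq, Wk, Wv = (torch.randn(C, C) * 0.1 for _ in range(3))
Q, K, V = X @ Wq.T, X @ Wk.T, X @ Wv.T             # each (B, T, C)
scores = Q @ K.transpose(1, 2) / math.sqrt(C)      # (B, T, T): similarity of every query to every key
attn = F.softmax(scores, dim=-1)                   # rows sum to 1: how much each position attends to each other
out = attn @ V                                     # (B, T, C): weighted mix of value vectors
print(out.shape)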
Task 03-1: Code (1 pt.)¶
import math
import torch
import torch.nn.functional as F
from collections import namedtuple
class SelfAttentionBlock:
Grad_Info = namedtuple('Grad_Info', [
'x', # (B, T, C) input
'q', 'k', 'v', # (B, T, Dh)
'scores', # (B, T, T)
'attn', # (B, T, T)
'attn_v', # (B, T, Dh) = attn @ v
'out_before_proj' # (B, T, Dh) = attn_v (alias for clarity)
])
def __init__(self, model_dim, head_dim=None, device='cpu'):
"""
Single-head self-attention for clarity.
model_dim: C (embedding size)
head_dim: Dh (defaults to model_dim)
"""
self.C = model_dim
self.Dh = head_dim if head_dim is not None else model_dim
assert self.Dh == self.C, "For this simple single-head block, set head_dim == model_dim."
self.device = device
# Parameters (weights are (out_dim, in_dim) to match @ x.T usage)
scale_in = math.sqrt(2.0 / self.C)
self.Wq = torch.randn(self.Dh, self.C, device=self.device) * scale_in
self.Wk = torch.randn(self.Dh, self.C, device=self.device) * scale_in
self.Wv = torch.randn(self.Dh, self.C, device=self.device) * scale_in
self.bq = torch.zeros(self.Dh, device=self.device)
self.bk = torch.zeros(self.Dh, device=self.device)
self.bv = torch.zeros(self.Dh, device=self.device)
self.Wo = torch.randn(self.C, self.Dh, device=self.device) * math.sqrt(2.0 / self.Dh)
self.bo = torch.zeros(self.C, device=self.device)
self.cache = None # holds Grad_Info from the last forward
def forward(self, X, key_pad_mask=None, causal=False):
"""
X: (B, T, C)
key_pad_mask: optional (B, T) bool; True where PAD token — disallow attending to those keys.
causal: if True, apply lower-triangular mask (not needed for ROT13, included for pedagogy).
Returns: Y = X + proj(Attn(X)) (B, T, C)
"""
B, T, C = X.shape
# TODO: Calculate Q, K, and V
# TODO: Calculate the scaled dot-product scores
# Masks (optional)
if key_pad_mask is not None:
# Disallow attending to PAD keys (set scores to -inf for those columns)
# key_pad_mask: True where PAD
mask_k = key_pad_mask.unsqueeze(1).expand(B, T, T) # broadcast over query length
scores = scores.masked_fill(mask_k, float('-inf'))
if causal:
tril = torch.tril(torch.ones(T, T, device=X.device, dtype=torch.bool))
scores = scores.masked_fill(~tril, float('-inf'))
# TODO: Calculate the softmax over keys
# TODO: Calculate the attention vectors
# TODO: Calculate the output projection, then add the residual
# Cache for backward
self.cache = SelfAttentionBlock.Grad_Info(
x=X, q=Q, k=K, v=V, scores=scores, attn=attn,
attn_v=attn_v, out_before_proj=out_before_proj
)
return Y
def backward(self, dY, key_pad_mask=None, causal=False):
"""
dY: gradient wrt output Y, shape (B, T, C)
Returns: dX (B, T, C)
Computes and accumulates parameter grads in self.dW*, self.db*.
"""
B, T, C = dY.shape
cache = self.cache
# Y = X + y_ctx
dy_ctx = dY.clone() # (B, T, C) branch gradient through context path
dX = dY.clone() # residual path contributes identity
# y_ctx = attn_v @ Wo^T + bo
# grads for Wo, bo, and attn_v
# dWo = sum_over_batch,time ( dy_ctx[b,t,:]^T @ attn_v[b,t,:] )
self.dWo = dy_ctx.reshape(-1, C).T @ cache.attn_v.reshape(-1, self.Dh)
self.dbo = dy_ctx.sum(dim=(0, 1))
dattn_v = dy_ctx @ self.Wo # (B, T, Dh)
# attn_v = attn @ v
dv = cache.attn.transpose(1, 2) @ dattn_v # (B, T, Dh)
dattn = dattn_v @ cache.v.transpose(1, 2) # (B, T, T)
# Softmax backward:
# Given A = softmax(S), dL/dS = (dL/dA - sum(dL/dA * A, axis=-1, keepdim=True)) * A
tmp = (dattn * cache.attn).sum(dim=-1, keepdim=True) # (B, T, 1)
dscores = (dattn - tmp) * cache.attn # (B, T, T)
# Respect masks in backward (optional, mirrors forward)
if key_pad_mask is not None:
mask_k = key_pad_mask.unsqueeze(1).expand(B, T, T) # True where PAD (disallowed)
dscores = dscores.masked_fill(mask_k, 0.0)
if causal:
tril = torch.tril(torch.ones(T, T, device=dY.device, dtype=torch.bool))
dscores = dscores.masked_fill(~tril, 0.0)
# scores = (q @ k^T) / sqrt(Dh)
factor = 1.0 / math.sqrt(self.Dh)
dqk = dscores * factor # (B, T, T)
# grads wrt q and k via matmul rules
dq = dqk @ cache.k # (B, T, Dh)
dk = dqk.transpose(1, 2) @ cache.q # (B, T, Dh)
# q = X Wq^T + bq ; k = X Wk^T + bk ; v = X Wv^T + bv
# Accumulate parameter grads
# dW = sum_over_batch,time ( dproj[b,t,:]^T @ X[b,t,:] )
self.dWq = dq.reshape(-1, self.Dh).T @ cache.x.reshape(-1, self.C)
self.dbq = dq.sum(dim=(0, 1))
self.dWk = dk.reshape(-1, self.Dh).T @ cache.x.reshape(-1, self.C)
self.dbk = dk.sum(dim=(0, 1))
self.dWv = dv.reshape(-1, self.Dh).T @ cache.x.reshape(-1, self.C)
self.dbv = dv.sum(dim=(0, 1))
# dX accumulates contributions from Q/K/V branches
dX += dq @ self.Wq
dX += dk @ self.Wk
dX += dv @ self.Wv
return dX
def update(self, lr):
# SGD update (swap for Adam if you want)
self.Wq -= lr * self.dWq; self.bq -= lr * self.dbq
self.Wk -= lr * self.dWk; self.bk -= lr * self.dbk
self.Wv -= lr * self.dWv; self.bv -= lr * self.dbv
self.Wo -= lr * self.dWo; self.bo -= lr * self.dbo
class ReLU:
"""
Element-wise rectified linear activation.
"""
def forward(self, X):
"""
Apply ReLU activation and cache the input tensor.
Args:
X (torch.Tensor): Input tensor of any shape.
Returns:
torch.Tensor: Tensor with negatives zeroed out, same shape as `X`.
"""
# TODO: Store the input and calculate and return the output of the ReLU layer
def backward(self, dA):
"""
Propagate gradients through the ReLU non-linearity.
Args:
dA (torch.Tensor): Upstream gradient matching the shape of the forward output.
Returns:
torch.Tensor: Gradient with respect to the input, zeroed where the cached input was non-positive.
"""
# TODO: Calculate and return the gradient of the loss with respect to the input
def update(self, lr):
"""
Keep API parity with trainable layers; ReLU has no parameters to update.
Args:
lr (float): Unused learning rate argument.
Returns:
None
"""
# TODO: Update the weights and biases of the layer using the learning rate
Task 03-3: Description (0 pts.)¶
Encoder-only Transformer Model¶
Similar to our FFN and RNN, let's make an EncoderTransformer class to house all of the components for our transformer model. We'll make a model with two attention blocks. For a sense of scale, GPT-3 is built from the same kind of attention blocks, just 96 of them stacked (in a decoder-only configuration) and trained on a huge chunk of the internet!
Task 03-3: Code (1 pt.)¶
device = torch.device('cuda' if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
class EncoderTransformer:
def __init__(self, vocab_size: int, padding_idx: int, ctx_len: int = 256, d_model: int = 128):
self.padding_idx = padding_idx
self.ctx_len = ctx_len
self.device = torch.device('cuda' if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
# TODO: Instantiate the components of the transformer
def forward(self, X, eval=False):
"""
Args:
x: (batch_size, seq_len) with integer token indices.
Returns:
logits: (batch_size, seq_len, vocab_size)
outputs: (batch_size, seq_len, hidden_size) final hidden states over time.
"""
# TODO: Calculate the output of the network
return X if not eval else self.softmax(X)
def backward(self, Y_hat, Y):
# TODO: Calculate the gradient of the loss with respect to the input
def softmax(self, X):
"""
Apply softmax to the input tensor.
Args:
X (torch.Tensor): Input tensor of any shape.
Returns:
torch.Tensor: Tensor with softmax applied, same shape as `X`.
"""
return F.softmax(X, dim=-1)
def cross_entropy(self, Y_hat, Y):
"""
Compute the cross-entropy loss.
Args:
Y_hat (torch.Tensor): Predicted probabilities with shape (batch_size, seq_len, vocab_size).
Y (torch.Tensor): One-hot encoded true labels with shape (batch_size, seq_len, vocab_size).
Returns:
float: The average cross-entropy loss over all m examples.
Notes:
A small constant epsilon is added to Y_hat to avoid computing log(0).
"""
# TODO: Calculate the cross-entropy loss
def get_accuracy(self, Y_hat, Y):
"""
Compute the classification accuracy.
Args:
Y_hat (torch.Tensor): Predicted probabilities from the network, shape (batch_size, seq_len, vocab_size).
Y (torch.Tensor): One-hot encoded true labels, shape (batch_size, seq_len, vocab_size).
Returns:
float: Accuracy as a fraction between 0 and 1.
"""
# TODO: Calculate the accuracy of the network
def test_on_letter(self, dataset):
kidnapping_letter = "v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx cynl ba ybpxre 69 ba gur frpbaq sybbe bs phfuvat bgurejvfr v nz va erny gebhoyr"
x = dataset.encode(kidnapping_letter).unsqueeze(0).to(self.device) # (1, T)
preds = self.forward(x)
preds = preds.argmax(dim=-1) # (1, T)
predicted_deciphered = "".join(dataset.idx2char[int(i)] for i in preds.squeeze(0))
print(f"\tInput (ROT13): {kidnapping_letter}")
print(f"\tPredicted original: {predicted_deciphered}")
return predicted_deciphered
def train(self, train_loader, epochs=100, learning_rate=0.001, verbose=True):
"""
Train the neural network using mini-batch gradient descent.
Args:
train_loader (DataLoader): Yields (X_batch, Y_batch) pairs of padded token indices and one-hot targets.
epochs (int): Number of epochs for training.
learning_rate (float): Learning rate for the parameter updates.
verbose (bool, optional): If True, prints training progress four times over the run. Default is True.
Returns:
dict: A dictionary containing:
- 'loss_history': List of loss values for each epoch.
- 'accuracy_history': List of accuracy values for each epoch.
Process:
- Shuffles the dataset each epoch.
- Processes data in mini-batches.
- Performs a forward pass, backpropagation, and parameter updates for each mini-batch.
- Computes the loss and accuracy for the entire dataset after each epoch.
"""
loss_history = []
accuracy_history = []
for i in range(epochs):
batch_losses = []
batch_accuracies = []
for X_batch, Y_batch in train_loader:
# Forward propagation
# TODO: Calculate the output of the network
# Calculate metrics for the whole epoch
loss = self.cross_entropy(Y_hat_batch, Y_batch)
accuracy = self.get_accuracy(Y_hat_batch, Y_batch)
batch_losses.append(loss.item())
batch_accuracies.append(accuracy.item())
# Backward propagation
# TODO: Calculate the gradients of the loss with respect to the input
# Update parameters
# TODO: Update the weights and biases of the layer using the learning rate
loss_history.append(np.mean(batch_losses))
accuracy_history.append(np.mean(batch_accuracies))
if verbose and i % (epochs // 4) == 0:
print(f"Epoch {i+1}/{epochs}")
print(f"loss: {loss_history[-1]:.5f}")
print(f"accuracy: {accuracy_history[-1]:.5f}")
print('Output test:')
self.test_on_letter(train_loader.dataset)
print("-" * 30)
return {'loss_history': loss_history, 'accuracy_history': accuracy_history}
vocab_size = dataset.vocab_size
padding_idx = dataset.padding_idx
# TODO: Instantiate the model, train it, and test it
Task 03-4: Reference Output (0 pts.)¶
Approximate Runtime: 6m 9.2s
Epoch 1/100
loss: 12.05548
accuracy: 0.04005
Output test:
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: b<UNK>zi<PAD>ua<UNK>a<UNK>zauzbel<UNK>az<UNK>beuboue zioa<UNK>hb<UNK>beu<UNK>uobhbu<UNK>beu<UNK>lho<UNK><UNK>ho<UNK>beu<UNK>suz u<UNK>b<UNK>a<UNK>zauz<UNK><UNK>uhsie<UNK>ba<UNK>o<UNK>zu<UNK>bzz<UNK>lub<UNK>h<UNK>ho<UNK><UNK>bbe<UNK>bb<UNK>ho<UNK><UNK>uzz<UNK>eisu <UNK>zzo<UNK>eiziao<UNK> bl<UNK>zuo<UNK>i<UNK>b<UNK>beu<UNK><UNK>iazbehbbie<UNK>i <UNK>beu<UNK>shazi<UNK>g<UNK>hz<UNK>au<UNK>ie<UNK>zi<UNK>guz<UNK>ie<UNK>beu<UNK>ou<UNK>iea<UNK> ziiz<UNK>i <UNK> bb<SOS><UNK>ibeuz<UNK>bou<UNK>b<UNK>ha<UNK>be<UNK>zuhz<UNK>bzi<UNK>zzu
------------------------------
Epoch 26/100
loss: 0.57496
accuracy: 0.94361
Output test:
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: i lo<PAD>ed murderinz mr theisenaloyd at the estate the zas xas the peraect murder xeapon im sure ill zet axay xith it as xell hopeaully nobody aizures out the combination oa the padlocg abcde on locger on the second aloor oa ait<SOS> otherxise i am in real trouble
------------------------------
Epoch 51/100
loss: 0.20144
accuracy: 0.96861
Output test:
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: i loved murdering mr theisenyloyd at the estate the gas xas the peryect murder xeapon im sure ill get axay xith it as xell hopeyully nobody yigures out the combination oy the padlocg abcde on locger on the second yloor oy yit<SOS> otherxise i am in real trouble
------------------------------
Epoch 76/100
loss: 0.10249
accuracy: 0.98987
Output test:
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: i loved murdering mr theisenfloyd at the estate the gas was the perfect murder weapon im sure ill get away with it as well hopefully nobody figures out the combination of the padlock abcde on locker on the second floor of fit<SOS> otherwise i am in real trouble
------------------------------
Input (ROT13): v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx nopqr ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Predicted original: i loved murdering mr theisenfloyd at the estate the gas was the perfect murder weapon im sure ill get away with it as well hopefully nobody figures out the combination of the padlock abcde on locker on the second floor of fit<SOS> otherwise i am in real trouble
Task 04: Transformer Comparisons¶
Similar to what we've been doing, let's see how our model stacks up against the actual PyTorch implementation of a transformer!
Task 04-1: Description (0 pts.)¶
Positional Embedding Class¶
The first thing we'll need to do is create a special "positional-encoding" class so we can do all of that crazy sine wave stuff to modify our input vectors to reflect their location in the input text.
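Concretely, the "sine wave stuff" is the fixed sinusoidal encoding from the original "Attention Is All You Need" paper, which the PositionalEncoding class below implements:
PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
Each position gets its own pattern of sines and cosines at different frequencies, and that vector is simply added to the token embedding so the attention layers can tell positions apart.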
Task 04-1: Code (0 pts.)¶
import math
import torch
import torch.nn as nn
# --- PositionalEncoding: standard sinusoidal positional encoding ---
class PositionalEncoding(nn.Module):
def __init__(self, d_model, max_len):
super().__init__()
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float()
* (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
self.register_buffer('pe', pe.unsqueeze(0)) # (1, max_len, d_model)
def forward(self, x): # x: (B, T, E)
return x + self.pe[:, :x.size(1)]
class Rot13TransformerEncoder(nn.Module):
"""
Encoder-only: forward(src) -> logits over vocab for each token (B, T, V).
Matches RNN behavior: aligned per-timestep prediction.
"""
def __init__(self, vocab_size, d_model, nhead, num_encoder_layers,
dim_feedforward, dropout, max_len, padding_idx):
super().__init__()
self.d_model = d_model
self.padding_idx = padding_idx
self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=padding_idx)
self.pos_encoder = PositionalEncoding(d_model, max_len)
enc_layer = nn.TransformerEncoderLayer(
d_model=d_model, nhead=nhead,
dim_feedforward=dim_feedforward,
dropout=dropout, batch_first=True # <- no transposes
)
self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_encoder_layers)
self.fc_out = nn.Linear(d_model, vocab_size)
def forward(self, src): # src: (B, T) indices
key_padding_mask = (src == self.padding_idx) # (B, T) bool
kpm = key_padding_mask if key_padding_mask.any() else None
# TODO: Implement the forward pass
return logits
device = torch.device('cuda' if torch.cuda.is_available()
else "mps" if torch.backends.mps.is_available() else "cpu")
max_len = 512
model = Rot13TransformerEncoder(
vocab_size=dataset.vocab_size,
d_model=128,
nhead=4,
num_encoder_layers=2,
dim_feedforward=256,
dropout=0.1,
max_len=max_len,
padding_idx=dataset.padding_idx
).to(device)
# TODO: Instantiate the optimizer and loss function
def train(model, dataloader, optimizer, criterion, num_epochs):
model.train()
for epoch in range(num_epochs):
total = 0.0
for src, tgt_output in dataloader:
if tgt_output.dim() == 3:
tgt_output = tgt_output.argmax(dim=-1) # (B,T)
optimizer.zero_grad()
# TODO: Call the forward pass
B, T, V = logits.shape
loss = criterion(logits.reshape(B*T, V), tgt_output.reshape(B*T))
# TODO: Backpropagate and Gradient Descent
total += loss.item()
print(f"Epoch {epoch+1}, Loss: {total/len(dataloader):.4f}")
train(model, train_loader, optimizer, criterion, num_epochs=25)
kidnapping_letter = "v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx cynl ba ybpxre 69 ba gur frpbaq sybbe bs phfuvat bgurejvfr v nz va erny gebhoyr"
model.eval()
with torch.no_grad():
# Encode string to indices (1, T)
src = dataset.encode(kidnapping_letter).unsqueeze(0).to(device)
# Run through model
logits = model(src) # (1, T, V)
preds = logits.argmax(dim=-1) # (1, T)
# Decode predicted indices back to string
output_str = dataset.decode(preds.squeeze(0))
print(f"Input: {kidnapping_letter}")
print(f"Output: {output_str}")
Task 04-3: Reference Output (0 pts.)¶
Approximate Runtime: 3m 2.3s
Epoch 1, Loss: 0.1295
Epoch 2, Loss: 0.0029
Epoch 3, Loss: 0.0013
Epoch 4, Loss: 0.0008
Epoch 5, Loss: 0.0005
Epoch 6, Loss: 0.0004
Epoch 7, Loss: 0.0003
Epoch 8, Loss: 0.0002
Epoch 9, Loss: 0.0002
Epoch 10, Loss: 0.0001
Epoch 11, Loss: 0.0001
Epoch 12, Loss: 0.0001
Epoch 13, Loss: 0.0001
Epoch 14, Loss: 0.0001
Epoch 15, Loss: 0.0001
Epoch 16, Loss: 0.0000
Epoch 17, Loss: 0.0000
Epoch 18, Loss: 0.0000
Epoch 19, Loss: 0.0000
Epoch 20, Loss: 0.0000
Epoch 21, Loss: 0.0000
Epoch 22, Loss: 0.0000
Epoch 23, Loss: 0.0000
Epoch 24, Loss: 0.0000
Epoch 25, Loss: 0.0000
Input: v ybirq zheqrevat ze gurvfrasyblq ng gur rfgngr gur tnf jnf gur cresrpg zheqre jrncba vz fher vyy trg njnl jvgu vg nf jryy ubcrshyyl abobql svtherf bhg gur pbzovangvba bs gur cnqybpx 69 ba ybpxre ba gur frpbaq sybbe bs svgm bgurejvfr v nz va erny gebhoyr
Output: i loved murdering mr theisenfloyd at the estate the gas was the perfect murder weapon im sure ill get away with it as well hopefully nobody figures out the combination of the padlock ee on locker on the second floor of fitz otherwise i am in real trouble
Task 04-4: Transformer Short Answer Questions (2 pts.)¶
What are some benefits of transformers compared to RNNs?
- [ANSWER]
What do Q, K, and V stand for and what do they represent?
- [ANSWER]
Task 05: Final Evidence Collection¶
Task 05-1: Selfie with Evidence (1 pt.)¶
Actually photographing the evidence¶
Now that you have the code for the locker, hopefully you have enough to put this villain behind bars! Head over to the locker and open it, take a picture of the evidence and yourselves, and submit it along with your code; the TAs will handle the police report! Remember, this is an active investigation and tampering with the evidence will result in a felony charge!
NOTE: You can just submit your photo as a file on GitHub alongside your notebook. All group members need to be in the photo for credit.