#version 450

// AMD FidelityFX Super Resolution 1.0.2 - RCAS (Robust Contrast Adaptive Sharpening)

#include "compiled.inc"

// Input image to sharpen. main() samples it at the current fragment and at
// its four direct neighbours (one texel up, down, left and right), using the
// level-0 size from textureSize() to compute the texel step.
uniform sampler2D tex;

// Sharpness in "stops": 0.0 = maximum sharpness, higher = less sharp.
// main() converts it to a linear lobe multiplier via exp2(-SHARPNESS_STOPS):
//   0.0 stops -> 1.0, 0.5 stops -> ~0.707, 1.0 stop -> 0.5, 2.0 stops -> 0.25.
// Exactly one preset is selected at compile time by the _FSR1_* macros
// (presumably defined by "compiled.inc" or the build - confirm).
#ifdef _FSR1_Ultra_Quality
const float SHARPNESS_STOPS = 0.0;
#elif defined(_FSR1_Balanced)
const float SHARPNESS_STOPS = 1.0;
#elif defined(_FSR1_Performance)
const float SHARPNESS_STOPS = 2.0;
#elif defined(_FSR1_Custom)
// Custom preset: the stop count is supplied at run time through PPComp15.x
// and scaled by 2.0. NOTE(review): presumably PPComp15.x is in [0,1], giving
// 0..2 stops - verify against the host code that fills this uniform.
uniform vec4 PPComp15;
#define SHARPNESS_STOPS (PPComp15.x * 2.0)
#else
const float SHARPNESS_STOPS = 0.5; // Quality (default when no preset macro is defined)
#endif

// FSR RCAS limit - prevents unnatural sharpening artifacts.
// Evaluates to 0.1875. main() uses -FSR_RCAS_LIMIT as the lower bound of the
// (negative) sharpening lobe before the sharpness scale is applied, so at
// that bound 4.0 * lobe + 1.0 equals 0.25.
#define FSR_RCAS_LIMIT (0.25 - (1.0 / 16.0))

// Texture coordinate of the current fragment, used to sample `tex`
// (presumably normalized [0,1] coordinates from the vertex stage - confirm).
in vec2 texCoord;
// Output colour: sharpened RGB clamped to [0,1]; alpha is copied unchanged
// from the centre sample.
out vec4 fragColor;

// AMD helper functions from ffx_a.h
// Smallest of three scalars.
float AMin3F1(float x, float y, float z) {
    float yz = min(y, z);
    return min(x, yz);
}
// Largest of three scalars.
float AMax3F1(float x, float y, float z) {
    float yz = max(y, z);
    return max(x, yz);
}

// High precision reciprocal (required for limiters per AMD docs).
//
// Returns 1.0 / a, with the magnitude of the denominator floored at 1e-8 so
// that a zero denominator (e.g. a fully black neighbourhood) yields a large
// finite value instead of Inf/NaN.
//
// The sign of `a` is preserved. This matters because main() calls this with
// the upper-limiter denominator `4.0 * mn4 + peakC.y`, which lies in [-4, 0]
// for inputs clamped to [0,1]. Clamping with a plain max(a, 1e-8) would turn
// every such denominator into +1e-8, make hitMax hugely positive and force
// the sharpening lobe to 0 (i.e. no sharpening at all).
// A zero input (either sign) is treated as +1e-8; at both call sites the
// numerator is 0 whenever the denominator is 0, so the choice of sign there
// does not affect the result.
float ARcpF1(float a) {
    float safeMag = max(abs(a), 1e-8);
    return 1.0 / (a < 0.0 ? -safeMag : safeMag);
}

// Medium precision reciprocal approximation (from AMD ffx_a.h).
// Approximates 1/a without a division by subtracting the bit pattern of `a`
// from a magic constant and reinterpreting the result as a float.
// Not called by main() in this file: the noise-detection and resolve steps
// there use exact division instead.
float APrxMedRcpF1(float a) {
    uint inputBits = floatBitsToUint(a);
    uint approxBits = 0x7ef19fffu - inputBits;
    return uintBitsToFloat(approxBits);
}

// RCAS pass: sharpens the centre texel using its four direct neighbours
// (a "plus"-shaped kernel), limits the sharpening so the result cannot leave
// the [0,1] range, attenuates it where the neighbourhood looks noisy, and
// writes the clamped result with the centre alpha untouched.
void main() {
    // One-texel offsets in texture-coordinate units (mip level 0).
    vec2 px = 1.0 / vec2(textureSize(tex, 0));
    vec2 offN = vec2(0.0, -px.y);
    vec2 offW = vec2(-px.x, 0.0);
    vec2 offE = vec2(px.x, 0.0);
    vec2 offS = vec2(0.0, px.y);

    // Neighbourhood taps (AMD reference names in parentheses):
    //        N (b)
    //  W (d) C (e) E (f)
    //        S (h)
    // Colours are clamped to [0,1] - FSR expects sRGB normalized input.
    vec3 tapN = clamp(texture(tex, texCoord + offN).rgb, 0.0, 1.0);
    vec3 tapW = clamp(texture(tex, texCoord + offW).rgb, 0.0, 1.0);
    vec4 centre = texture(tex, texCoord);
    vec3 tapC = clamp(centre.rgb, 0.0, 1.0);
    vec3 tapE = clamp(texture(tex, texCoord + offE).rgb, 0.0, 1.0);
    vec3 tapS = clamp(texture(tex, texCoord + offS).rgb, 0.0, 1.0);

    // Twice the approximate luma of each tap: 0.5*B + (0.5*R + G).
    float lumN = tapN.b * 0.5 + (tapN.r * 0.5 + tapN.g);
    float lumW = tapW.b * 0.5 + (tapW.r * 0.5 + tapW.g);
    float lumC = tapC.b * 0.5 + (tapC.r * 0.5 + tapC.g);
    float lumE = tapE.b * 0.5 + (tapE.r * 0.5 + tapE.g);
    float lumS = tapS.b * 0.5 + (tapS.r * 0.5 + tapS.g);

    // Noise estimate: how far the centre luma deviates from the average of
    // its neighbours, relative to the local luma range. The range is floored
    // at 1e-5 so perfectly flat areas divide safely (and yield 0 noise).
    float lumDelta = 0.25 * lumN + 0.25 * lumW + 0.25 * lumE + 0.25 * lumS - lumC;
    float lumHi = AMax3F1(AMax3F1(lumN, lumW, lumC), lumE, lumS);
    float lumLo = AMin3F1(AMin3F1(lumN, lumW, lumC), lumE, lumS);
    float lumSpan = lumHi - lumLo;
    float noise = clamp(abs(lumDelta) / max(lumSpan, 1e-5), 0.0, 1.0);
    // Maps noise 0..1 to an attenuation factor 1.0..0.5.
    float denoise = -0.5 * noise + 1.0;

    // Per-channel min and max over the four ring taps (centre excluded).
    vec3 ringLo = min(min(tapN, min(tapW, tapE)), tapS);
    vec3 ringHi = max(max(tapN, max(tapW, tapE)), tapS);

    // Peak-range constants: the output ceiling (1.0) and the -4.0 bias of
    // the upper-limiter denominator.
    const float PEAK_TOP = 1.0;
    const float PEAK_BIAS = -4.0;

    // Limiters - these need HIGH PRECISION reciprocals (per AMD docs).
    // loDen is >= 0 and hiDen is <= 0 for inputs in [0,1].
    // NOTE(review): ARcpF1 must return a negative reciprocal for hiDen,
    // otherwise hitHi turns positive and the lobe below is forced to 0.
    vec3 loDen = 4.0 * ringHi;
    vec3 hiDen = 4.0 * ringLo + PEAK_BIAS;
    vec3 hitLo = min(ringLo, tapC)
               * vec3(ARcpF1(loDen.r), ARcpF1(loDen.g), ARcpF1(loDen.b));
    vec3 hitHi = (PEAK_TOP - max(ringHi, tapC))
               * vec3(ARcpF1(hiDen.r), ARcpF1(hiDen.g), ARcpF1(hiDen.b));
    vec3 lobeRGB = max(-hitLo, hitHi);

    // Take the least aggressive channel, keep it within [-FSR_RCAS_LIMIT, 0],
    // then scale by the user sharpness (stops converted to linear).
    float linearSharpness = exp2(-SHARPNESS_STOPS);
    float lobePeak = AMax3F1(lobeRGB.r, lobeRGB.g, lobeRGB.b);
    float lobe = max(-FSR_RCAS_LIMIT, min(lobePeak, 0.0)) * linearSharpness;

    // Back off sharpening where the neighbourhood looks noisy.
    lobe *= denoise;

    // Resolve: weighted sum of the five taps, normalised by the total weight.
    // The weight is expected to stay within [0.25, 1.0]; the floor guards
    // the division against any edge case.
    float weightSum = 4.0 * lobe + 1.0;
    float invWeight = 1.0 / max(weightSum, 0.25);
    vec3 sharpened = (lobe * tapN + lobe * tapW + lobe * tapS + lobe * tapE + tapC) * invWeight;

    // Clamp to the valid range and pass the centre alpha through unchanged.
    fragColor = vec4(clamp(sharpened, 0.0, 1.0), centre.a);
}