#include <iostream>
#include <vector>
#include <cmath>
#include <ctime>
#include <cstdio>
typedef long long ll;
using namespace std;
//#define RECU 1
static ll p[200]= {0}; // add
ll coprime(ll l, ll h, const vector<int>& pf, size_t i, ll prd,int sign)
{
     #ifdef RECU
     if ( i == pf.size() ) {
         return sign * (h/prd - l/prd);
     } else {
         return coprime(l, h, pf, i+1, prd, sign) + coprime(l, h, pf, i+1, prd*pf[i], -sign);
     }
    #else
    ll sum=0;

    for(int i=0; i<(1<<pf.size()); i++)
        sum+= (h/p[i] - l/p[i]);
    return sum;
    #endif
}
void calc_p(const vector<int>& pf) //add
{
    p[0]=1;
    for(int i=0; i<pf.size(); i++)
        for(int j=(1<<i); j<(1<<(i+1)); j++)
            p[j]=-p[j-(1<<i)]*pf[pf.size()-i-1];
}
int main()
{
    int t=clock();
    int N = 5000000;
    ll N2 = 1LL*N*N;
    vector<vector<int> > primeFactor(N);
    for ( int i = 2; i < N; i++ )
    {
        if ( primeFactor[i].empty() )
        {
            for ( int j = i; j < N; j+=i )
            {
                primeFactor[j].push_back(i);
            }
        }
    }
    //

    ll ans = N2/3; // equilateral triangle
    for ( ll c = 2; c < N; c++ )
    {
        ll a0 = 0.5*(sqrt(5.0)-1)*c; // limit of triangle inequality a*a + a*c > c*c
        ll a = c-1; 
        #ifndef RECU
        calc_p(primeFactor[c]); //add
        #endif 
       while ( a > a0 )
        {
            ll p = a*a + a*c + c*c;
            ll f = N2/(N2/p+1);
            ll an = 0.5*(sqrt(4.0*f-3.0*c*c)-c); // solve f = a*a + a*c + c*c for a
            an = max(an, a0);

            ans += coprime(an, a, primeFactor[c], 0, 1,1) * (N2/p);
            a = an;
        }
    }
    cout << ans << endl;
    cout << clock()-t;
    return 0;
}

修改前70s,修改后60s 如果把calc_p函数的内容移到coprime函数中,增加了冗余运算,比目前的版本慢,仍然比递归版本快.

把a,c,a0,an,coprime等换成int,快了一倍,真是意外的结果。

#include <iostream>
#include <vector>
#include <cmath>
#include <ctime>
#include <cstdio>
typedef long long ll;
using namespace std;

static int p[200]= {0}; // add
//#define RECU 1

int coprime(int l, int h, const vector<int>& pf, size_t i, ll prd,int sign)
{
     #ifdef RECU
     if ( i == pf.size() ) {
         return sign * (h/prd - l/prd);
     } else {
         return coprime(l, h, pf, i+1, prd, sign) + coprime(l, h, pf, i+1, prd*pf[i], -sign);
     }
    #else
    ll sum=0;

    for(int i=0; i<(1<<pf.size()); i++)
        sum+= (h/p[i] - l/p[i]);
    return sum;
    #endif
}
void calc_p(const vector<int>& pf) //add
{
    p[0]=1;
    for(int i=0; i<pf.size(); i++)
        for(int j=(1<<i); j<(1<<(i+1)); j++)
            p[j]=-p[j-(1<<i)]*pf[pf.size()-i-1];
}
int main()
{
    int t=clock();
    int N = 5e6;
    ll N2 = 1LL*N*N;
    vector<vector<int> > primeFactor(N);
    for ( int i = 2; i < N; i++ )
    {
        if ( primeFactor[i].empty() )
        {
            for ( int j = i; j < N; j+=i )
            {
                primeFactor[j].push_back(i);
            }
        }
    }
    //

    ll ans = N2/3; // equilateral triangle
    for ( int c = 2; c < N; c++ )
    {
        int a0 = 0.5*(sqrt(5.0)-1)*c; // limit of triangle inequality a*a + a*c > c*c
        int a = c-1;
        while ( a > a0 )
        {
            ll p = ll(a+c)*(a+c)-ll(a)*c;
            ll f = N2/(N2/p+1);
            int an = 0.5*(sqrt(4.0*f-3.0*c*c)-c); // solve f = a*a + a*c + c*c for a
            an = max(an, a0);
            #ifndef RECU
            calc_p(primeFactor[c]); //add
            #endif

            ans += coprime(an, a, primeFactor[c], 0, 1,1) * (N2/p);
            a = an;
        }
    }
    cout << ans << endl;
    cout << clock()-t;
    return 0;
}