@comment{
  Journal article record (Signal and Data Processing, vol. 15, no. 1, 2018).
  The math in title and abstract is written as LaTeX and percent signs are
  escaped so the entry is safe under styles that print the abstract.
  NOTE(review): this record has no pages field; the DOI suffix suggests the
  article starts on page 127 -- confirm against the journal and add pages.
  NOTE(review): eprint holds a direct PDF link rather than an archive
  identifier; kept unchanged so no data is lost -- confirm intended field.
}
@article{Timarchi,
  author   = {Akbarzadeh, Negar and Timarchi, Somayeh},
  title    = {Modulo {$2^n+1$} Multiply and {MAC} Units Specified for {DSPs}},
  journal  = {Signal and Data Processing},
  volume   = {15},
  number   = {1},
  year     = {2018},
  doi      = {10.29252/jsdp.15.1.127},
  url      = {http://jsdp.rcisp.ac.ir/article-1-543-en.html},
  eprint   = {http://jsdp.rcisp.ac.ir/article-1-543-en.pdf},
  abstract = {Nowadays, digital signal processors (DSPs) are appropriate choices for real-time image and video processing in embedded multimedia applications not only due to their superior signal processing performance, but also of the high levels of integration and very low-power consumption. Filtering which consists of multiple addition and multiplication operations, is one of the most fundamental operations of DSPs. Therefore, there is a need for an additional unit just after the multiplication unit in DSPs. By combining multiply and add units, new structure named MAC (Multiply and ACcumulate) unit is provided. Residue Number System (RNS) can improve speed and power consumption of arithmetic circuits as it offers parallel arithmetic operations on each moduli and confines carry propagation to each moduli. In order to improve the efficiency of the MAC unit, RNS could be utilized. RNS divides large numbers to smaller numbers, called residues, according to a moduli set and enables performing arithmetic operations on each moduli independently. The moduli set $\{2^n-1, 2^n, 2^n+1\}$ is the most famous among others because of its simple and efficient implementation. Among this moduli set, modulo $2^n+1$ circuits are the critical path due to $(n+1)$-bit wide data path despite other two modules which all have $n$-bit wide operands. In order to overcome the problem of $(n+1)$ bits operands, three representations has been suggested: diminished-1, Signed-LSB and Stored-Unibit. Although different multipliers have been proposed for diminished-1 representation, no multiplication structure has been proposed for the last two ones.
    Modulo $2^n+1$ multipliers are divided into 3 categories depending on their inputs and outputs types: both operands use standard (weighted) representation; one input uses standard representation, while the other one utilizes diminished-1 representation; both inputs use diminished-1 representation. Although several multiply and add units have been proposed for the first 2 categories, no MAC unit is proposed for the multipliers of a third category which outperform multipliers of other categories. In this article at first, one modulo $2^n+1$ MAC unit for the third category is proposed and then for further improvement, pipeline and multi-voltage techniques are utilized. Pipeline structure enables a trade-off between power consumption and delay. Whenever high-performance with least delay is desirable, nominal supply voltage can be chosen (high performance mode) otherwise by reducing supply voltage to the amount at which pipeline circuit and normal circuit without pipeline would have the same performance, power consumption decreases significantly (low power mode). Simulations are performed in two phases. At first phase, proposed MAC unit without pipeline structure is described via VHDL code and synthesized with synopsys design vision tool. Results indicate that the proposed structure outperforms PDP (Power-Delay-Product) up to 39\% compared to the state of the art MAC units. At second phase, CMOS transistor level implementation in two modes i.e. low power and high performance modes with Cadence Design Systems tool is provided. Simulation results indicate that at low power condition, proposed pipeline MAC unit yields to 71\% power savings compared to existing circuits without declining efficiency. Furthermore, at high performance condition, however power consumption has increased, reducing delay up to 54\% yields to 39\% PDP savings for proposed pipeline MAC unit.},
}